我們還是以cameraserver進程爲例,看它是如何被系統加載運行起來的。
frameworks/av/camera/cameraserver/cameraserver.rc
1 service cameraserver /system/bin/cameraserver
2 class main
3 user cameraserver
4 group audio camera input drmrpc
5 ioprio rt 4
6 writepid /dev/cpuset/camera-daemon/tasks /dev/stune/top-app/tasks
從cameraserver.rc我們可以知道cameraserver 這個service對應的程序文件爲/system/bin/cameraserver,在init進程解析完對應的rc文件後,會把這個進程運行起來,解析過程就不詳述。
system/core/init/builtins.cpp
261 static int do_exec(const std::vector<std::string>& args) {
262 Service* svc = ServiceManager::GetInstance().MakeExecOneshotService(args);
263 if (!svc) {
264 return -1;
265 }
266 if (!svc->Start()) {
267 return -1;
268 }
269 waiting_for_exec = true;
270 return 0;
271 }
init進程調用do_exec()函數,就是把service對應的進程運行起來(rc文件中聲明的service通常由class_start命令觸發,最終同樣會走到Service::Start());這個函數在不同版本的android源碼中,裏面的代碼有些不一樣。
system/core/init/service.cpp
318 bool Service::Start() {
...
388 NOTICE("Starting service '%s'...\n", name_.c_str());
389
390 pid_t pid = fork(); //創建出一個新進程
391 if (pid == 0) { //如果是子進程,會走進這個條件
392 umask(077);
...
476 if (execve(strs[0], (char**) &strs[0], (char**) ENV) < 0) { //加載進程
477 ERROR("cannot execve('%s'): %s\n", strs[0], strerror(errno));
478 }
479
480 _exit(127);
481 }
482
483 if (pid < 0) { //如果fork 進程失敗
484 ERROR("failed to start '%s'\n", name_.c_str());
485 pid_ = 0;
486 return false;
487 }
488 //父進程,也就是init進程會跑下面的代碼
489 time_started_ = gettime();
490 pid_ = pid;
...
499 NotifyStateChange("running");
500 return true;
501 }
到這裏基本上就很清晰了,init啓動service進程主要是通過fork()和execve()實現的,所以下面我們重點跟蹤和了解這兩個函數的實現。
1、fork創建子進程
fork()函數聲明在 bionic/libc/include/unistd.h…
81 extern pid_t fork(void);
82 extern pid_t vfork(void);
…
fork()實現在 bionic/libc/bionic/fork.cpp
34 #define FORK_FLAGS (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD)
36 int fork() {
37 __bionic_atfork_run_prepare();
38
39 pthread_internal_t* self = __get_thread();
40
41 // Remember the parent pid and invalidate the cached value while we fork.
42 pid_t parent_pid = self->invalidate_cached_pid();
43
44 #if defined(__x86_64__) // sys_clone's last two arguments are flipped on x86-64.
45 int result = syscall(__NR_clone, FORK_FLAGS, NULL, NULL, &(self->tid), NULL);
46 #else
47 int result = syscall(__NR_clone, FORK_FLAGS, NULL, NULL, NULL, &(self->tid));
48 #endif
49 if (result == 0) {
50 self->set_cached_pid(gettid());
51 __bionic_atfork_run_child();
52 } else {
53 self->set_cached_pid(parent_pid);
54 __bionic_atfork_run_parent();
55 }
56 return result;
57 }
裏面又通過syscall()系統調用函數來創建進程,注意傳入的參數是__NR_clone和FORK_FLAGS。
syscall在 arm64的實現 bionic/libc/arch-arm64/bionic/syscall.S
29 #include <private/bionic_asm.h>
30
31 ENTRY(syscall)
32 /* Move syscall No. from x0 to x8 */
33 mov x8, x0
34 /* Move syscall parameters from x1 thru x6 to x0 thru x5 */
35 mov x0, x1
36 mov x1, x2
37 mov x2, x3
38 mov x3, x4
39 mov x4, x5
40 mov x5, x6
41 svc #0 //可以認爲執行了svc指令後,會陷入kernel,跳到相應的異常處理函數
42
43 /* check if syscall returned successfully */
44 cmn x0, #(MAX_ERRNO + 1) //cmn 是做加法比較:根據 x0 + (MAX_ERRNO + 1) 的結果設置標誌位
45 cneg x0, x0, hi //如果返回值落在 [-MAX_ERRNO, -1](即出錯),就把 x0 取反得到正的errno
46 b.hi __set_errno_internal
47
48 ret //調用返回
49 END(syscall)
kernel對應的處理流程
linux-4.10/arch/arm64/kernel/entry.S
…
328 ventry el0_sync // Synchronous 64-bit EL0
329 ventry el0_irq // IRQ 64-bit EL0
330 ventry el0_fiq_invalid // FIQ 64-bit EL0
331 ventry el0_error_invalid // Error 64-bit EL0
…
我們繼續看el0_sync:的實現
511 /*
512 * EL0 mode handlers.
513 */
514 .align 6
515 el0_sync:
516 kernel_entry 0 //kernel_entry 很重要,後面會講
517 mrs x25, esr_el1 // read the syndrome register
518 lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
519 cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state //比較兩個值是否相等
520 b.eq el0_svc //如果上面比較的結果相等,就跳轉到el0_svc
...
接着看el0_svc裏面的處理
/*
795 * SVC handler.
796 */
797 .align 6
798 el0_svc:
799 adrp stbl, sys_call_table // load syscall table pointer //處理函數表格地址
//在syscall.S中通過指令 mov x8, x0 將__NR_clone 參數保存到x8,下面的w8 也就是x8
800 uxtw scno, w8 // syscall number in w8
801 mov sc_nr, #__NR_syscalls //系統調用number的最大值
802 el0_svc_naked: // compat entry point
803 stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number
804 enable_dbg_and_irq
805 ct_user_exit 1
806
807 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks
808 tst x16, #_TIF_SYSCALL_WORK
809 b.ne __sys_trace
810 cmp scno, sc_nr // check upper syscall limit
811 b.hs ni_sys
812 ldr x16, [stbl, scno, lsl #3] // address in the syscall table
813 blr x16 // call sys_* routine
814 b ret_fast_syscall
815 ni_sys:
816 mov x0, sp
817 bl do_ni_syscall
818 b ret_fast_syscall
819 ENDPROC(el0_svc)
我們重點關注下面這兩條指令
812 ldr x16, [stbl, scno, lsl #3] // address in the syscall table
813 blr x16 // call sys_* routine
LDR R0,[R1,R2,LSL #3] 將存儲器地址爲R1+R2*8的字數據讀入寄存器R0。
blr Xm:跳轉到由Xm目標寄存器指定的地址處執行
stbl 是指向 sys_call_table 的寄存器別名,sys_call_table 定義在linux-4.10/arch/arm64/kernel/sys.c
55 #undef __SYSCALL
56 #define __SYSCALL(nr, sym) [nr] = sym,
57
58 /*
59 * The sys_call_table array must be 4K aligned to be accessible from
60 * kernel/entry.S.
61 */
62 void * const sys_call_table[__NR_syscalls] __aligned(4096) = {
63 [0 ... __NR_syscalls - 1] = sys_ni_syscall,
64 #include <asm/unistd.h>
65 };
linux-4.10/include/uapi/asm-generic/unistd.h
604 /* arch/example/kernel/sys_example.c */
605 #define __NR_clone 220
606 __SYSCALL(__NR_clone, sys_clone)
607 #define __NR_execve 221
608 __SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
sys_call_table是一個保存函數地址的數組,所以下面的兩行代碼可以認爲是取出數組下標對應的處理函數,然後跳到函數執行,就像函數指針一樣的調用。
812 ldr x16, [stbl, scno, lsl #3] // address in the syscall table
813 blr x16 // call sys_* routine
所以緊接着調用到sys_clone()函數,sys_clone 對應的實現在fork.c中
宏定義在linux-4.10/include/linux/syscalls.h
178 #define SYSCALL_DEFINE0(sname) \
179 SYSCALL_METADATA(_##sname, 0); \
180 asmlinkage long sys_##sname(void)
181
182 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
183 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
184 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
185 #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
186 #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
187 #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
189 #define SYSCALL_DEFINEx(x, sname, ...) \
190 SYSCALL_METADATA(sname, x, __VA_ARGS__) \
191 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
192
193 #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
194 #define __SYSCALL_DEFINEx(x, name, ...) \
195 asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
196 __attribute__((alias(__stringify(SyS##name)))); \
197 static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
198 asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
199 asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
200 { \
201 long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
202 __MAP(x,__SC_TEST,__VA_ARGS__); \
203 __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
204 return ret; \
205 } \
206 static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
linux-4.10/kernel/fork.c
/*
2000 * Create a kernel thread.
2001 */
2002 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2003 {
2004 return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
2005 (unsigned long)arg, NULL, NULL, 0);
2006 }
2007
2008 #ifdef __ARCH_WANT_SYS_FORK
2009 SYSCALL_DEFINE0(fork)
2010 {
2011 #ifdef CONFIG_MMU
2012 return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
2013 #else
2014 /* can not support in nommu mode */
2015 return -EINVAL;
2016 #endif
2017 }
2018 #endif
2019
2020 #ifdef __ARCH_WANT_SYS_VFORK
2021 SYSCALL_DEFINE0(vfork)
2022 {
2023 return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
2024 0, NULL, NULL, 0);
2025 }
2026 #endif
2027
2028 #ifdef __ARCH_WANT_SYS_CLONE
2029 #ifdef CONFIG_CLONE_BACKWARDS
2030 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2031 int __user *, parent_tidptr,
2032 unsigned long, tls,
2033 int __user *, child_tidptr)
2034 #elif defined(CONFIG_CLONE_BACKWARDS2)
2035 SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2036 int __user *, parent_tidptr,
2037 int __user *, child_tidptr,
2038 unsigned long, tls)
2039 #elif defined(CONFIG_CLONE_BACKWARDS3)
2040 SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2041 int, stack_size,
2042 int __user *, parent_tidptr,
2043 int __user *, child_tidptr,
2044 unsigned long, tls)
2045 #else
2046 SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2047 int __user *, parent_tidptr,
2048 int __user *, child_tidptr,
2049 unsigned long, tls)
2050 #endif
2051 {
2052 return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
2053 }
2054 #endif
上面貼出了一大堆東西,最終都是到_do_fork()裏面實現。通過上面的代碼我們還知道kernel_thread(),fork(),vfork(),clone()都會調用到_do_fork(),只不過傳入的參數不一樣而已。也就是說,創建進程在kernel中都會調用到_do_fork(),所以以後在程序中看到fork(),vfork(),clone()這些接口,可以忽略中間的調用過程,直接看_do_fork(),只需要注意傳進來的參數就可以了。
1913 long _do_fork(unsigned long clone_flags,
1914 unsigned long stack_start,
1915 unsigned long stack_size,
1916 int __user *parent_tidptr,
1917 int __user *child_tidptr,
1918 unsigned long tls)
1919 {
1920 struct task_struct *p;
1921 int trace = 0;
1922 long nr;
...
//複製出一個新進程(子進程),並返回該進程的描述符
1942 p = copy_process(clone_flags, stack_start, stack_size,
1943 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
1944 add_latent_entropy();
1945 /*
1946 * Do this prior waking up the new thread - the thread pointer
1947 * might get invalid after that point, if the thread exits quickly.
1948 */
1949 if (!IS_ERR(p)) {
1950 struct completion vfork;
1951 struct pid *pid;
1952
1953 trace_sched_process_fork(current, p);
1954 //給新進程(子進程)分配一個pid
1955 pid = get_task_pid(p, PIDTYPE_PID);
1956 nr = pid_vnr(pid);
... //將新進程(子進程)加入調度器中,分配cpu,準備執行
1967 wake_up_new_task(p);
...
1978 put_pid(pid);
1979 } else {
1980 nr = PTR_ERR(p);
1981 }
1982 return nr; //父進程就返回了,子進程是不會跑到這裏的
1983 }
我們接着看copy_process()做了什麼
1455 static __latent_entropy struct task_struct *copy_process(
1456 unsigned long clone_flags,
1457 unsigned long stack_start,
1458 unsigned long stack_size,
1459 int __user *child_tidptr,
1460 struct pid *pid,
1461 int trace,
1462 unsigned long tls,
1463 int node)
1464 {
1465 int retval;
1466 struct task_struct *p;
...
1515 p = dup_task_struct(current, node);
...
1670 retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
...
1682 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1683 /*
1684 * Clear TID on mm_release()?
1685 */
1686 p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
...
1834 return p;
...
1882 }
調用dup_task_struct()創建一個task_struct 結構體和創建內核棧
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
474 {
475 struct task_struct *tsk;
476 unsigned long *stack;
477 struct vm_struct *stack_vm_area;
478 int err;
479
480 if (node == NUMA_NO_NODE)
481 node = tsk_fork_get_node(orig);
482 tsk = alloc_task_struct_node(node); //申請task_struct 空間
483 if (!tsk)
484 return NULL;
485 //創建內核棧
486 stack = alloc_thread_stack_node(tsk, node);
487 if (!stack)
488 goto free_tsk;
489
490 stack_vm_area = task_stack_vm_area(tsk);
491
492 err = arch_dup_task_struct(tsk, orig);
493
494 /*
495 * arch_dup_task_struct() clobbers the stack-related fields. Make
496 * sure they're properly initialized before using any stack-related
497 * functions again.
498 */
499 tsk->stack = stack; //task_struct有個成員指向內核棧的起始地址
...
545 return tsk;
546
547 free_stack:
548 free_thread_stack(tsk);
549 free_tsk:
550 free_task_struct(tsk);
551 return NULL;
552 }
copy_process()還調用 copy_thread_tls()對內核棧進行修改
linux-4.10/include/linux/sched.h
2981 /* Architectures that haven't opted into copy_thread_tls get the tls argument
2982 * via pt_regs, so ignore the tls argument passed via C. */
2983 static inline int copy_thread_tls(
2984 unsigned long clone_flags, unsigned long sp, unsigned long arg,
2985 struct task_struct *p, unsigned long tls)
2986 {
2987 return copy_thread(clone_flags, sp, arg, p);
2988 }
arm64的實現在 linux-4.10/arch/arm64/kernel/process.c
int copy_thread(unsigned long clone_flags, unsigned long stack_start,
253 unsigned long stk_sz, struct task_struct *p)
254 {
255 struct pt_regs *childregs = task_pt_regs(p);
256
257 memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
258
259 if (likely(!(p->flags & PF_KTHREAD))) {
260 *childregs = *current_pt_regs(); //複製當前進程的寄存器值
//修改子進程的返回值,用戶空間的返回值,這也是子進程調度執行後返回值爲0的原因
261 childregs->regs[0] = 0;
...
282 } else {
...
290 }
291 p->thread.cpu_context.pc = (unsigned long)ret_from_fork; //子進程調度後執行的函數
292 p->thread.cpu_context.sp = (unsigned long)childregs;
293
294 ptrace_hw_copy_thread(p);
295
296 return 0;
297 }
子進程分配到cpu執行後,會跑到 ret_from_fork
linux-4.10/arch/arm64/kernel/entry.S
/*
770 * "slow" syscall return path.
771 */
772 ret_to_user:
773 disable_irq // disable interrupts
774 ldr x1, [tsk, #TSK_TI_FLAGS]
775 and x2, x1, #_TIF_WORK_MASK
776 cbnz x2, work_pending
777 finish_ret_to_user:
778 enable_step_tsk x1, x2
779 kernel_exit 0
780 ENDPROC(ret_to_user)
781
782 /*
783 * This is how we return from a fork.
784 */
785 ENTRY(ret_from_fork)
786 bl schedule_tail
787 cbz x19, 1f // not a kernel thread
788 mov x0, x20
789 blr x19
790 1: get_thread_info tsk
791 b ret_to_user
792 ENDPROC(ret_from_fork)
最後跑到 kernel_exit 0 返回到用戶空間,在進入 el0_sync 的時候跑了 kernel_entry 0 ,處理完成後跑 kernel_exit 0 返回,我們看一下這裏做了什麼。
linux-4.10/arch/arm64/kernel/entry.S
/*
783 * This is how we return from a fork.
784 */
785 ENTRY(ret_from_fork)
786 bl schedule_tail
787 cbz x19, 1f // not a kernel thread
788 mov x0, x20
789 blr x19
790 1: get_thread_info tsk
791 b ret_to_user
792 ENDPROC(ret_from_fork)
72 .macro kernel_entry, el, regsize = 64
73 sub sp, sp, #S_FRAME_SIZE //sp = sp - S_FRAME_SIZE DEFINE(S_FRAME_SIZE,sizeof(struct pt_regs)); sp 指向棧頂
74 .if \regsize == 32
75 mov w0, w0 // zero upper 32 bits of x0
76 .endif
77 stp x0, x1, [sp, #16 * 0] //相當於 地址(sp + 16* 0) = x0,x1
78 stp x2, x3, [sp, #16 * 1] //相當於 地址(sp + 16* 1) = x2,x3
79 stp x4, x5, [sp, #16 * 2]
80 stp x6, x7, [sp, #16 * 3]
81 stp x8, x9, [sp, #16 * 4]
82 stp x10, x11, [sp, #16 * 5]
83 stp x12, x13, [sp, #16 * 6]
84 stp x14, x15, [sp, #16 * 7]
85 stp x16, x17, [sp, #16 * 8]
86 stp x18, x19, [sp, #16 * 9]
87 stp x20, x21, [sp, #16 * 10]
88 stp x22, x23, [sp, #16 * 11]
89 stp x24, x25, [sp, #16 * 12]
90 stp x26, x27, [sp, #16 * 13]
91 stp x28, x29, [sp, #16 * 14] // FP(x29)寄存器保存棧幀地址
92
93 .if \el == 0
94 mrs x21, sp_el0
95 ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear,
96 ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug
97 disable_step_tsk x19, x20 // exceptions when scheduling.
98
99 mov x29, xzr // fp pointed to user-space x29 = xzr =0
100 .else
101 add x21, sp, #S_FRAME_SIZE //x21= sp + S_FRAME_SIZE
102 get_thread_info tsk //等價於將thread_info指針保存到tsk(x28)寄存器
103 /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
//DEFINE(TSK_TI_ADDR_LIMIT,offsetof(struct task_struct, thread_info.addr_limit));
104 ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] // x20 = *(tsk + TSK_TI_ADDR_LIMIT),即讀出該地址處保存的addr_limit
105 str x20, [sp, #S_ORIG_ADDR_LIMIT]
106 mov x20, #TASK_SIZE_64
107 str x20, [tsk, #TSK_TI_ADDR_LIMIT]
108 /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
109 .endif /* \el == 0 */
110 mrs x22, elr_el1 //mrs 是讀取系統寄存器指令 x22 = elr_el1
111 mrs x23, spsr_el1
112 stp lr, x21, [sp, #S_LR] //相當於 地址*(sp + S_LR) = lr,x21 (x21 = sp_el0)
113
114 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
115 /*
116 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
117 * EL0, there is no need to check the state of TTBR0_EL1 since
118 * accesses are always enabled.
119 * Note that the meaning of this bit differs from the ARMv8.1 PAN
120 * feature as all TTBR0_EL1 accesses are disabled, not just those to
121 * user mappings.
122 */
123 alternative_if ARM64_HAS_PAN
124 b 1f // skip TTBR0 PAN
125 alternative_else_nop_endif
126
127 .if \el != 0
128 mrs x21, ttbr0_el1
129 tst x21, #0xffff << 48 // Check for the reserved ASID
130 orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
131 b.eq 1f // TTBR0 access already disabled
132 and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
133 .endif
134
135 __uaccess_ttbr0_disable x21
136 1:
137 #endif
138
139 stp x22, x23, [sp, #S_PC] //*(sp+ S_PC) =x22,x23
140
141 /*
142 * Set syscallno to -1 by default (overridden later if real syscall).
143 */
144 .if \el == 0
//mvn:與mov指令用法差不多,唯一的區別是:它賦值的時候,先按位取反
145 mvn x21, xzr // XZR 零寄存器,寫該寄存器會被忽略,讀該寄存器會得到全0值。
146 str x21, [sp, #S_SYSCALLNO] // *(sp + S_SYSCALLNO) = x21
147 .endif
148
149 /*
150 * Set sp_el0 to current thread_info.
151 */
152 .if \el == 0
153 msr sp_el0, tsk //sp_el0 = tsk(x28)
154 .endif
155
156 /*
157 * Registers that may be useful after this macro is invoked:
158 *
159 * x21 - aborted SP //
160 * x22 - aborted PC //
161 * x23 - aborted PSTATE
162 */
163 .endm
結合 struct pt_regs 的定義,我們看一下內核棧發生了什麼變化
linux-4.10/arch/arm64/include/asm/ptrace.h
struct pt_regs {
109 union {
110 struct user_pt_regs user_regs;
111 struct {
112 u64 regs[31];
113 u64 sp;
114 u64 pc;
115 u64 pstate;
116 };
117 };
118 u64 orig_x0;
119 u64 syscallno;
120 u64 orig_addr_limit;
121 u64 unused; // maintain 16 byte alignment
122 };
所以kernel_entry 0 執行之後,就是把一些寄存器的值保存到了內核棧中,pt_regs這個結構體放在棧頂,後面返回用戶空間時,把這些寄存器的值還原回去。
linux-4.10/arch/arm64/kernel/entry.S
.macro kernel_exit, el
166 .if \el != 0
167 /* Restore the task's original addr_limit. */
168 ldr x20, [sp, #S_ORIG_ADDR_LIMIT]
169 str x20, [tsk, #TSK_TI_ADDR_LIMIT]
170
171 /* No need to restore UAO, it will be restored from SPSR_EL1 */
172 .endif
173
174 ldp x21, x22, [sp, #S_PC] // load ELR, SPSR
175 .if \el == 0
176 ct_user_enter
177 .endif
178
179 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
180 /*
181 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
182 * PAN bit checking.
183 */
184 alternative_if ARM64_HAS_PAN
185 b 2f // skip TTBR0 PAN
186 alternative_else_nop_endif
187
188 .if \el != 0
189 tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
190 .endif
191
192 __uaccess_ttbr0_enable x0
193
194 .if \el == 0
195 /*
196 * Enable errata workarounds only if returning to user. The only
197 * workaround currently required for TTBR0_EL1 changes are for the
198 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
199 * corruption).
200 */
201 post_ttbr0_update_workaround
202 .endif
203 1:
204 .if \el != 0
205 and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit
206 .endif
207 2:
208 #endif
209
210 .if \el == 0
211 ldr x23, [sp, #S_SP] // load return stack pointer
212 msr sp_el0, x23
213 #ifdef CONFIG_ARM64_ERRATUM_845719
214 alternative_if ARM64_WORKAROUND_845719
215 tbz x22, #4, 1f
216 #ifdef CONFIG_PID_IN_CONTEXTIDR
217 mrs x29, contextidr_el1
218 msr contextidr_el1, x29
219 #else
220 msr contextidr_el1, xzr
221 #endif
222 1:
223 alternative_else_nop_endif
224 #endif
225 .endif
226
227 msr elr_el1, x21 // set up the return data
228 msr spsr_el1, x22
229 ldp x0, x1, [sp, #16 * 0] //相當於 x0,x1 = *(sp + 16*0)
230 ldp x2, x3, [sp, #16 * 1] //相當於 x2,x3 = *(sp + 16*1)
231 ldp x4, x5, [sp, #16 * 2]
232 ldp x6, x7, [sp, #16 * 3]
233 ldp x8, x9, [sp, #16 * 4]
234 ldp x10, x11, [sp, #16 * 5]
235 ldp x12, x13, [sp, #16 * 6]
236 ldp x14, x15, [sp, #16 * 7]
237 ldp x16, x17, [sp, #16 * 8]
238 ldp x18, x19, [sp, #16 * 9]
239 ldp x20, x21, [sp, #16 * 10]
240 ldp x22, x23, [sp, #16 * 11]
241 ldp x24, x25, [sp, #16 * 12]
242 ldp x26, x27, [sp, #16 * 13]
243 ldp x28, x29, [sp, #16 * 14]
244 ldr lr, [sp, #S_LR]
245 add sp, sp, #S_FRAME_SIZE // restore sp
246 eret // 異常返回,使用當前的SPSR_ELx和ELR_ELx
247 .endm
所以kernel_exit 0 就是在返回用戶空間之前,把保存在pt_regs 結構體中的值重新寫回對應的寄存器。注意在copy_thread()中已經把子進程的pt_regs修改了:childregs->regs[0] = 0;(後面會賦值給x0寄存器),在arm中函數的返回值一般都放在x0 (32位是R0)中,所以子進程從fork()函數返回的值是0。
2、execve()加載進程
因爲系統調用的具體細節上面也有分析過了,這裏就不再重複,直接走到execve()在kernel中對應的處理函數。
linux-4.10/include/uapi/asm-generic/unistd.h
607 #define __NR_execve 221
608 __SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
和sys_clone()一樣,對應的實現函數用宏包了起來,在 linux-4.10/fs/exec.c
1887 SYSCALL_DEFINE3(execve,
1888 const char __user *, filename,
1889 const char __user *const __user *, argv,
1890 const char __user *const __user *, envp)
1891 {
1892 return do_execve(getname(filename), argv, envp);
1893 }
接着跑到do_execve()函數中
int do_execve(struct filename *filename,
1806 const char __user *const __user *__argv,
1807 const char __user *const __user *__envp)
1808 {
1809 struct user_arg_ptr argv = { .ptr.native = __argv };
1810 struct user_arg_ptr envp = { .ptr.native = __envp };
1811 return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1812 }
關鍵的實現就在do_execveat_common()這個函數裏面了
/*
1657 * sys_execve() executes a new program.
1658 */
1659 static int do_execveat_common(int fd, struct filename *filename,
1660 struct user_arg_ptr argv,
1661 struct user_arg_ptr envp,
1662 int flags)
1663 {
1664 char *pathbuf = NULL;
1665 struct linux_binprm *bprm;
1666 struct file *file;
1667 struct files_struct *displaced;
1668 int retval;
...
1693 retval = -ENOMEM;
1694 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); //申請一個linux_binprm 結構體
...
1705 file = do_open_execat(fd, filename, flags); //打開可執行文件
1706 retval = PTR_ERR(file);
1707 if (IS_ERR(file))
1708 goto out_unmark;
1709
1710 sched_exec();
1711
...
1736 retval = bprm_mm_init(bprm); //創建進程的內存地址空間
...
1748 retval = prepare_binprm(bprm); //讀取可執行文件前面的128字節
...
1767 retval = exec_binprm(bprm); //加載程序到內存並運行
...
1771 /* execve succeeded */
1772 current->fs->in_exec = 0;
1773 current->in_execve = 0;
1774 acct_update_integrals(current);
1775 task_numa_free(current);
1776 free_bprm(bprm);
1777 kfree(pathbuf);
1778 putname(filename);
1779 if (displaced)
1780 put_files_struct(displaced);
1781 return retval;
...
1803 }
我們重點看一下exec_binprm()
1634 static int exec_binprm(struct linux_binprm *bprm)
1635 {
1636 pid_t old_pid, old_vpid;
1637 int ret;
1638
1639 /* Need to fetch pid before load_binary changes it */
1640 old_pid = current->pid;
1641 rcu_read_lock();
1642 old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1643 rcu_read_unlock();
1644
1645 ret = search_binary_handler(bprm); //嘗試加載該可執行程序
1646 if (ret >= 0) {
1647 audit_bprm(bprm);
1648 trace_sched_process_exec(current, old_pid, bprm);
1649 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1650 proc_exec_connector(current);
1651 }
1652
1653 return ret;
1654 }
1581 int search_binary_handler(struct linux_binprm *bprm)
1582 {
1583 bool need_retry = IS_ENABLED(CONFIG_MODULES);
1584 struct linux_binfmt *fmt;
1585 int retval;
...
1595 retval = -ENOENT;
1596 retry:
1597 read_lock(&binfmt_lock);
1598 list_for_each_entry(fmt, &formats, lh) {
1599 if (!try_module_get(fmt->module))
1600 continue;
1601 read_unlock(&binfmt_lock);
1602 bprm->recursion_depth++;
1603 retval = fmt->load_binary(bprm); //這裏纔是真正調用到handler 加載
1604 read_lock(&binfmt_lock);
1605 put_binfmt(fmt);
1606 bprm->recursion_depth--;
1607 if (retval < 0 && !bprm->mm) {
1608 /* we got to flush_old_exec() and failed after it */
1609 read_unlock(&binfmt_lock);
1610 force_sigsegv(SIGSEGV, current);
1611 return retval;
1612 }
1613 if (retval != -ENOEXEC || !bprm->file) {
1614 read_unlock(&binfmt_lock);
1615 return retval;
1616 }
1617 }
...
1630 return retval;
1631 }
search_binary_handler()會嘗試調用註冊的handler 去加載程序
linux-4.10/fs/exec.c
72 static LIST_HEAD(formats);
73 static DEFINE_RWLOCK(binfmt_lock);
74
75 void __register_binfmt(struct linux_binfmt * fmt, int insert)
76 {
77 BUG_ON(!fmt);
78 if (WARN_ON(!fmt->load_binary))
79 return;
80 write_lock(&binfmt_lock);
81 insert ? list_add(&fmt->lh, &formats) :
82 list_add_tail(&fmt->lh, &formats);
83 write_unlock(&binfmt_lock);
84 }
exec.c中創建了formats 對應的鏈表結構體,並提供了註冊的方法
linux-4.10/include/linux/binfmts.h
86 /* Registration of default binfmt handlers */
87 static inline void register_binfmt(struct linux_binfmt *fmt)
88 {
89 __register_binfmt(fmt, 0);
90 }
91 /* Same as above, but adds a new binfmt at the top of the list */
92 static inline void insert_binfmt(struct linux_binfmt *fmt)
93 {
94 __register_binfmt(fmt, 1);
95 }
linux-4.10/fs/binfmt_elf.c
2326 static int __init init_elf_binfmt(void)
2327 {
2328 register_binfmt(&elf_format);
2329 return 0;
2330 }
在elf 初始化函數裏面把它註冊了進來
84 static struct linux_binfmt elf_format = {
85 .module = THIS_MODULE,
86 .load_binary = load_elf_binary,
87 .load_shlib = load_elf_library,
88 .core_dump = elf_core_dump,
89 .min_coredump = ELF_EXEC_PAGESIZE,
90 };
所以上面的 retval = fmt->load_binary(bprm); 對應的是 load_elf_binary()函數
668 static int load_elf_binary(struct linux_binprm *bprm)
669 {
...
696 /* Get the exec-header */
697 loc->elf_ex = *((struct elfhdr *)bprm->buf); //可執行程序文件頭的128字節
698 //下面就是根據文件頭來判斷是否加載
699 retval = -ENOEXEC;
700 /* First of all, some simple consistency checks */
701 if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
702 goto out;
703
704 if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
705 goto out;
706 if (!elf_check_arch(&loc->elf_ex))
707 goto out;
... //加載過程省略
1090 start_thread(regs, elf_entry, bprm->p); //把用戶態的pc設爲程序入口elf_entry、sp設爲bprm->p,返回用戶空間後就從入口開始跑進程裏面的代碼
1091 retval = 0;
1092 out:
1093 kfree(loc);
1094 out_ret:
1095 return retval;
1108 }
加載的過程這裏就不細講了,這個過程中會解析elf 的一些段,並執行一些初始化操作,也就是在程序中的main()函數還沒跑起來之前就已經跑了很多其他的代碼,後面會介紹elf文件格式。