linux 進程的創建和加載

我們還是以cameraserver進程爲例,看它是如何被系統加載運行起來的。

frameworks/av/camera/cameraserver/cameraserver.rc

1 service cameraserver /system/bin/cameraserver
2     class main
3     user cameraserver
4     group audio camera input drmrpc
5     ioprio rt 4
6     writepid /dev/cpuset/camera-daemon/tasks /dev/stune/top-app/tasks

從cameraserver.rc我們可以知道cameraserver 這個service對應的程序文件爲/system/bin/cameraserver,在init進程解析完對應的rc文件後,會把這個進程運行起來,解析過程就不詳述。

system/core/init/builtins.cpp

261  static int do_exec(const std::vector<std::string>& args) {
262      Service* svc = ServiceManager::GetInstance().MakeExecOneshotService(args);
263      if (!svc) {
264          return -1;
265      }
266      if (!svc->Start()) {
267          return -1;
268      }
269      waiting_for_exec = true;
270      return 0;
271  }
init進程調用do_exec()函數,就是爲了把service對應的進程運行起來,這個函數在不同版本的android源碼中,裏面的代碼有些不一樣。

system/core/init/service.cpp

318  bool Service::Start() {
         ... 
388      NOTICE("Starting service '%s'...\n", name_.c_str());
389  
390      pid_t pid = fork();  //創建出一個新進程
391      if (pid == 0) {  //如果是子進程,會走進這個條件
392          umask(077);
          ... 
476          if (execve(strs[0], (char**) &strs[0], (char**) ENV) < 0) {  //加載進程
477              ERROR("cannot execve('%s'): %s\n", strs[0], strerror(errno));
478          }
479  
480          _exit(127);
481      }
482  
483      if (pid < 0) {  //如果fork 進程失敗
484          ERROR("failed to start '%s'\n", name_.c_str());
485          pid_ = 0;
486          return false;
487      }
488      //父進程,也就是init進程會跑下面的代碼
489      time_started_ = gettime();
490      pid_ = pid;
         ...
499      NotifyStateChange("running");
500      return true;
501  }

到這裏基本上就很清晰了,init啓動service進程主要是通過fork()和execve()實現的,所以下面我們重點跟蹤和了解這兩個函數的實現。


1、fork創建子進程

fork()函數聲明在  bionic/libc/include/unistd.h

…
81  extern pid_t  fork(void);
82  extern pid_t  vfork(void);
…

fork()實現在  bionic/libc/bionic/fork.cpp

34  #define FORK_FLAGS (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD)
36  int fork() {
37    __bionic_atfork_run_prepare();
38  
39    pthread_internal_t* self = __get_thread();
40  
41    // Remember the parent pid and invalidate the cached value while we fork.
42    pid_t parent_pid = self->invalidate_cached_pid();
43  
44  #if defined(__x86_64__) // sys_clone's last two arguments are flipped on x86-64.
45    int result = syscall(__NR_clone, FORK_FLAGS, NULL, NULL, &(self->tid), NULL);
46  #else
47    int result = syscall(__NR_clone, FORK_FLAGS, NULL, NULL, NULL, &(self->tid));
48  #endif
49    if (result == 0) {
50      self->set_cached_pid(gettid());
51      __bionic_atfork_run_child();
52    } else {
53      self->set_cached_pid(parent_pid);
54      __bionic_atfork_run_parent();
55    }
56    return result;
57  }
裏面又通過syscall()系統調用函數來創建進程,注意傳入的參數是__NR_clone和FORK_FLAGS。

syscall在 arm64的實現   bionic/libc/arch-arm64/bionic/syscall.S

29  #include <private/bionic_asm.h>
30  
31  ENTRY(syscall)
32      /* Move syscall No. from x0 to x8 */
33      mov     x8, x0
34      /* Move syscall parameters from x1 thru x6 to x0 thru x5 */
35      mov     x0, x1
36      mov     x1, x2
37      mov     x2, x3
38      mov     x3, x4
39      mov     x4, x5
40      mov     x5, x6
41      svc     #0  //可以認爲執行了svc指令後,會陷入kernel,跳到對應的異常(中斷)響應函數
42  
43      /* check if syscall returned successfully */
44      cmn     x0, #(MAX_ERRNO + 1)  //cmn 按 x0 + (MAX_ERRNO + 1) 的結果設置標誌位,用來判斷x0是否落在 [-MAX_ERRNO, -1] 的錯誤碼範圍
45      cneg    x0, x0, hi    //
46      b.hi    __set_errno_internal
47  
48      ret  //調用返回
49  END(syscall)

kernel對應的處理流程

linux-4.10/arch/arm64/kernel/entry.S

…
328  	ventry	el0_sync			// Synchronous 64-bit EL0  
329  	ventry	el0_irq				// IRQ 64-bit EL0
330  	ventry	el0_fiq_invalid			// FIQ 64-bit EL0
331  	ventry	el0_error_invalid		// Error 64-bit EL0
…

我們繼續看el0_sync:的實現

511  /*
512   * EL0 mode handlers.
513   */
514  	.align	6
515  el0_sync:
516  	kernel_entry 0    //kernel_entry 很重要,後面會講
517  	mrs	x25, esr_el1			// read the syndrome register
518  	lsr	x24, x25, #ESR_ELx_EC_SHIFT	// exception class
519  	cmp	x24, #ESR_ELx_EC_SVC64		// SVC in 64-bit state  //比較兩個值是否相等
520  	b.eq	el0_svc  //如果上面比較的結果相等,就跳轉到el0_svc
...
接着看el0_svc裏面的處理
/*
795   * SVC handler.
796   */
797  	.align	6
798  el0_svc:
799  	adrp	stbl, sys_call_table		// load syscall table pointer  //處理函數表格地址
        //在syscall.S中通過指令 mov x8, x0 將__NR_clone 參數保存到x8,下面的w8 也就是x8
800  	uxtw	scno, w8			// syscall number in w8  
801  	mov	sc_nr, #__NR_syscalls   //系統調用number的最大值
802  el0_svc_naked:					// compat entry point
803  	stp	x0, scno, [sp, #S_ORIG_X0]	// save the original x0 and syscall number
804  	enable_dbg_and_irq
805  	ct_user_exit 1
806  
807  	ldr	x16, [tsk, #TSK_TI_FLAGS]	// check for syscall hooks
808  	tst	x16, #_TIF_SYSCALL_WORK
809  	b.ne	__sys_trace
810  	cmp     scno, sc_nr                     // check upper syscall limit
811  	b.hs	ni_sys
812  	ldr	x16, [stbl, scno, lsl #3]	// address in the syscall table
813  	blr	x16				// call sys_* routine 
814  	b	ret_fast_syscall       
815  ni_sys:
816  	mov	x0, sp
817  	bl	do_ni_syscall
818  	b	ret_fast_syscall
819  ENDPROC(el0_svc)

我們重點關注下面這兩條指令

812        ldr    x16, [stbl, scno, lsl #3]   // address in the syscall table

813        blr    x16                             // call sys_* routine

LDR R0,[R1,R2,LSL #3]   將存儲器地址爲R1+R2*8的字數據讀入寄存器R0。

blr  Xm:跳轉到由Xm目標寄存器指定的地址處執行


stbl 指向的 sys_call_table 定義在linux-4.10/arch/arm64/kernel/sys.c

55  #undef __SYSCALL
56  #define __SYSCALL(nr, sym)	[nr] = sym,
57  
58  /*
59   * The sys_call_table array must be 4K aligned to be accessible from
60   * kernel/entry.S.
61   */
62  void * const sys_call_table[__NR_syscalls] __aligned(4096) = {
63  	[0 ... __NR_syscalls - 1] = sys_ni_syscall,
64  #include <asm/unistd.h>
65  };

linux-4.10/include/uapi/asm-generic/unistd.h

604  /* arch/example/kernel/sys_example.c */
605  #define __NR_clone 220
606  __SYSCALL(__NR_clone, sys_clone)
607  #define __NR_execve 221
608  __SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
sys_call_table是一個保存函數地址的數組,所以下面的兩行代碼可以認爲是取出數組下標對應的處理函數,然後跳到函數執行,就像函數指針一樣的調用。

812        ldr    x16, [stbl, scno, lsl #3]   // address in the syscall table

813        blr    x16                             // call sys_* routine

所以緊接着調用到sys_clone()函數,sys_clone 對應的實現在fork.c中

宏定義在linux-4.10/include/linux/syscalls.h

178  #define SYSCALL_DEFINE0(sname)					\
179  	SYSCALL_METADATA(_##sname, 0);				\
180  	asmlinkage long sys_##sname(void)
181  
182  #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
183  #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
184  #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
185  #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
186  #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
187  #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

189  #define SYSCALL_DEFINEx(x, sname, ...)				\
190  	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
191  	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
192  
193  #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
194  #define __SYSCALL_DEFINEx(x, name, ...)					\
195  	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
196  		__attribute__((alias(__stringify(SyS##name))));		\
197  	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
198  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
199  	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
200  	{								\
201  		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
202  		__MAP(x,__SC_TEST,__VA_ARGS__);				\
203  		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
204  		return ret;						\
205  	}								\
206  	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

linux-4.10/kernel/fork.c

 /*
2000   * Create a kernel thread.
2001   */
2002  pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
2003  {
2004  	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
2005  		(unsigned long)arg, NULL, NULL, 0);
2006  }
2007  
2008  #ifdef __ARCH_WANT_SYS_FORK
2009  SYSCALL_DEFINE0(fork)
2010  {
2011  #ifdef CONFIG_MMU
2012  	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
2013  #else
2014  	/* can not support in nommu mode */
2015  	return -EINVAL;
2016  #endif
2017  }
2018  #endif
2019  
2020  #ifdef __ARCH_WANT_SYS_VFORK
2021  SYSCALL_DEFINE0(vfork)
2022  {
2023  	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
2024  			0, NULL, NULL, 0);
2025  }
2026  #endif
2027  
2028  #ifdef __ARCH_WANT_SYS_CLONE
2029  #ifdef CONFIG_CLONE_BACKWARDS
2030  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2031  		 int __user *, parent_tidptr,
2032  		 unsigned long, tls,
2033  		 int __user *, child_tidptr)
2034  #elif defined(CONFIG_CLONE_BACKWARDS2)
2035  SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
2036  		 int __user *, parent_tidptr,
2037  		 int __user *, child_tidptr,
2038  		 unsigned long, tls)
2039  #elif defined(CONFIG_CLONE_BACKWARDS3)
2040  SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
2041  		int, stack_size,
2042  		int __user *, parent_tidptr,
2043  		int __user *, child_tidptr,
2044  		unsigned long, tls)
2045  #else
2046  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
2047  		 int __user *, parent_tidptr,
2048  		 int __user *, child_tidptr,
2049  		 unsigned long, tls)
2050  #endif
2051  {
2052  	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
2053  }
2054  #endif
上面貼出了一大堆東西,最終都是到_do_fork()裏面實現,通過上面的代碼我們還知道kernel_thread(),fork(),vfork(),clone()都會調用到_do_fork(),只不過傳入的參數不一樣而已。也就是說,創建進程在kernel中都會調用到_do_fork(),所以以後在程序中看到fork(),vfork(),clone()這些接口,可以忽略中間的調用過程,直接看_do_fork(),只需要注意傳進來的參數就可以了。

1913  long _do_fork(unsigned long clone_flags,
1914  	      unsigned long stack_start,
1915  	      unsigned long stack_size,
1916  	      int __user *parent_tidptr,
1917  	      int __user *child_tidptr,
1918  	      unsigned long tls)
1919  {
1920  	struct task_struct *p;
1921  	int trace = 0;
1922  	long nr;
        ...
        //複製出一個新進程(子進程),並返回該進程的描述符
1942  	p = copy_process(clone_flags, stack_start, stack_size,
1943  			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
1944  	add_latent_entropy();
1945  	/*
1946  	 * Do this prior waking up the new thread - the thread pointer
1947  	 * might get invalid after that point, if the thread exits quickly.
1948  	 */
1949  	if (!IS_ERR(p)) {
1950  		struct completion vfork;
1951  		struct pid *pid;
1952  
1953  		trace_sched_process_fork(current, p);
1954            //給新進程(子進程)分配一個pid
1955  		pid = get_task_pid(p, PIDTYPE_PID);
1956  		nr = pid_vnr(pid);
                ... //將新進程(子進程)加入調度器中,分配cpu,準備執行
1967  		wake_up_new_task(p);
                ...
1978  		put_pid(pid);
1979  	} else {
1980  		nr = PTR_ERR(p);
1981  	}
1982  	return nr;   //父進程就返回了,子進程是不會跑到這裏的
1983  }

我們接着看copy_process()做了什麼
1455  static __latent_entropy struct task_struct *copy_process(
1456  					unsigned long clone_flags,
1457  					unsigned long stack_start,
1458  					unsigned long stack_size,
1459  					int __user *child_tidptr,
1460  					struct pid *pid,
1461  					int trace,
1462  					unsigned long tls,
1463  					int node)
1464  {
1465  	int retval;
1466  	struct task_struct *p;
        ...
1515  	p = dup_task_struct(current, node);
        ...
1670  	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
        ...
1682  	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1683  	/*
1684  	 * Clear TID on mm_release()?
1685  	 */
1686  	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
        ...
1834  	return p;
...
1882  }

調用dup_task_struct()創建一個task_struct 結構體和創建內核棧
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
474  {
475  	struct task_struct *tsk;
476  	unsigned long *stack;
477  	struct vm_struct *stack_vm_area;
478  	int err;
479  
480  	if (node == NUMA_NO_NODE)
481  		node = tsk_fork_get_node(orig);
482  	tsk = alloc_task_struct_node(node);  //申請task_struct 空間
483  	if (!tsk)
484  		return NULL;
485     //創建內核棧
486  	stack = alloc_thread_stack_node(tsk, node);
487  	if (!stack)
488  		goto free_tsk;
489  
490  	stack_vm_area = task_stack_vm_area(tsk);
491  
492  	err = arch_dup_task_struct(tsk, orig);
493  
494  	/*
495  	 * arch_dup_task_struct() clobbers the stack-related fields.  Make
496  	 * sure they're properly initialized before using any stack-related
497  	 * functions again.
498  	 */
499  	tsk->stack = stack;  //task_struct有個成員指向內核棧的起始地址
        ...
545  	return tsk;
546  
547  free_stack:
548  	free_thread_stack(tsk);
549  free_tsk:
550  	free_task_struct(tsk);
551  	return NULL;
552  }
copy_process()還調用 copy_thread_tls()對內核棧進行修改

linux-4.10/include/linux/sched.h

2981  /* Architectures that haven't opted into copy_thread_tls get the tls argument
2982   * via pt_regs, so ignore the tls argument passed via C. */
2983  static inline int copy_thread_tls(
2984  		unsigned long clone_flags, unsigned long sp, unsigned long arg,
2985  		struct task_struct *p, unsigned long tls)
2986  {
2987  	return copy_thread(clone_flags, sp, arg, p);
2988  }

arm64的實現在  linux-4.10/arch/arm64/kernel/process.c

int copy_thread(unsigned long clone_flags, unsigned long stack_start,
253  		unsigned long stk_sz, struct task_struct *p)
254  {
255  	struct pt_regs *childregs = task_pt_regs(p);
256  
257  	memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
258  
259  	if (likely(!(p->flags & PF_KTHREAD))) {
260  		*childregs = *current_pt_regs(); //複製當前進程的寄存器值
                //修改子進程的返回值,用戶空間的返回值,這也是子進程調度執行後返回值爲0的原因
261  		childregs->regs[0] = 0;  
        ...
282  	} else {
        ...
290  	}
291  	p->thread.cpu_context.pc = (unsigned long)ret_from_fork; //子進程調度後執行的函數
292  	p->thread.cpu_context.sp = (unsigned long)childregs;
293  
294  	ptrace_hw_copy_thread(p);
295  
296  	return 0;
297  }
子進程分配到cpu執行後,會跑到  ret_from_fork

linux-4.10/arch/arm64/kernel/entry.S

/*
770   * "slow" syscall return path.
771   */
772  ret_to_user:
773  	disable_irq				// disable interrupts
774  	ldr	x1, [tsk, #TSK_TI_FLAGS]
775  	and	x2, x1, #_TIF_WORK_MASK
776  	cbnz	x2, work_pending
777  finish_ret_to_user:
778  	enable_step_tsk x1, x2
779  	kernel_exit 0
780  ENDPROC(ret_to_user)
781  
782  /*
783   * This is how we return from a fork.
784   */
785  ENTRY(ret_from_fork)
786  	bl	schedule_tail
787  	cbz	x19, 1f				// not a kernel thread
788  	mov	x0, x20
789  	blr	x19
790  1:	get_thread_info tsk
791  	b	ret_to_user
792  ENDPROC(ret_from_fork)

最後跑到 kernel_exit 0 返回到用戶空間,在進入 el0_sync 的時候跑了 kernel_entry 0 ,處理完成後跑 kernel_exit 0 返回,我們看一下這裏做了什麼。

linux-4.10/arch/arm64/kernel/entry.S

/*
783   * This is how we return from a fork.
784   */
785  ENTRY(ret_from_fork)
786  	bl	schedule_tail
787  	cbz	x19, 1f				// not a kernel thread
788  	mov	x0, x20      
789  	blr	x19
790  1:	get_thread_info tsk
791  	b	ret_to_user
792  ENDPROC(ret_from_fork)

72  	.macro	kernel_entry, el, regsize = 64
73  	sub	sp, sp, #S_FRAME_SIZE   //sp = sp - S_FRAME_SIZE   DEFINE(S_FRAME_SIZE,sizeof(struct pt_regs)); sp 指向棧頂
74  	.if	\regsize == 32
75  	mov	w0, w0				// zero upper 32 bits of x0
76  	.endif
77  	stp	x0, x1, [sp, #16 * 0]   //相當於 地址(sp + 16* 0) = x0,x1
78  	stp	x2, x3, [sp, #16 * 1]   //相當於 地址(sp + 16* 1) = x2,x3
79  	stp	x4, x5, [sp, #16 * 2]
80  	stp	x6, x7, [sp, #16 * 3]
81  	stp	x8, x9, [sp, #16 * 4]
82  	stp	x10, x11, [sp, #16 * 5]
83  	stp	x12, x13, [sp, #16 * 6]
84  	stp	x14, x15, [sp, #16 * 7]
85  	stp	x16, x17, [sp, #16 * 8]
86  	stp	x18, x19, [sp, #16 * 9]
87  	stp	x20, x21, [sp, #16 * 10]
88  	stp	x22, x23, [sp, #16 * 11]
89  	stp	x24, x25, [sp, #16 * 12]
90  	stp	x26, x27, [sp, #16 * 13]
91  	stp	x28, x29, [sp, #16 * 14]  // FP(x29)寄存器保存棧幀地址
92  
93  	.if	\el == 0
94  	mrs	x21, sp_el0
95  	ldr_this_cpu	tsk, __entry_task, x20	// Ensure MDSCR_EL1.SS is clear,
96  	ldr	x19, [tsk, #TSK_TI_FLAGS]	// since we can unmask debug
97  	disable_step_tsk x19, x20		// exceptions when scheduling.
98  
99  	mov	x29, xzr			// fp pointed to user-space x29 = xzr =0
100  	.else
101  	add	x21, sp, #S_FRAME_SIZE //x21= sp + S_FRAME_SIZE
102  	get_thread_info tsk  //等價於將thread_info指針保存到tsk(x28)寄存器
103  	/* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
        //DEFINE(TSK_TI_ADDR_LIMIT,offsetof(struct task_struct, thread_info.addr_limit));
104  	ldr	x20, [tsk, #TSK_TI_ADDR_LIMIT]  // x20 = *(tsk + TSK_TI_ADDR_LIMIT)
105  	str	x20, [sp, #S_ORIG_ADDR_LIMIT]
106  	mov	x20, #TASK_SIZE_64
107  	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
108  	/* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
109  	.endif /* \el == 0 */
110  	mrs	x22, elr_el1  //mrs 是讀取系統寄存器指令 x22 = elr_el1
111  	mrs	x23, spsr_el1
112  	stp	lr, x21, [sp, #S_LR] //相當於 地址*(sp + S_LR) = lr,x21 (x21 = sp_el0)
113  
114  #ifdef CONFIG_ARM64_SW_TTBR0_PAN
115  	/*
116  	 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
117  	 * EL0, there is no need to check the state of TTBR0_EL1 since
118  	 * accesses are always enabled.
119  	 * Note that the meaning of this bit differs from the ARMv8.1 PAN
120  	 * feature as all TTBR0_EL1 accesses are disabled, not just those to
121  	 * user mappings.
122  	 */
123  alternative_if ARM64_HAS_PAN
124  	b	1f				// skip TTBR0 PAN
125  alternative_else_nop_endif
126  
127  	.if	\el != 0
128  	mrs	x21, ttbr0_el1
129  	tst	x21, #0xffff << 48		// Check for the reserved ASID
130  	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
131  	b.eq	1f				// TTBR0 access already disabled
132  	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
133  	.endif
134  
135  	__uaccess_ttbr0_disable x21
136  1:
137  #endif
138  
139  	stp	x22, x23, [sp, #S_PC]   //*(sp+ S_PC) =x22,x23
140  
141  	/*
142  	 * Set syscallno to -1 by default (overridden later if real syscall).
143  	 */
144  	.if	\el == 0
        //mvn:與mov指令用法差不多,唯一的區別是:它賦值的時候,先按位取反
145  	mvn	x21, xzr  // XZR 零寄存器,寫該寄存器會被忽略,讀該寄存器會得到全0值。
146  	str	x21, [sp, #S_SYSCALLNO]  // *(sp + S_SYSCALLNO) = x21 
147  	.endif
148  
149  	/*
150  	 * Set sp_el0 to current thread_info.
151  	 */
152  	.if	\el == 0
153  	msr	sp_el0, tsk  //sp_el0 = tsk(x28)
154  	.endif
155  
156  	/*
157  	 * Registers that may be useful after this macro is invoked:
158  	 *
159  	 * x21 - aborted SP   //
160  	 * x22 - aborted PC   //
161  	 * x23 - aborted PSTATE
162  	*/
163  	.endm

結合struct pt_regs 的定義,我們看一下內核棧發生了什麼變化

linux-4.10/arch/arm64/include/asm/ptrace.h

struct pt_regs {
109  	union {
110  		struct user_pt_regs user_regs;
111  		struct {
112  			u64 regs[31];
113  			u64 sp;
114  			u64 pc;
115  			u64 pstate;
116  		};
117  	};
118  	u64 orig_x0;
119  	u64 syscallno;
120  	u64 orig_addr_limit;
121  	u64 unused;	// maintain 16 byte alignment
122  };

所以kernel_entry 0 執行之後,就是把一些寄存器的值保存到了內核棧中,pt_regs這個結構體放在棧頂,後面返回用戶空間時,把這些寄存器的值還原回去。



linux-4.10/arch/arm64/kernel/entry.S

.macro	kernel_exit, el
166  	.if	\el != 0
167  	/* Restore the task's original addr_limit. */
168  	ldr	x20, [sp, #S_ORIG_ADDR_LIMIT]
169  	str	x20, [tsk, #TSK_TI_ADDR_LIMIT]
170  
171  	/* No need to restore UAO, it will be restored from SPSR_EL1 */
172  	.endif
173  
174  	ldp	x21, x22, [sp, #S_PC]		// load ELR, SPSR
175  	.if	\el == 0
176  	ct_user_enter
177  	.endif
178  
179  #ifdef CONFIG_ARM64_SW_TTBR0_PAN
180  	/*
181  	 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
182  	 * PAN bit checking.
183  	 */
184  alternative_if ARM64_HAS_PAN
185  	b	2f				// skip TTBR0 PAN
186  alternative_else_nop_endif
187  
188  	.if	\el != 0
189  	tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
190  	.endif
191  
192  	__uaccess_ttbr0_enable x0
193  
194  	.if	\el == 0
195  	/*
196  	 * Enable errata workarounds only if returning to user. The only
197  	 * workaround currently required for TTBR0_EL1 changes are for the
198  	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
199  	 * corruption).
200  	 */
201  	post_ttbr0_update_workaround
202  	.endif
203  1:
204  	.if	\el != 0
205  	and	x22, x22, #~PSR_PAN_BIT		// ARMv8.0 CPUs do not understand this bit
206  	.endif
207  2:
208  #endif
209  
210  	.if	\el == 0
211  	ldr	x23, [sp, #S_SP]		// load return stack pointer
212  	msr	sp_el0, x23
213  #ifdef CONFIG_ARM64_ERRATUM_845719
214  alternative_if ARM64_WORKAROUND_845719
215  	tbz	x22, #4, 1f
216  #ifdef CONFIG_PID_IN_CONTEXTIDR
217  	mrs	x29, contextidr_el1
218  	msr	contextidr_el1, x29
219  #else
220  	msr contextidr_el1, xzr
221  #endif
222  1:
223  alternative_else_nop_endif
224  #endif
225  	.endif
226  
227  	msr	elr_el1, x21			// set up the return data
228  	msr	spsr_el1, x22
229  	ldp	x0, x1, [sp, #16 * 0]   //相當於 x0,x1 = *(sp + 16*0)
230  	ldp	x2, x3, [sp, #16 * 1]   //相當於 x2,x3 = *(sp + 16*1)
231  	ldp	x4, x5, [sp, #16 * 2]
232  	ldp	x6, x7, [sp, #16 * 3]
233  	ldp	x8, x9, [sp, #16 * 4]
234  	ldp	x10, x11, [sp, #16 * 5]
235  	ldp	x12, x13, [sp, #16 * 6]
236  	ldp	x14, x15, [sp, #16 * 7]
237  	ldp	x16, x17, [sp, #16 * 8]
238  	ldp	x18, x19, [sp, #16 * 9]
239  	ldp	x20, x21, [sp, #16 * 10]
240  	ldp	x22, x23, [sp, #16 * 11]
241  	ldp	x24, x25, [sp, #16 * 12]
242  	ldp	x26, x27, [sp, #16 * 13]
243  	ldp	x28, x29, [sp, #16 * 14]
244  	ldr	lr, [sp, #S_LR]
245  	add	sp, sp, #S_FRAME_SIZE		// restore sp
246  	eret					// 異常返回,使用當前的SPSR_ELx和ELR_ELx
247  	.endm

所以kernel_exit 0 就是在返回用戶空間之前,把保存在pt_regs 結構體中的值重新寫回對應的寄存器,注意在copy_thread()中已經把子進程pt_regs修改了 childregs->regs[0]= 0; (後面賦值給x0寄存器),在arm中函數的返回值一般都放在x0 (32位是R0)中,所以子進程從fork()函數返回的值是0。


2、execve()加載進程

因爲系統調用的具體細節上面也有分析過了,這裏就不再重複,直接走到execve()在kernel中對應的處理函數。

linux-4.10/include/uapi/asm-generic/unistd.h

607  #define __NR_execve 221
608  __SC_COMP(__NR_execve, sys_execve, compat_sys_execve)

和sys_clone()一樣,對應的實現函數用宏包了起來,在  linux-4.10/fs/exec.c

1887  SYSCALL_DEFINE3(execve,
1888  		const char __user *, filename,
1889  		const char __user *const __user *, argv,
1890  		const char __user *const __user *, envp)
1891  {
1892  	return do_execve(getname(filename), argv, envp);
1893  }

接着跑到do_execve()函數中

int do_execve(struct filename *filename,
1806  	const char __user *const __user *__argv,
1807  	const char __user *const __user *__envp)
1808  {
1809  	struct user_arg_ptr argv = { .ptr.native = __argv };
1810  	struct user_arg_ptr envp = { .ptr.native = __envp };
1811  	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
1812  }

關鍵的實現就在do_execveat_common()這個函數裏面了

/*
1657   * sys_execve() executes a new program.
1658   */
1659  static int do_execveat_common(int fd, struct filename *filename,
1660  			      struct user_arg_ptr argv,
1661  			      struct user_arg_ptr envp,
1662  			      int flags)
1663  {
1664  	char *pathbuf = NULL;
1665  	struct linux_binprm *bprm;
1666  	struct file *file;
1667  	struct files_struct *displaced;
1668  	int retval;
        ...
1693  	retval = -ENOMEM;
1694  	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); //申請一個linux_binprm 結構體
        ...
1705  	file = do_open_execat(fd, filename, flags);  //打開可執行文件
1706  	retval = PTR_ERR(file);
1707  	if (IS_ERR(file))
1708  		goto out_unmark;
1709  
1710  	sched_exec();
1711  
        ...
1736  	retval = bprm_mm_init(bprm); //創建進程的內存地址空間
        ...
1748  	retval = prepare_binprm(bprm);  //讀取可執行文件前面的128字節
        ...
1767  	retval = exec_binprm(bprm);   //加載程序到內存並運行
         ...
1771  	/* execve succeeded */
1772  	current->fs->in_exec = 0;
1773  	current->in_execve = 0;
1774  	acct_update_integrals(current);
1775  	task_numa_free(current);
1776  	free_bprm(bprm);
1777  	kfree(pathbuf);
1778  	putname(filename);
1779  	if (displaced)
1780  		put_files_struct(displaced);
1781  	return retval;
        ...
1803  }
我們重點看一下exec_binprm()
1634  static int exec_binprm(struct linux_binprm *bprm)
1635  {
1636  	pid_t old_pid, old_vpid;
1637  	int ret;
1638  
1639  	/* Need to fetch pid before load_binary changes it */
1640  	old_pid = current->pid;
1641  	rcu_read_lock();
1642  	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
1643  	rcu_read_unlock();
1644  
1645  	ret = search_binary_handler(bprm);  //嘗試加載該可執行程序
1646  	if (ret >= 0) {
1647  		audit_bprm(bprm);
1648  		trace_sched_process_exec(current, old_pid, bprm);
1649  		ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1650  		proc_exec_connector(current);
1651  	}
1652  
1653  	return ret;
1654  }

1581  int search_binary_handler(struct linux_binprm *bprm)
1582  {
1583  	bool need_retry = IS_ENABLED(CONFIG_MODULES);
1584  	struct linux_binfmt *fmt;
1585  	int retval;
        ...
1595  	retval = -ENOENT;
1596   retry:
1597  	read_lock(&binfmt_lock);
1598  	list_for_each_entry(fmt, &formats, lh) {
1599  		if (!try_module_get(fmt->module))
1600  			continue;
1601  		read_unlock(&binfmt_lock);
1602  		bprm->recursion_depth++;
1603  		retval = fmt->load_binary(bprm);  //這裏纔是真正調用到handler 加載
1604  		read_lock(&binfmt_lock);
1605  		put_binfmt(fmt);
1606  		bprm->recursion_depth--;
1607  		if (retval < 0 && !bprm->mm) {
1608  			/* we got to flush_old_exec() and failed after it */
1609  			read_unlock(&binfmt_lock);
1610  			force_sigsegv(SIGSEGV, current);
1611  			return retval;
1612  		}
1613  		if (retval != -ENOEXEC || !bprm->file) {
1614  			read_unlock(&binfmt_lock);
1615  			return retval;
1616  		}
1617  	}
        ... 
1630  	return retval;
1631  }
search_binary_handler()會嘗試調用註冊的handler 去加載程序

 linux-4.10/fs/exec.c

72  static LIST_HEAD(formats);
73  static DEFINE_RWLOCK(binfmt_lock);
74  
75  void __register_binfmt(struct linux_binfmt * fmt, int insert)
76  {
77  	BUG_ON(!fmt);
78  	if (WARN_ON(!fmt->load_binary))
79  		return;
80  	write_lock(&binfmt_lock);
81  	insert ? list_add(&fmt->lh, &formats) :
82  		 list_add_tail(&fmt->lh, &formats);
83  	write_unlock(&binfmt_lock);
84  }
exec.c中創建了formats 對應的鏈表結構體,並提供了註冊的方法

linux-4.10/include/linux/binfmts.h

86  /* Registration of default binfmt handlers */
87  static inline void register_binfmt(struct linux_binfmt *fmt)
88  {
89  	__register_binfmt(fmt, 0);
90  }
91  /* Same as above, but adds a new binfmt at the top of the list */
92  static inline void insert_binfmt(struct linux_binfmt *fmt)
93  {
94  	__register_binfmt(fmt, 1);
95  }

linux-4.10/fs/binfmt_elf.c

2326  static int __init init_elf_binfmt(void)
2327  {
2328  	register_binfmt(&elf_format);
2329  	return 0;
2330  }
在elf 初始化函數裏面把它註冊了進來

84  static struct linux_binfmt elf_format = {
85  	.module		= THIS_MODULE,
86  	.load_binary	= load_elf_binary,
87  	.load_shlib	= load_elf_library,
88  	.core_dump	= elf_core_dump,
89  	.min_coredump	= ELF_EXEC_PAGESIZE,
90  };
所以上面的   retval = fmt->load_binary(bprm);  對應的是 load_elf_binary()函數

668  static int load_elf_binary(struct linux_binprm *bprm)
669  {
        ...
696  	/* Get the exec-header */
697  	loc->elf_ex = *((struct elfhdr *)bprm->buf);  //可執行程序文件頭的128字節
698     //下面就是根據文件頭來判斷是否加載
699  	retval = -ENOEXEC;
700  	/* First of all, some simple consistency checks */
701  	if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
702  		goto out;
703  
704  	if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
705  		goto out;
706  	if (!elf_check_arch(&loc->elf_ex))
707  		goto out;
         ...  //加載過程省略
1090  	start_thread(regs, elf_entry, bprm->p); //加載完成之後跑進程裏面的代碼
1091  	retval = 0;
1092  out:
1093  	kfree(loc);
1094  out_ret:
1095  	return retval;
1108  }

加載的過程這裏就不細講了,這個過程中會解析elf 的一些段,並執行一些初始化操作,也就是在程序中的main()函數還沒跑起來之前就已經跑了很多其他的代碼,後面會介紹elf文件格式。




發佈了62 篇原創文章 · 獲贊 37 · 訪問量 13萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章