Xiaoming Learns the Linux Kernel: Processes (Part 1)

Contents

Processes

1. The process descriptor

thread_info

Process name

Process state

Process identifiers

Process family relationships

Process address space

2. Process creation

The _do_fork workflow

Creation of process 0

3. Process switching

4. Process debugging tips

References


Processes

In Linux, a process is also called a task.

From the user-space point of view, a process is an instance of a program in execution. A process can be compared to a human being: it has exactly one parent but may have many children; it lives a longer or shorter useful life; it performs some function or task; it has a number of possible states, exactly one of which holds at any given moment; and it eventually dies.

From the kernel-space point of view, a process is the entity to which system resources are allocated: each process obtains CPU time, memory, and other resources through the kernel in order to carry out its work.

1. The process descriptor

Since a process is an instance of a program in execution, it holds far more than program instructions: it owns a large set of resources and an independent virtual address space. To manage processes well, the kernel must know exactly what state each process is in and what it is doing. This is the role of the process descriptor. In Linux, a process is described by a task_struct structure.

Every process descriptor is a task_struct, which contains all the information related to the process.

Source location on an Ubuntu system (Linux 4.15): /usr/src/linux-headers-4.15.0-72/include/linux/sched.h

Online source: https://elixir.bootlin.com/linux/v5.5.2/source/include/linux/sched.h

Since task_struct is very large and complex, only the main fields are listed here:

struct task_struct {
        //thread_info
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info      thread_info;
        //end

        //process run state
	/* -1 unrunnable, 0 runnable, >0 stopped: */
	volatile long			state;
        //end

        //kernel-mode stack
	void				*stack;
        //end

        //per-task flag bits
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
        //end

        //process priority info
	int				prio;//dynamic priority
	int				static_prio;//static priority; can be changed directly with nice
	int				normal_prio;//derived from the static priority and the scheduling policy
	unsigned int			rt_priority;//real-time priority
        //end

        //process memory info
	struct mm_struct		*mm;
	struct mm_struct		*active_mm;

	/* Per-thread vma caching: */
	struct vmacache			vmacache;

#ifdef SPLIT_RSS_COUNTING
	struct task_rss_stat		rss_stat;
#endif
        //end

        //process exit state
	int				exit_state;
	int				exit_code;
	int				exit_signal;
	/* The signal sent when the parent dies: */
	int				pdeath_signal;
        //end

        //atomic access flags
	unsigned long			atomic_flags; /* Flags requiring atomic access. */
        //end

        //process identifiers
	pid_t				pid;
	pid_t				tgid;
        //end

        //process family relationships
	/*
	 * Pointers to the (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with
	 * p->real_parent->pid)
	 */

	/* Real parent process: */
	struct task_struct __rcu	*real_parent; //the real parent: the process that forked us
	/* Recipient of SIGCHLD, wait4() reports: */
	struct task_struct __rcu	*parent; //nominal parent; normally the same as real_parent

	/*
	 * Children/sibling form the list of natural children:
	 */
	struct list_head		children;
	struct list_head		sibling;
	struct task_struct		*group_leader;
        //end

        //time accounting info
	u64				utime;
	u64				stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
	u64				utimescaled;
	u64				stimescaled;
#endif
	u64				gtime;
	struct prev_cputime		prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	struct vtime			vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
	atomic_t			tick_dep_mask;
#endif
	/* Context switch counts: */
	unsigned long			nvcsw;
	unsigned long			nivcsw;

	/* Monotonic time in nsecs: */
	u64				start_time;

	/* Boot based time in nsecs: */
	u64				real_start_time;

	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
	unsigned long			min_flt;
	unsigned long			maj_flt;

#ifdef CONFIG_POSIX_TIMERS
	struct task_cputime		cputime_expires;
	struct list_head		cpu_timers[3];
#endif
        //end

        //executable name, at most 15 characters
	/*
	 * executable name, excluding path.
	 *
	 * - normally initialized setup_new_exec()
	 * - access it with [gs]et_task_comm()
	 * - lock it with task_lock()
	 */
	char				comm[TASK_COMM_LEN];
        //end

	/* Filesystem information: */
	struct fs_struct		*fs;

	/* Open file information: */
	struct files_struct		*files;

	/* Namespaces: */
	struct nsproxy			*nsproxy;

	/* Signal handlers: */
	struct signal_struct		*signal;
	struct sighand_struct		*sighand;
	sigset_t			blocked;
	sigset_t			real_blocked;
	/* Restored if set_restore_sigmask() was used: */
	sigset_t			saved_sigmask;
	struct sigpending		pending;
	unsigned long			sas_ss_sp;
	size_t				sas_ss_size;
	unsigned int			sas_ss_flags;

	/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
	spinlock_t			alloc_lock;

	/*
	 * WARNING: on x86, 'thread_struct' contains a variable-sized
	 * structure.  It *MUST* be at the end of 'task_struct'.
	 *
	 * Do not put anything below here!
	 */
};

thread_info

thread_info is defined under the arch directory; each architecture has its own thread_info.

Take the arm architecture as an example: arch/arm/include/asm/thread_info.h. Its task member points to the task_struct.

thread_info holds the part of the process data that the low-level assembly code needs direct access to. On arm, thread_info embeds a pointer to the task_struct, so the task_struct can be reached conveniently through thread_info. The thread_info layout differs from one architecture to another.

/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	int			preempt_count;	/* 0 => preemptable, <0 => bug */
	mm_segment_t		addr_limit;	/* address limit */
	struct task_struct	*task;		/* main task structure */
	__u32			cpu;		/* cpu */
	__u32			cpu_domain;	/* cpu domain */
#ifdef CONFIG_STACKPROTECTOR_PER_TASK
	unsigned long		stack_canary;
#endif
	struct cpu_context_save	cpu_context;	/* cpu context */
	__u32			syscall;	/* syscall number */
	__u8			used_cp[16];	/* thread used copro */
	unsigned long		tp_value[2];	/* TLS registers */
#ifdef CONFIG_CRUNCH
	struct crunch_state	crunchstate;
#endif
	union fp_state		fpstate __attribute__((aligned(8)));
	union vfp_state		vfpstate;
#ifdef CONFIG_ARM_THUMBEE
	unsigned long		thumbee_state;	/* ThumbEE Handler Base register */
#endif
};
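
Because the kernel stack is THREAD_SIZE-aligned and thread_info sits at its base on 32-bit arm, masking the low bits off the stack pointer recovers the thread_info, and its task member then yields the task_struct. A minimal sketch of this lookup (it mirrors the kernel's own current_thread_info() on arm; exact details vary by kernel version):

static inline struct thread_info *current_thread_info(void)
{
	/* the stack is THREAD_SIZE-aligned, so masking sp finds its base */
	register unsigned long sp asm ("sp");
	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}

/* the embedded task pointer then gives the task_struct */
#define get_current() (current_thread_info()->task)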

In arch/arm64/include/asm/thread_info.h, on the other hand, the thread_info structure is very short: in kernels after Linux 4.9, arm64 stores the pointer to the current task_struct in the sp_el0 register instead of in thread_info. This prevents thread_info from being corrupted by a stack overflow, making many attacks much harder.

/*
 * low level task data that entry.S needs immediate access to.
 */
struct thread_info {
	unsigned long		flags;		/* low level flags */
	mm_segment_t		addr_limit;	/* address limit */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
	u64			ttbr0;		/* saved TTBR0_EL1 */
#endif
	union {
		u64		preempt_count;	/* 0 => preemptible, <0 => bug */
		struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
			u32	need_resched;
			u32	count;
#else
			u32	count;
			u32	need_resched;
#endif
		} preempt;
	};
};

Process name

/* Task command name length: */
#define TASK_COMM_LEN			16

/*
 * executable name, excluding path.
 *
 * - normally initialized setup_new_exec()
 * - access it with [gs]et_task_comm()
 * - lock it with task_lock()
 */
char				comm[TASK_COMM_LEN];

comm is the process name (strictly, the executable name), at most 15 characters plus the terminating NUL.
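
A small user-space demo of the 15-character limit, using the standard prctl(2) interface (PR_SET_NAME silently truncates longer names):

#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

int main(void)
{
	char name[16];

	/* names longer than 15 characters are silently truncated */
	prctl(PR_SET_NAME, "a-very-long-task-name");
	prctl(PR_GET_NAME, name);
	printf("comm = %s (length %zu)\n", name, strlen(name)); /* 15 */
	return 0;
}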

Process state

/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
#define TASK_RUNNING		0x0000 //runnable: either currently running or ready to run
#define TASK_INTERRUPTIBLE	0x0001 //interruptible wait state
#define TASK_UNINTERRUPTIBLE	0x0002 //uninterruptible wait state
#define __TASK_STOPPED		0x0004 //stopped
#define __TASK_TRACED		0x0008 //being traced
/* in tsk->exit_state */
#define EXIT_DEAD		0x0010 //final dead state, after the parent has reaped the task
#define EXIT_ZOMBIE		0x0020 //zombie: terminated, but not yet reaped by the parent
#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */
#define TASK_DEAD		0x0040 //task is exiting; it becomes EXIT_ZOMBIE until reaped, then EXIT_DEAD
#define TASK_WAKEKILL		0x0080 //wait state that can still be woken by fatal signals
#define TASK_WAKING		0x0100 //task is in the middle of being woken up
#define TASK_PARKED		0x0200
#define TASK_NOLOAD		0x0400
#define TASK_NEW		0x0800
#define TASK_STATE_MAX		0x1000
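
These state values are normally manipulated through helpers rather than assigned directly. A minimal sketch of the canonical kernel sleep pattern, assuming some wakeup path sets the condition and calls wake_up_process() on this task (locking is omitted):

#include <linux/sched.h>

/* sleep until *cond becomes true; a waker sets *cond, then wakes us */
static void wait_for_cond(bool *cond)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (*cond)
			break;
		schedule();	/* give up the CPU until woken */
	}
	__set_current_state(TASK_RUNNING);
}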

Process identifiers

//process identifiers
pid_t				pid;
pid_t				tgid;

These are the familiar pid and tgid.

pid is the process ID; each fork creates a new, unique pid.

tgid is the thread group ID: all tasks in the same thread group share the same TGID; the thread group leader's PID equals its TGID; and a process that uses no threads also has TGID == PID.
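
A small user-space demo of this relationship (older glibc has no gettid() wrapper, so the raw syscall is used): from the kernel's point of view each pthread is a task with its own pid, while getpid() returns the shared tgid.

#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>

static void *worker(void *arg)
{
	/* getpid() returns the TGID; SYS_gettid returns this task's pid */
	printf("thread: pid(TGID)=%d tid(PID)=%ld\n",
	       getpid(), syscall(SYS_gettid));
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* in the group leader, PID == TGID */
	printf("main:   pid(TGID)=%d tid(PID)=%ld\n",
	       getpid(), syscall(SYS_gettid));
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}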

Process family relationships

/*
 * Pointers to the (original) parent process, youngest child, younger sibling,
 * older sibling, respectively.  (p->father can be replaced with
 * p->real_parent->pid)
 */

/* Real parent process: */
struct task_struct __rcu	*real_parent;

/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu	*parent;

/*
 * Children/sibling form the list of natural children:
 */
struct list_head		children;
struct list_head		sibling;
struct task_struct		*group_leader;

Here real_parent points to the real parent, the process that forked us, while parent is the nominal parent, the process that traces us. Normally parent is the same as real_parent. On exit, the wait4() result is reported to parent.
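
As a kernel-side sketch of how these fields link together (locking conventions vary across kernel versions): each child hangs off its parent's children list through its own sibling member, so a parent's children can be walked like this:

#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/list.h>

static void dump_children(struct task_struct *task)
{
	struct task_struct *child;

	/* each child is linked into the parent's children list
	 * through its own sibling member */
	read_lock(&tasklist_lock);
	list_for_each_entry(child, &task->children, sibling)
		pr_info("child: %s [%d]\n", child->comm, child->pid);
	read_unlock(&tasklist_lock);
}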

Process address space

Each user-space process has its own 3GB of virtual address space, and all user-space processes share the 1GB kernel space (the classic 3G/1G split on 32-bit). Virtual memory itself will be covered later, in the memory-management notes. For a process, the mm and active_mm fields of task_struct describe its memory. A kernel thread has no 3GB user space of its own, so every kernel thread's mm is NULL. However, the page-directory address is stored in the mm, and the scheduler must switch page tables when it switches from another process to a kernel thread; this is why active_mm was added. A kernel thread, whose mm is NULL, simply borrows another process's mm.

In other words, a kernel thread has mm == NULL and active_mm pointing at another process's mm, so a context switch can always use active_mm uniformly. Because all processes share the same 1GB kernel space, running on the borrowed address space causes no problems in kernel mode (see context_switch in section 3).

struct mm_struct		*mm;
struct mm_struct		*active_mm;

2. Process creation

When a process is created, the child takes over a copy of all the parent's resources. A process is created with the fork() or clone() system call; no matter how it is created, as far as the kernel is concerned the core of the work is the call to _do_fork().

Before Linux 4.2 the kernel created processes with do_fork(); versions after 4.2 added support for CLONE_SETTLS, hence the extra tls parameter of _do_fork().

The code lives in kernel/fork.c.

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	……
	return nr;
}

clone_flags selects the creation options. Its low byte (the CSIGNAL mask) specifies the signal sent to the parent when the child exits, which is usually SIGCHLD. The available clone_flags bits are:

/*
 * cloning flags:
 */
#define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM	0x00000100	/* set if VM shared between processes */
#define CLONE_FS	0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES	0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD	0x00010000	/* Same thread group? */
#define CLONE_NEWNS	0x00020000	/* New mount namespace group */
#define CLONE_SYSVSEM	0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS	0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
#define CLONE_NEWCGROUP		0x02000000	/* New cgroup namespace */
#define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
#define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
#define CLONE_NEWUSER		0x10000000	/* New user namespace */
#define CLONE_NEWPID		0x20000000	/* New pid namespace */
#define CLONE_NEWNET		0x40000000	/* New network namespace */
#define CLONE_IO		0x80000000	/* Clone io context */

stack_start is the starting address of the child's user-mode stack. It is usually set to 0: the parent's stack pointer is simply duplicated, and when the child writes to the stack, the page-fault handler allocates new physical pages for it.

stack_size and tls are both used in copy_thread_tls and are generally set to 0.

parent_tidptr is a pointer into user memory; when CLONE_PARENT_SETTID is set, the kernel returns the new child's ID to the parent through parent_tidptr.

child_tidptr is a pointer into user memory; when CLONE_CHILD_SETTID is set, the kernel returns the new child's ID to the child through child_tidptr.
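
A minimal user-space sketch of driving these parameters through the glibc clone(3) wrapper (the stack size and flag choice here are illustrative, not required values):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

#define STACK_SIZE (1024 * 1024)

static int child_fn(void *arg)
{
	printf("child: pid=%d\n", getpid());
	return 0;
}

int main(void)
{
	/* the stack grows down, so pass the high end to clone() */
	char *stack = malloc(STACK_SIZE);
	if (!stack)
		return 1;

	/* SIGCHLD in the low byte (CSIGNAL) is the exit signal sent to
	 * the parent; adding CLONE_VM would share the address space,
	 * the way a thread does */
	pid_t pid = clone(child_fn, stack + STACK_SIZE, SIGCHLD, NULL);
	if (pid == -1)
		return 1;

	waitpid(pid, NULL, 0);
	printf("parent: reaped child %d\n", pid);
	free(stack);
	return 0;
}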

The _do_fork workflow

The basic workflow of _do_fork is as follows:

_do_fork(){
        struct task_struct *p;
        long nr;
        //1. allocate a task_struct for the child, copy the parent's resources, initialize the child's own resources
        p = copy_process
                struct task_struct *p;
                dup_task_struct //allocate a task_struct and kernel stack for the child, copy the parent's task_struct into it, and set up thread_info
                        struct task_struct *tsk;
                        node = tsk_fork_get_node(orig); //pick the node to allocate from
                        tsk = alloc_task_struct_node(node); //allocate the task_struct
                        stack = alloc_thread_stack_node(tsk, node); //allocate the thread_info and stack
                        stack_vm_area = task_stack_vm_area(tsk); //get the stack's vm area
                        return tsk;
                p->xxx = xxx //initialize the child's individual members
                alloc_pid & pid_nr //allocate the pid
                return p;
        //2. obtain the pid
        pid = get_task_pid(p, PIDTYPE_PID);
                get_pid
        nr = pid_vnr(pid);
                pid_nr_ns
        //3. put the child on the run queue, ready to execute
        wake_up_new_task(p)
        return nr;
}

_do_fork's main job is to allocate the task_struct for the child, including its memory and stack; copy the parent's resources to the child; initialize the child's own members and resources; and finally put the child on the run queue, ready to execute.

Most of _do_fork's work happens in copy_process, which performs the necessary checks, copies the required data structures, and initializes the child. The many copy_xxx functions each duplicate the corresponding xxx resource.

The copy_process code looks like this:

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
	struct task_struct *p;

        //extensive sanity and error checks
        //……

        //duplicate the parent's resources for the child
	p = dup_task_struct(current, node);

	//many p->xxx = xxx assignments and copy_xxx() calls initializing the child's own members
        //……

        //allocate a new pid
	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
				args->set_tid_size);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_thread;
		}
	}

        //……

	return p;

//many error-handling branches
//……
ERR:
        return ERR_PTR(retval);
}

Creation of process 0

The fork path above describes how a parent creates a child. But how does the very first process come into being? The first process is created directly at kernel boot; it is usually called process 0, the swapper process. INIT_TASK lives in include/linux/init_task.h and is defined as follows:

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
 */
#define INIT_TASK(tsk)	\
{									\
	INIT_TASK_TI(tsk)						\
	.state		= 0,						\
	.stack		= init_stack,					\
	.usage		= ATOMIC_INIT(2),				\
	.flags		= PF_KTHREAD,					\
	.prio		= MAX_PRIO-20,					\
	.static_prio	= MAX_PRIO-20,					\
	.normal_prio	= MAX_PRIO-20,					\
	.policy		= SCHED_NORMAL,					\
	.cpus_allowed	= CPU_MASK_ALL,					\
	.nr_cpus_allowed= NR_CPUS,					\
	.mm		= NULL,						\
	.active_mm	= &init_mm,					\
	.restart_block = {						\
		.fn = do_no_restart_syscall,				\
	},								\
	.se		= {						\
		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
	},								\
	.rt		= {						\
		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
		.time_slice	= RR_TIMESLICE,				\
	},								\
	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
	INIT_PUSHABLE_TASKS(tsk)					\
	INIT_CGROUP_SCHED(tsk)						\
	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
	.real_parent	= &tsk,						\
	.parent		= &tsk,						\
	.children	= LIST_HEAD_INIT(tsk.children),			\
	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
	.group_leader	= &tsk,						\
	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
	.comm		= INIT_TASK_COMM,				\
	.thread		= INIT_THREAD,					\
	.fs		= &init_fs,					\
	.files		= &init_files,					\
	.signal		= &init_signals,				\
	.sighand	= &init_sighand,				\
	.nsproxy	= &init_nsproxy,				\
	.pending	= {						\
		.list = LIST_HEAD_INIT(tsk.pending.list),		\
		.signal = {{0}}},					\
	.blocked	= {{0}},					\
	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
	.journal_info	= NULL,						\
	INIT_CPU_TIMERS(tsk)						\
	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),	\
	.timer_slack_ns = 50000, /* 50 usec default slack */		\
	.pids = {							\
		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
	},								\
	.thread_group	= LIST_HEAD_INIT(tsk.thread_group),		\
	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),	\
	INIT_IDS							\
	INIT_PERF_EVENTS(tsk)						\
	INIT_TRACE_IRQFLAGS						\
	INIT_LOCKDEP							\
	INIT_FTRACE_GRAPH						\
	INIT_TRACE_RECURSION						\
	INIT_TASK_RCU_PREEMPT(tsk)					\
	INIT_TASK_RCU_TASKS(tsk)					\
	INIT_CPUSET_SEQ(tsk)						\
	INIT_RT_MUTEXES(tsk)						\
	INIT_PREV_CPUTIME(tsk)						\
	INIT_VTIME(tsk)							\
	INIT_NUMA_BALANCING(tsk)					\
	INIT_KASAN(tsk)							\
	INIT_LIVEPATCH(tsk)						\
	INIT_TASK_SECURITY						\
}

All of init_task's resources are initialized by INIT_TASK. In start_kernel, rest_init calls kernel_thread to create the kernel_init process, using process 0 as the template. kernel_init later starts the init program, handing the boot sequence over to user space.

//source: init/init_task.c
/* Initial task structure */
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);

//source: init/main.c
asmlinkage __visible void __init start_kernel(void)
{
    //……
    rest_init();
}

static noinline void __ref rest_init(void)
{
	struct task_struct *tsk;
	int pid;

	rcu_scheduler_starting();
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);
	/*
	 * Pin init on the boot CPU. Task migration is not properly working
	 * until sched_init_smp() has been run. It will set the allowed
	 * CPUs for init to the non isolated CPUs.
	 */
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();

	numa_default_policy();
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	rcu_read_lock();
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	rcu_read_unlock();

	/*
	 * Enable might_sleep() and smp_processor_id() checks.
	 * They cannot be enabled earlier because with CONFIG_PREEMPT=y
	 * kernel_thread() would trigger might_sleep() splats. With
	 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
	 * already, but it's stuck on the kthreadd_done completion.
	 */
	system_state = SYSTEM_SCHEDULING;

	complete(&kthreadd_done);

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	cpu_startup_entry(CPUHP_ONLINE);
}

3. Process switching

The currently running process may give up the CPU, voluntarily or involuntarily, and a process switch must then take place. The main work of switching is done in context_switch.

The current process may give up the CPU voluntarily because it has finished executing or because it blocks; or involuntarily because its time slice is exhausted or because a higher-priority process preempts it.

As the context_switch source shows, if mm is NULL, the next process is a kernel thread, which can only borrow the address space that the prev process is currently using (oldmm = prev->active_mm). Note that it cannot borrow prev's own address space (prev->mm), because prev may itself be a kernel thread whose mm is also NULL.

/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next, struct rq_flags *rf)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		mmgrab(oldmm);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm_irqs_off(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}

	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
	rq_unpin_lock(rq, rf);
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);
	barrier();

	return finish_task_switch(prev);
}

In the context_switch code, the first half prepares a consistent next and prev; the actual context switch is performed by switch_to.

Taking arm64 as an example, switch_to ends up in __switch_to, located in ./arch/arm64/kernel/process.c: a C routine that does the per-thread housekeeping and then hands over to the assembly routine cpu_switch_to.

/*
 * Thread switching.
 */
__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
                struct task_struct *next)
{
    struct task_struct *last;

    fpsimd_thread_switch(next);
    tls_thread_switch(next);
    hw_breakpoint_thread_switch(next);
    contextidr_thread_switch(next);
    entry_task_switch(next);
    uao_thread_switch(next);

    /*
     * Complete any pending TLB or cache maintenance on this CPU in case
     * the thread migrates to a different CPU.
     * This full barrier is also required by the membarrier system
     * call.
     */
    dsb(ish);

    /* the actual thread switch */
    last = cpu_switch_to(prev, next);

    return last;
}

cpu_switch_to is an assembly function, located in arch/arm64/kernel/entry.S:

/*
 * Register switch for AArch64. The callee-saved registers need to be saved
 * and restored. On entry:
 *   x0 = previous task_struct (must be preserved across the switch)
 *   x1 = next task_struct
 * Previous and next are guaranteed not to be the same.
 *
 */
ENTRY(cpu_switch_to)
        //load the THREAD_CPU_CONTEXT offset into x10
	mov	x10, #THREAD_CPU_CONTEXT
        //x8 = prev task_struct + THREAD_CPU_CONTEXT
	add	x8, x0, x10
        //save the current stack pointer in x9
	mov	x9, sp
        //save prev's context
	stp	x19, x20, [x8], #16		// store callee-saved registers
	stp	x21, x22, [x8], #16
	stp	x23, x24, [x8], #16
	stp	x25, x26, [x8], #16
	stp	x27, x28, [x8], #16
	stp	x29, x9, [x8], #16
	str	lr, [x8]
        //switch to next's context
	add	x8, x1, x10
	ldp	x19, x20, [x8], #16		// restore callee-saved registers
	ldp	x21, x22, [x8], #16
	ldp	x23, x24, [x8], #16
	ldp	x25, x26, [x8], #16
	ldp	x27, x28, [x8], #16
	ldp	x29, x9, [x8], #16
	ldr	lr, [x8]
        //restore next's stack pointer
	mov	sp, x9
        //save next into sp_el0, avoiding the security risk of corruption via stack overflow; correspondingly, the task member was removed from thread_info
	msr	sp_el0, x1
        //return to __switch_to
	ret
ENDPROC(cpu_switch_to)
NOKPROBE(cpu_switch_to)

4. Process debugging tips

The Linux kernel defines a global accessor, current, for obtaining the current process's task_struct. Its implementation differs between architectures; this article uses arm64 as the example. The code is in ./arch/arm64/include/asm/current.h:

static __always_inline struct task_struct *get_current(void)
{
	unsigned long sp_el0;
	asm ("mrs %0, sp_el0" : "=r" (sp_el0));
	return (struct task_struct *)sp_el0;
}

#define current get_current()

The code is very concise: it reads the contents of sp_el0, which is precisely the current process's task_struct.

A commonly used log statement:

printk(KERN_INFO "name is %s; pid=%i, tgid=%i, mm=%p, active_mm=%p\n",
	current->comm, current->pid, current->tgid, current->mm, current->active_mm);

This prints the current task's name, pid, tgid, and its mm and active_mm pointers.
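
As a minimal loadable-module sketch of the same technique (the whoami name is made up for illustration): since insmod runs in process context, current inside the init function is the insmod task.

#include <linux/module.h>
#include <linux/sched.h>

static int __init whoami_init(void)
{
	/* current here is the task that ran insmod */
	pr_info("name is %s; pid=%d, tgid=%d, mm=%p, active_mm=%p\n",
		current->comm, current->pid, current->tgid,
		current->mm, current->active_mm);
	return 0;
}

static void __exit whoami_exit(void)
{
}

module_init(whoami_init);
module_exit(whoami_exit);
MODULE_LICENSE("GPL");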

References

《獨闢蹊徑品內核:Linux內核源代碼導讀》

https://www.cnblogs.com/sky-heaven/p/8080265.html

https://blog.csdn.net/zdy0_2004/article/details/54871189
