目錄
進程
進程在linux 裏也叫任務。
對於user space來講,進程是程序執行時的一個實例。可以把進程比作一個人類個體:只有一個父進程,可以有多個子進程;有或多或少的有效生命;承擔一定的功能或任務;擁有不同的狀態,且任一時刻有且只有一種狀態;最終都會消亡。
對於kernel space來講,進程的目的就是合理的分配系統資源的實體。每個進程都會通過內核獲取自己的CPU、內存等系統資源,從而執行任務。
1. 進程描述符
由於進程是程序執行時的一個實例,故除了程序指令之外,還包含了大量的資源,擁有獨立的虛擬地址空間。爲了更好的管理進程,內核必須清楚的瞭解每個進程當前的狀態以及所做的事情。從而,就有了進程描述符這個概念。Linux中,使用task_struct結構來描述一個進程。
進程描述符都是task_struct類型結構,包含所有與進程相關的信息。
ubuntu系統代碼位置(Linux version 4.15):/usr/src/linux-headers-4.15.0-72/include/linux/sched.h
linux在線代碼位置:https://elixir.bootlin.com/linux/v5.5.2/source/include/linux/sched.h
由於task_struct結構體非常龐大複雜,這裏主要列舉主要結構:
struct task_struct {
// thread_info
/*
 * For reasons of header soup (see current_thread_info()), this
 * must be the first element of task_struct.
 */
struct thread_info thread_info;
//end
// run state of the task
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
//end
// kernel-mode stack
void *stack;
//end
// per-task flag bits
/* Per task flags (PF_*), defined further below: */
unsigned int flags;
//end
// priority information
int prio;// dynamic priority
int static_prio;// static priority; can be changed directly via nice
int normal_prio;// derived from the static priority and scheduling policy
unsigned int rt_priority;// real-time priority
//end
// memory-descriptor information
struct mm_struct *mm;
struct mm_struct *active_mm;
/* Per-thread vma caching: */
struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
//end
// exit state
int exit_state;
int exit_code;
int exit_signal;
/* The signal sent when the parent dies: */
int pdeath_signal;
//end
// flags requiring atomic access
unsigned long atomic_flags; /* Flags requiring atomic access. */
//end
// process identifiers
pid_t pid;
pid_t tgid;
//end
// process family relationships
/*
 * Pointers to the (original) parent process, youngest child, younger sibling,
 * older sibling, respectively. (p->father can be replaced with
 * p->real_parent->pid)
 */
/* Real parent process: */
struct task_struct __rcu *real_parent; // the real (forking) parent
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent; // nominal parent; normally equals real_parent
/*
 * Children/sibling form the list of natural children:
 */
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
//end
// time-accounting information
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw;
/* Monotonic time in nsecs: */
u64 start_time;
/* Boot based time in nsecs: */
u64 real_start_time;
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt;
#ifdef CONFIG_POSIX_TIMERS
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif
//end
// process name, at most 15 characters (plus the NUL terminator)
/*
 * executable name, excluding path.
 *
 * - normally initialized setup_new_exec()
 * - access it with [gs]et_task_comm()
 * - lock it with task_lock()
 */
char comm[TASK_COMM_LEN];
//end
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
/* Namespaces: */
struct nsproxy *nsproxy;
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock;
/*
 * WARNING: on x86, 'thread_struct' contains a variable-sized
 * structure. It *MUST* be at the end of 'task_struct'.
 *
 * Do not put anything below here!
 */
};
thread_info
thread_info的定義在arch目錄下,不同的架構有不同的thread_info。
以arm架構爲例:arch/arm/include/asm/thread_info.h。其中的task屬性就是指向task_struct。
thread_info保存了彙編代碼段需要訪問的那部分進程的數據,arm架構在thread_info中嵌入指向task_struct的指針, 則我們可以很方便的通過thread_info來查找task_struct。不同的體系架構擁有不同的thread_info。
/*
 * low level task data that entry.S needs immediate access to.
 * __switch_to() assumes cpu_context follows immediately after cpu_domain.
 *
 * (arm: thread_info carries a back-pointer to the owning task_struct
 * in its 'task' member — see the surrounding article text.)
 */
struct thread_info {
unsigned long flags; /* low level flags */
int preempt_count; /* 0 => preemptable, <0 => bug */
mm_segment_t addr_limit; /* address limit */
struct task_struct *task; /* main task structure */
__u32 cpu; /* cpu */
__u32 cpu_domain; /* cpu domain */
#ifdef CONFIG_STACKPROTECTOR_PER_TASK
unsigned long stack_canary;
#endif
struct cpu_context_save cpu_context; /* cpu context */
__u32 syscall; /* syscall number */
__u8 used_cp[16]; /* thread used copro */
unsigned long tp_value[2]; /* TLS registers */
#ifdef CONFIG_CRUNCH
struct crunch_state crunchstate;
#endif
union fp_state fpstate __attribute__((aligned(8)));
union vfp_state vfpstate;
#ifdef CONFIG_ARM_THUMBEE
unsigned long thumbee_state; /* ThumbEE Handler Base register */
#endif
};
另外,在arch/arm64/include/asm/thread_info.h 中,thread_info結構體非常簡短。由於在linux4.9之後的版本,將thread_info結構體中指向的task_struct的指針存放到sp_el0中。這麼做的目的是可以防止thread_info在堆棧溢出時損壞,從而使許多攻擊更加困難。
/*
 * low level task data that entry.S needs immediate access to.
 *
 * (arm64: note there is no 'task' back-pointer here — since v4.9 the
 * current task_struct pointer is kept in the sp_el0 register instead,
 * so a stack overflow cannot corrupt it.)
 */
struct thread_info {
unsigned long flags; /* low level flags */
mm_segment_t addr_limit; /* address limit */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
u64 ttbr0; /* saved TTBR0_EL1 */
#endif
union {
u64 preempt_count; /* 0 => preemptible, <0 => bug */
struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
u32 need_resched;
u32 count;
#else
u32 count;
u32 need_resched;
#endif
} preempt;
};
};
進程名
/* Task command name length: */
#define TASK_COMM_LEN 16
/*
 * executable name, excluding path.
 *
 * - normally initialized setup_new_exec()
 * - access it with [gs]et_task_comm()
 * - lock it with task_lock()
 *
 * (holds at most TASK_COMM_LEN - 1 = 15 characters plus the NUL terminator)
 */
char comm[TASK_COMM_LEN];
comm是進程名字,長度不超過15個字符。
進程運行狀態
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */
#define TASK_RUNNING 0x0000 // runnable: either currently running or ready to run
#define TASK_INTERRUPTIBLE 0x0001 // interruptible sleep (woken by signals)
#define TASK_UNINTERRUPTIBLE 0x0002 // uninterruptible sleep
#define __TASK_STOPPED 0x0004 // stopped (e.g. by SIGSTOP)
#define __TASK_TRACED 0x0008 // stopped under ptrace
/* in tsk->exit_state */
#define EXIT_DEAD 0x0010 // final dead state: exit status already collected
#define EXIT_ZOMBIE 0x0020 // exited, waiting for the parent to reap it
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */
#define TASK_DEAD 0x0040 // task is exiting; becomes EXIT_ZOMBIE until reaped, then EXIT_DEAD
#define TASK_WAKEKILL 0x0080 // sleep state modifier: still wake on fatal signals
#define TASK_WAKING 0x0100 // task is in the middle of being woken up
#define TASK_PARKED 0x0200 // parked kernel thread
#define TASK_NOLOAD 0x0400
#define TASK_NEW 0x0800
#define TASK_STATE_MAX 0x1000
進程標識符
// process identifiers: pid is per-task, tgid is the thread-group (process) id
pid_t pid;
pid_t tgid;
就是通俗易懂的pid和tgid。
pid是進程id,fork的時候會創建出一個唯一的pid。
tgid是線程組id,處於相同的線程組中的所有進程都有相同的TGID;線程組組長的TGID與其PID相同;一個進程沒有使用線程,則其TGID與PID也相同。
進程親屬關係
/*
 * Pointers to the (original) parent process, youngest child, younger sibling,
 * older sibling, respectively. (p->father can be replaced with
 * p->real_parent->pid)
 */
/* Real parent process: */
struct task_struct __rcu *real_parent; // the process that forked us
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent; // nominal parent (the tracer, if traced); normally equals real_parent
/*
 * Children/sibling form the list of natural children:
 */
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader; // leader of this thread group
其中,real_parent是指向父進程的指針,即fork出自己的那個進程;parent是形式上的父進程,即trace自己的那個進程。一般情況下,parent就是real_parent。子進程退出時,其退出狀態通過wait4()上報給parent。
進程的內存空間
每一個用戶態進程都有獨立的3GB虛擬空間,所有的用戶態進程共享1GB的內核態空間。關於虛擬內存的部分,後續會在內存管理部分整理。對於一個進程來說,task_struct的mm 和active_mm 用來描述進程的內存信息的。其中,由於內核線程沒有3G的用戶虛擬空間,所以所有內核進程的mm成員爲NULL。但是,由於頁目錄的地址是保存在mm中的,從其他進程切換到內核態線程時,調度器需要切換頁表,因此增加了active_mm。對於mm爲NULL的內核態線程,就借用其他進程的mm。
也就是說,內核態線程的mm是NULL,active_mm是其他進程的mm。因此,當進行進程切換時,統一使用active_mm就可以了。由於所有進程共享1GB的內核態空間,因此不會切換異常。
struct mm_struct *mm; // user address space; NULL for kernel threads
struct mm_struct *active_mm; // address space in use; for kernel threads, borrowed from another task
2. 進程的建立
創建一個進程時,子進程會把父進程的一切資源複製過來。系統調用fork()或者clone()來創建一個進程。對於kernel來說,無論怎樣創建一個進程,最核心的都是調用_do_fork()。
在linux4.2 之前的版本,系統創建進程都是使用do_fork()。linux4.2之後的版本添加了對CLONE_SETTLS的支持。
代碼位置在kernel/fork.c中。
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
{
/* ... body elided; see kernel/fork.c ... */
……
return nr;
}
clone_flags指定子進程結束時,需要向父進程發送的信號,通常這個信號是SIGCHLD。clone_flags可選參數如下:
/*
 * cloning flags:
 * (OR'd together into the clone_flags argument of clone()/_do_fork())
 */
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
stack_start 子進程用戶態堆棧起始地址。通常設置爲0,父進程會複製自己的堆棧指針,當子進程對堆棧進行寫入時,缺頁中斷處理程序會設置新的物理頁面。
stack_size 和tls 都是在copy_thread_tls中使用,一般被設置爲0。
parent_tidptr 用戶態內存指針,當CLONE_PARENT_SETTID被設置時,內核會把新建立的子進程ID通過parent_tidptr返回。
child_tidptr 用戶態內存指針,當CLONE_CHILD_SETTID被設置時,內核會把新建立的子進程ID通過child_tidptr返回。
_do_fork工作流程
_do_fork的基本工作流程如下:
_do_fork(){
struct task_struct *p;
long nr;
//1. Allocate a task_struct for the child, copy the parent's resources, and initialize the child's own resources
p = copy_process
struct task_struct *p;
dup_task_struct// allocate a task_struct and kernel stack for the child, copy the parent's task_struct into it, and set up thread_info
struct task_struct *tsk;
node = tsk_fork_get_node(orig); // pick the node to allocate from
tsk = alloc_task_struct_node(node); // allocate the task_struct
stack = alloc_thread_stack_node(tsk, node); // allocate thread_info and the stack
stack_vm_area = task_stack_vm_area(tsk);// get the stack's vm_area
return tsk;
p->xxx = xxx // initialize the child's individual members
alloc_pid & pid_nr // allocate a pid
return p;
//2. Obtain the pid
pid = get_task_pid(p, PIDTYPE_PID);
get_pid
nr = pid_vnr(pid);
pid_nr_ns
//3. Put the child on the run queue, ready to execute
wake_up_new_task(p)
return nr;
}
_do_fork主要工作就是爲子進程分配task_struct結構,包括內存和堆棧信息;然後將父進程的資源複製給子進程;然後初始化子進程自己的成員和資源;最後再將子進程加入到進程隊列中,準備執行。
_do_fork函數的主要工作都是在copy_process中進行的。copy_process完成的工作主要包括:必要的檢查,複製必要的數據結構,初始化子進程等。其中,衆多copy_xxx 函數對應複製xxx資源。
copy_process代碼如下:
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
static __latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
struct kernel_clone_args *args)
{
struct task_struct *p;
// extensive sanity and permission checks
//……
// duplicate the parent's task_struct/stack for the child
p = dup_task_struct(current, node);
// numerous p->xxx = xxx; copy_xxx() calls that initialize the child's own members
//……
// allocate a fresh pid
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
}
}
//……
return p;
// numerous error-unwind branches
//……
ERR:
return ERR_PTR(retval);
}
0號進程的建立
上述fork過程,描述了一個父進程建立子進程的過程。但是第一個進程又是如何建立的呢?第一個進程是在內核啓動時直接建立的,一般稱爲0號進程,也就是swapper進程。INIT_TASK代碼位置在include/linux/init_task.h 中,其定義如下:
/*
 * INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
 *
 * Statically initializes every field of the first (pid 0 / swapper)
 * task; note mm = NULL (kernel thread) and active_mm = &init_mm,
 * and that it is its own parent and group leader.
 */
#define INIT_TASK(tsk) \
{ \
INIT_TASK_TI(tsk) \
.state = 0, \
.stack = init_stack, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.normal_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
.cpus_allowed = CPU_MASK_ALL, \
.nr_cpus_allowed= NR_CPUS, \
.mm = NULL, \
.active_mm = &init_mm, \
.restart_block = { \
.fn = do_no_restart_syscall, \
}, \
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
.rt = { \
.run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
.time_slice = RR_TIMESLICE, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
.parent = &tsk, \
.children = LIST_HEAD_INIT(tsk.children), \
.sibling = LIST_HEAD_INIT(tsk.sibling), \
.group_leader = &tsk, \
RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
RCU_POINTER_INITIALIZER(cred, &init_cred), \
.comm = INIT_TASK_COMM, \
.thread = INIT_THREAD, \
.fs = &init_fs, \
.files = &init_files, \
.signal = &init_signals, \
.sighand = &init_sighand, \
.nsproxy = &init_nsproxy, \
.pending = { \
.list = LIST_HEAD_INIT(tsk.pending.list), \
.signal = {{0}}}, \
.blocked = {{0}}, \
.alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
.journal_info = NULL, \
INIT_CPU_TIMERS(tsk) \
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \
.timer_slack_ns = 50000, /* 50 usec default slack */ \
.pids = { \
[PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \
[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \
[PIDTYPE_SID] = INIT_PID_LINK(PIDTYPE_SID), \
}, \
.thread_group = LIST_HEAD_INIT(tsk.thread_group), \
.thread_node = LIST_HEAD_INIT(init_signals.thread_head), \
INIT_IDS \
INIT_PERF_EVENTS(tsk) \
INIT_TRACE_IRQFLAGS \
INIT_LOCKDEP \
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_TASK_RCU_TASKS(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
INIT_PREV_CPUTIME(tsk) \
INIT_VTIME(tsk) \
INIT_NUMA_BALANCING(tsk) \
INIT_KASAN(tsk) \
INIT_LIVEPATCH(tsk) \
INIT_TASK_SECURITY \
}
init_task的各種資源都是通過INIT_TASK初始化的。在start_kernel中由rest_init函數調用kernel_thread函數,從而以0號進程爲模板建立了kernel_init進程。之後,kernel_init 會建立init進程。從而把啓動過程傳遞到用戶態。
// source: init/init_task.c
/* Initial task structure */
struct task_struct init_task = INIT_TASK(init_task);
EXPORT_SYMBOL(init_task);
// source: init/main.c
asmlinkage __visible void __init start_kernel(void)
{
//…… (boot initialization elided)
rest_init();
}
static noinline void __ref rest_init(void)
{
struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
/*
 * We need to spawn init first so that it obtains pid 1, however
 * the init task will end up wanting to create kthreads, which, if
 * we schedule it before we create kthreadd, will OOPS.
 */
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
/*
 * Pin init on the boot CPU. Task migration is not properly working
 * until sched_init_smp() has been run. It will set the allowed
 * CPUs for init to the non isolated CPUs.
 */
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
numa_default_policy();
// spawn kthreadd
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
/*
 * Enable might_sleep() and smp_processor_id() checks.
 * They cannot be enabled earlier because with CONFIG_PRREMPT=y
 * kernel_thread() would trigger might_sleep() splats. With
 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
 * already, but it's stuck on the kthreadd_done completion.
 */
system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);
/*
 * The boot idle thread must execute schedule()
 * at least once to get things moving:
 */
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
}
3. 進程切換
當前運行的進程可能主動或者被動的放棄CPU,此時就需要發生進程切換。進程切換的主要工作是在context_switch中完成的。
當前進程可能由於進程執行結束,主動放棄CPU;可能由於進程阻塞,主動放棄CPU;可能由於時間片耗盡,被動放棄CPU;可能由於更高優先級的進程搶佔CPU,被動放棄CPU等。
從context_switch源碼中可以看到,mm爲空的話,說明next進程是內核態線程,這時候,只能借用prev進程當前正在使用的那個地址空間(oldmm = prev->active_mm)。注意:這裏不能借用prev進程的地址空間(prev->mm),因爲prev進程也可能是一個內核態線程,mm也是NULL。
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
 * For paravirt, this is coupled with an exit in switch_to to
 * combine the page table reload and the switch backend into
 * one hypercall.
 */
arch_start_context_switch(prev);
if (!mm) {
/* next is a kernel thread (mm == NULL): borrow prev's active address space */
next->active_mm = oldmm;
mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
} else
switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
/* prev is a kernel thread: drop its borrowed mm; finish_task_switch picks it up via rq->prev_mm */
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
/*
 * Since the runqueue lock will be released by the next
 * task (which is an invalid locking op but in the case
 * of the scheduler it's an obvious special-case), so we
 * do an early lockdep release here:
 */
rq_unpin_lock(rq, rf);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
return finish_task_switch(prev);
}
在context_switch代碼中,前半部分代碼都是在準備合理的next和prev。真正執行上下文切換的是switch_to函數。
以arm64架構爲例,switch_to函數是一個內嵌彙編函數,source 位置在./arch/arm64/kernel/process.c。
/*
 * Thread switching.
 */
__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next)
{
struct task_struct *last;
/* switch next's per-thread hardware state: FP/SIMD, TLS, breakpoints, etc. */
fpsimd_thread_switch(next);
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
contextidr_thread_switch(next);
entry_task_switch(next);
uao_thread_switch(next);
/*
 * Complete any pending TLB or cache maintenance on this CPU in case
 * the thread migrates to a different CPU.
 * This full barrier is also required by the membarrier system
 * call.
 */
dsb(ish);
/* the actual thread switch */
last = cpu_switch_to(prev, next);
return last;
}
其中,cpu_switch_to函數是個彙編函數。代碼位置在 arch/arm64/kernel/entry.S
/*
 * Register switch for AArch64. The callee-saved registers need to be saved
 * and restored. On entry:
 * x0 = previous task_struct (must be preserved across the switch)
 * x1 = next task_struct
 * Previous and next are guaranteed not to be the same.
 *
 */
ENTRY(cpu_switch_to)
// x10 = THREAD_CPU_CONTEXT, the offset of cpu_context inside task_struct
mov x10, #THREAD_CPU_CONTEXT
// x8 = prev task_struct + THREAD_CPU_CONTEXT (prev's cpu_context)
add x8, x0, x10
// x9 = current stack pointer
mov x9, sp
// save prev's register context
stp x19, x20, [x8], #16 // store callee-saved registers
stp x21, x22, [x8], #16
stp x23, x24, [x8], #16
stp x25, x26, [x8], #16
stp x27, x28, [x8], #16
stp x29, x9, [x8], #16
str lr, [x8]
// switch to next's register context
add x8, x1, x10
ldp x19, x20, [x8], #16 // restore callee-saved registers
ldp x21, x22, [x8], #16
ldp x23, x24, [x8], #16
ldp x25, x26, [x8], #16
ldp x27, x28, [x8], #16
ldp x29, x9, [x8], #16
ldr lr, [x8]
// restore next's stack pointer
mov sp, x9
// stash next's task_struct in sp_el0: keeps it out of thread_info so a stack
// overflow cannot corrupt it (and removes the task member from thread_info)
msr sp_el0, x1
// return to __switch_to
ret
ENDPROC(cpu_switch_to)
NOKPROBE(cpu_switch_to)
4. 進程調試技巧
linux內核定義了一個全局變量current用來獲取當前進程的task_struct。由於不同體系結構對current的獲取不同,本文以arm64爲例。代碼如下:./arch/arm64/include/asm/current.h
static __always_inline struct task_struct *get_current(void)
{
unsigned long sp_el0;
/* sp_el0 holds the current task's task_struct pointer (stored there by cpu_switch_to) */
asm ("mrs %0, sp_el0" : "=r" (sp_el0));
return (struct task_struct *)sp_el0;
}
#define current get_current()
代碼非常簡潔,就是取sp_el0存的內容,就是當前進程的task_struct。
比較常用的log輸出:
// dump identifying fields of the current task
printk(KERN_INFO "name is %s; pid=%i, tgid=%i, mm=%p, active_mm=%p\n",
current->comm, current->pid,current->tgid,current->mm,current->active_mm);
輸出結果如下:
參考文獻
《獨闢蹊徑品內核:Linux內核源代碼導讀》