Linux內核的啓動從入口函數start_kernel開始,在init/main.c中,start_kernel就相當於內核的main函數。該函數中調用了很多的xxx_init函數。
創建0號進程
set_task_stack_end_magic(&init_task);
init_task是系統創建的第一個進程,也稱爲0號進程,這也是唯一一個沒有使用fork或者kernel_thread產生的進程,是進程列表的第一個,它的定義如下:
struct task_struct init_task{
.state = 0,
.stack = init_stack,
.usage = ATOMIC_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_allowed = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
INIT_PREV_CPUTIME(init_task)
}
中斷門
函數trap_init裏面設置了很多的中斷門,用於處理各種中斷,其中的 SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),就是系統調用的中斷門,用戶態進程調用系統調用也是通過發送中斷的方式進行的。
函數調用關係如下
trap_init
-- idt_setup_traps();
- idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
其中def_idts的詳細定義如下,這裏面就定義了各種中斷門。
/*
 * Default IDT entries. This table is handed to idt_setup_from_table()
 * from idt_setup_traps(), which trap_init() calls. Each entry pairs a
 * vector number with its handler (or, for TSKG, a GDT entry).
 */
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, divide_error),
INTG(X86_TRAP_NMI, nmi),
INTG(X86_TRAP_BR, bounds),
INTG(X86_TRAP_UD, invalid_op),
INTG(X86_TRAP_NM, device_not_available),
INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
INTG(X86_TRAP_TS, invalid_TSS),
INTG(X86_TRAP_NP, segment_not_present),
INTG(X86_TRAP_SS, stack_segment),
INTG(X86_TRAP_GP, general_protection),
INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
INTG(X86_TRAP_MF, coprocessor_error),
INTG(X86_TRAP_AC, alignment_check),
INTG(X86_TRAP_XF, simd_coprocessor_error),
/* Double fault: a TSKG entry on 32-bit builds, an INTG entry otherwise. */
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
INTG(X86_TRAP_DF, double_fault),
#endif
INTG(X86_TRAP_DB, debug),
/* Machine-check entry is only present when CONFIG_X86_MCE is set. */
#ifdef CONFIG_X86_MCE
INTG(X86_TRAP_MC, &machine_check),
#endif
SYSG(X86_TRAP_OF, overflow),
/*
 * System-call gate on IA32_SYSCALL_VECTOR: the compat entry point when
 * IA32 emulation is configured, the native 32-bit entry point on
 * CONFIG_X86_32; neither is installed otherwise.
 */
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32),
#endif
};
mm_init函數用於初始化內存管理模塊。
sched_init函數用於初始化內核調度模塊。主要代碼如下:
/* Excerpt from sched_init(): set up the run queue of every possible CPU. */
for_each_possible_cpu(i){
struct rq *rq;
/* Look up the run queue that belongs to CPU i. */
rq = cpu_rq(i);
/* Initialise the run queue's own lock. */
raw_spin_lock_init(&rq->lock);
/* Start with no running tasks and no active load. */
rq->nr_running = 0;
rq->calc_load_active = 0;
/* First load update is due LOAD_FREQ jiffies from now. */
rq->calc_load_update = jiffies + LOAD_FREQ;
/* Initialise the three embedded sub-queues: cfs, rt and dl. */
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
}
上面的代碼中首先取到每個CPU上的進程運行隊列,初始化每一個運行隊列的自旋鎖,然後依次初始化rq的子隊列cfs_rq、rt_rq和dl_rq等。
文件系統初始化
vfs_caches_init();用來初始化基於內存的文件系統rootfs,該函數會依次調用mnt_init()->init_rootfs().
/*
 * init_rootfs - register the rootfs filesystem type and initialise the
 * in-memory filesystem behind it.
 *
 * Returns 0 on success. If register_filesystem() fails its error is
 * returned directly; if the backing filesystem fails to initialise,
 * rootfs_fs_type is unregistered again and that error is returned.
 */
int __init init_rootfs(void)
{
int err = register_filesystem(&rootfs_fs_type);
if (err)
return err;
/*
 * Use shmem (tmpfs) only when CONFIG_TMPFS is enabled, no root name has
 * been saved, and root_fs_names is either unset or mentions "tmpfs".
 * Otherwise fall back to ramfs.
 */
if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
err = shmem_init();
/* Set regardless of whether shmem_init() succeeded. */
is_tmpfs = true;
} else {
err = init_ramfs_fs();
}
/* Undo the registration if the backing filesystem could not be set up. */
if (err)
unregister_filesystem(&rootfs_fs_type);
return err;
}
上面的register_filesystem在linux的虛擬文件系統裏面註冊了一種文件系統類型爲rootfs_fs_type。
最後start_kernel調用了rest_init用來做其他方面的初始化工作。
創建1號進程
rest_init的第一個工作就是調用kernel_thread創建第二個進程也就是1號進程。
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
1號進程是系統運行的第一個用戶進程,1號進程創建的時候還在內核態,那麼又是怎麼切換到用戶態的呢?
kernel_thread的第一個參數是kernel_init,進程創建後就會運行這個kernel_init函數。函數具體實現如下
/*
 * kernel_init - thread function passed to kernel_thread() for process 1.
 *
 * Finishes the remaining initialisation, releases init memory, marks the
 * system as running and then tries to start an init program.
 *
 * @unused: not referenced by the function body.
 *
 * Returns 0 once an init program has been started successfully. If no
 * candidate can be started the function panics instead of returning.
 */
static int __ref kernel_init(void *unused)
{
int ret;
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
ftrace_free_init_mem();
jump_label_invalidate_initmem();
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
rcu_end_inkernel_boot();
/*
 * First choice: the ramdisk init command. A failure here is only logged,
 * and the remaining candidates below are still tried.
 */
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
/* An explicitly requested init is fatal if it cannot be run. */
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
/* Built-in fallbacks, tried in order; the first one that returns 0 wins. */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
其中在kernel_init_freeable函數有如下的調用代碼
/* Excerpt from kernel_init_freeable(): default to "/init" when no ramdisk init command is set. */
if (!ramdisk_execute_command)
ramdisk_execute_command = "/init";
這裏將ramdisk_execute_command設置爲"/init"
然後kernel_init後面還有下面的代碼
/*
 * Excerpt from kernel_init(): the order in which init candidates are tried.
 * 1) the ramdisk init command - a failure is logged and the search continues.
 */
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/* 2) the explicitly requested init command - a failure here panics. */
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
/* 3) the built-in fallback paths, in order; stop at the first that returns 0. */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
此時ramdisk_execute_command不爲空,程序將執行run_init_process函數,在該函數內部調用了do_execve函數,這裏就是系統調用execve函數的內核實現,它的作用就是運行一個文件,因此程序運行到這裏就會嘗試運行ramdisk的init進程或者是普通文件系統上的/sbin/init、"/etc/init"、"/bin/init"、"/bin/sh"。
創建2號進程
rest_init的第二個工作就是調用kernel_thread創建第三個進程 即2號進程。
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
這裏又一次使用了kernel_thread函數創建進程,這裏創建的kthreadd負責所有內核態的線程調度和管理,是內核態所有線程的祖先。kthreadd的主要代碼如下
For(;;){
set_current_state(TASK_INTERRUPTIBLE);
if (list_empty(&kthread_create_list))
schedule();
__set_current_state(TASK_RUNNING);
spin_lock(&kthread_create_lock);
while (!list_empty(&kthread_create_list)) {
struct kthread_create_info *create;
create = list_entry(kthread_create_list.next,
struct kthread_create_info, list);
list_del_init(&create->list);
spin_unlock(&kthread_create_lock);
create_kthread(create);
spin_lock(&kthread_create_lock);
}
spin_unlock(&kthread_create_lock);
}
該函數的核心就是一個for循環和一個while循環,for循環中首先將線程狀態設置爲可中斷的睡眠狀態,然後判斷線程鏈表是否爲空,如果爲空則執行一次線程調度讓出CPU。如果線程鏈表不爲空則進入while循環,最終會進入到create_kthread函數
create_kthread
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
- _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL, 0);
最終會調用到_do_fork函數完成線程創建。