Linux kernel initialization

The Linux kernel boots from the entry function start_kernel, defined in init/main.c; start_kernel is effectively the kernel's main function. Inside it, a large number of xxx_init functions are called.
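
An abridged sketch of start_kernel (based on a 4.x-era init/main.c; the exact contents differ between kernel versions) shows the overall flow that this article walks through:

asmlinkage __visible void __init start_kernel(void)
{
	set_task_stack_end_magic(&init_task);	/* protect the kernel stack of process 0 */
	/* ... */
	trap_init();		/* install interrupt/trap gates */
	mm_init();		/* memory management */
	sched_init();		/* scheduler, per-CPU run queues */
	/* ... */
	vfs_caches_init();	/* dcache/inode caches and rootfs */
	/* ... */
	rest_init();		/* creates process 1 and process 2 */
}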

Creating process 0

set_task_stack_end_magic(&init_task);
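
set_task_stack_end_magic writes a magic value (STACK_END_MAGIC) at the far end of process 0's kernel stack so that a later stack overflow can be detected. In kernel/fork.c it looks roughly like this:

void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *stackend;

	stackend = end_of_stack(tsk);
	*stackend = STACK_END_MAGIC;	/* for overflow detection */
}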

init_task is the first process created by the system, also known as process 0. It is the only process that is not created via fork or kernel_thread, and it is the first entry in the process list. It is defined as follows:

struct task_struct init_task = {
	.state		= 0,
	.stack		= init_stack,
	.usage		= ATOMIC_INIT(2),
	.flags		= PF_KTHREAD,
	.prio		= MAX_PRIO - 20,
	.static_prio	= MAX_PRIO - 20,
	.normal_prio	= MAX_PRIO - 20,
	.policy		= SCHED_NORMAL,
	.cpus_allowed	= CPU_MASK_ALL,
	.nr_cpus_allowed= NR_CPUS,
	.mm		= NULL,
	.active_mm	= &init_mm,
	.restart_block	= {
		.fn = do_no_restart_syscall,
	},
	.se		= {
		.group_node 	= LIST_HEAD_INIT(init_task.se.group_node),
	},
	.rt		= {
		.run_list	= LIST_HEAD_INIT(init_task.rt.run_list),
		.time_slice	= RR_TIMESLICE,
	},
	.tasks		= LIST_HEAD_INIT(init_task.tasks),
	.ptraced	= LIST_HEAD_INIT(init_task.ptraced),
	.ptrace_entry	= LIST_HEAD_INIT(init_task.ptrace_entry),
	.real_parent	= &init_task,
	.parent		= &init_task,
	.children	= LIST_HEAD_INIT(init_task.children),
	.sibling	= LIST_HEAD_INIT(init_task.sibling),
	.group_leader	= &init_task,
	RCU_POINTER_INITIALIZER(real_cred, &init_cred),
	RCU_POINTER_INITIALIZER(cred, &init_cred),
	.comm		= INIT_TASK_COMM,
	.thread		= INIT_THREAD,
	.fs		= &init_fs,
	.files		= &init_files,
	.signal		= &init_signals,
	.sighand	= &init_sighand,
	.nsproxy	= &init_nsproxy,
	.pending	= {
		.list = LIST_HEAD_INIT(init_task.pending.list),
		.signal = {{0}}
	},
	.blocked	= {{0}},
	.alloc_lock	= __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
	.journal_info	= NULL,
	INIT_CPU_TIMERS(init_task)
	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
	.timer_slack_ns = 50000, /* 50 usec default slack */
	.thread_pid	= &init_struct_pid,
	.thread_group	= LIST_HEAD_INIT(init_task.thread_group),
	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_PREEMPT_RCU
	.rcu_read_lock_nesting = 0,
	.rcu_read_unlock_special.s = 0,
	.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
	.rcu_blocked_node = NULL,
#endif
	INIT_PREV_CPUTIME(init_task)
};

Interrupt gates

The trap_init function installs a number of interrupt gates used to handle various exceptions and interrupts. Among them, SYSG(IA32_SYSCALL_VECTOR, entry_INT80_32) is the system call gate: a user-space process enters the kernel for a system call by raising this interrupt.
The call chain is as follows:

trap_init
		--	idt_setup_traps();
				-	idt_setup_from_table(idt_table, def_idts, ARRAY_SIZE(def_idts), true);
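
idt_setup_from_table walks the table, converts each idt_data entry into a gate descriptor and writes it into the IDT. In arch/x86/kernel/idt.c of this kernel generation it looks roughly like this:

static void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
	gate_desc desc;

	for (; size > 0; t++, size--) {
		idt_init_desc(&desc, t);
		write_idt_entry(idt, t->vector, &desc);
		if (sys)
			set_bit(t->vector, system_vectors);
	}
}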

def_idts is defined as follows; this is where the individual interrupt gates are declared.

static const __initconst struct idt_data def_idts[] = {
	INTG(X86_TRAP_DE,		divide_error),
	INTG(X86_TRAP_NMI,		nmi),
	INTG(X86_TRAP_BR,		bounds),
	INTG(X86_TRAP_UD,		invalid_op),
	INTG(X86_TRAP_NM,		device_not_available),
	INTG(X86_TRAP_OLD_MF,		coprocessor_segment_overrun),
	INTG(X86_TRAP_TS,		invalid_TSS),
	INTG(X86_TRAP_NP,		segment_not_present),
	INTG(X86_TRAP_SS,		stack_segment),
	INTG(X86_TRAP_GP,		general_protection),
	INTG(X86_TRAP_SPURIOUS,		spurious_interrupt_bug),
	INTG(X86_TRAP_MF,		coprocessor_error),
	INTG(X86_TRAP_AC,		alignment_check),
	INTG(X86_TRAP_XF,		simd_coprocessor_error),

#ifdef CONFIG_X86_32
	TSKG(X86_TRAP_DF,		GDT_ENTRY_DOUBLEFAULT_TSS),
#else
	INTG(X86_TRAP_DF,		double_fault),
#endif
	INTG(X86_TRAP_DB,		debug),

#ifdef CONFIG_X86_MCE
	INTG(X86_TRAP_MC,		&machine_check),
#endif

	SYSG(X86_TRAP_OF,		overflow),
#if defined(CONFIG_IA32_EMULATION)
	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_compat),
#elif defined(CONFIG_X86_32)
	SYSG(IA32_SYSCALL_VECTOR,	entry_INT80_32),
#endif
};
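
The difference between the INTG and SYSG entries is the descriptor privilege level: both build an interrupt gate, but SYSG uses DPL 3, so the gate (for example the int 0x80 system call vector) can be invoked from user mode. The macros in idt.c look roughly like this:

#define G(_vector, _addr, _ist, _type, _dpl, _segment)	\
	{						\
		.vector		= _vector,		\
		.bits.ist	= _ist,			\
		.bits.type	= _type,		\
		.bits.dpl	= _dpl,			\
		.bits.p		= 1,			\
		.addr		= _addr,		\
		.segment	= _segment,		\
	}

/* Interrupt gate, kernel-only (DPL 0) */
#define INTG(_vector, _addr)				\
	G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL0, __KERNEL_CS)

/* System interrupt gate, reachable from user mode (DPL 3) */
#define SYSG(_vector, _addr)				\
	G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)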

The mm_init function initializes the memory management subsystem.
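
mm_init itself is a small wrapper in init/main.c; abridged (the exact call list varies by version), it brings up the buddy allocator, the slab allocator and the vmalloc area in turn:

static void __init mm_init(void)
{
	/* ... page_ext setup elided ... */
	mem_init();		/* hand free memory over to the buddy allocator */
	kmem_cache_init();	/* slab allocator */
	pgtable_init();
	vmalloc_init();		/* vmalloc area */
	ioremap_huge_init();
	/* ... x86 espfix/PTI setup elided ... */
}
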
The sched_init function initializes the scheduler. Its core code is as follows:

	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->lock);
		rq->nr_running = 0;
		rq->calc_load_active = 0;
		rq->calc_load_update = jiffies + LOAD_FREQ;
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt);
		init_dl_rq(&rq->dl);
		hrtick_rq_init(rq);
		atomic_set(&rq->nr_iowait, 0);
	}

The code above first obtains the run queue (rq) of every possible CPU, initializes the run queue's spinlock, and then initializes rq's sub-queues cfs, rt and dl one by one.
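
As an example of that sub-queue initialization, init_cfs_rq in kernel/sched/fair.c just resets the CFS red-black tree and min_vruntime; roughly:

void init_cfs_rq(struct cfs_rq *cfs_rq)
{
	cfs_rq->tasks_timeline = RB_ROOT_CACHED;	/* empty rbtree of runnable entities */
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
#ifndef CONFIG_64BIT
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
	raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
}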

File system initialization

vfs_caches_init() initializes the memory-based file system rootfs. It calls mnt_init() -> init_rootfs() in turn.
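
For context, vfs_caches_init (fs/dcache.c) looks roughly like this in abridged form; the mnt_init call near the end is what eventually reaches init_rootfs:

void __init vfs_caches_init(void)
{
	/* ... names_cache slab setup elided ... */
	dcache_init();
	inode_init();
	files_init();
	files_maxfiles_init();
	mnt_init();		/* -> init_rootfs(), then mounts rootfs */
	bdev_cache_init();
	chrdev_init();
}

init_rootfs itself is implemented as follows: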

int __init init_rootfs(void)
{
	int err = register_filesystem(&rootfs_fs_type);
	if (err)
		return err;

	if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] &&
		(!root_fs_names || strstr(root_fs_names, "tmpfs"))) {
		err = shmem_init();
		is_tmpfs = true;
	} else {
		err = init_ramfs_fs();
	}
	if (err)
		unregister_filesystem(&rootfs_fs_type);

	return err;
}

The register_filesystem call above registers a new file system type, rootfs_fs_type, with the Linux virtual file system.
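
rootfs_fs_type itself (fs/namespace.c) is a very small file_system_type; in this kernel generation it is roughly:

static struct file_system_type rootfs_fs_type = {
	.name		= "rootfs",
	.mount		= rootfs_mount,
	.kill_sb	= kill_litter_super,
};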

Finally, start_kernel calls rest_init to perform the remaining initialization work.

Creating process 1

The first thing rest_init does is call kernel_thread to create the second process in the system, i.e. process 1.

	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);

Process 1 is the first user process in the system. When it is created it is still running in kernel mode, so how does it switch to user mode?
The first argument to kernel_thread is kernel_init, so the newly created process runs the kernel_init function. It is implemented as follows:

static int __ref kernel_init(void *unused)
{
	int ret;
	kernel_init_freeable();
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full();
	ftrace_free_init_mem();
	jump_label_invalidate_initmem();
	free_initmem();
	mark_readonly();
	/*
	 * Kernel mappings are now finalized - update the userspace page-table
	 * to finalize PTI.
	 */
	pti_finalize();
	system_state = SYSTEM_RUNNING;
	numa_default_policy();
	rcu_end_inkernel_boot();
	if (ramdisk_execute_command) {
		ret = run_init_process(ramdisk_execute_command);
		if (!ret)
			return 0;
		pr_err("Failed to execute %s (error %d)\n",
		       ramdisk_execute_command, ret);
	}
	/*
	 * We try each of these until one succeeds.
	 *
	 * The Bourne shell can be used instead of init if we are
	 * trying to recover a really broken machine.
	 */
	if (execute_command) {
		ret = run_init_process(execute_command);
		if (!ret)
			return 0;
		panic("Requested init %s failed (error %d).",
		      execute_command, ret);
	}
	if (!try_to_run_init_process("/sbin/init") ||
	    !try_to_run_init_process("/etc/init") ||
	    !try_to_run_init_process("/bin/init") ||
	    !try_to_run_init_process("/bin/sh"))
		return 0;
	panic("No working init found.  Try passing init= option to kernel. "
	      "See Linux Documentation/admin-guide/init.rst for guidance.");
}

Inside kernel_init_freeable there is the following code:

	if (!ramdisk_execute_command)
		ramdisk_execute_command = "/init";

which sets ramdisk_execute_command to "/init".
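
Right after that, kernel_init_freeable also checks whether this /init is actually accessible on the initial ramdisk; if not, ramdisk_execute_command is cleared again and prepare_namespace() mounts the real root file system. Abridged (in this kernel generation the check uses ksys_access; older kernels use sys_access):

	if (ksys_access((const char __user *)
			ramdisk_execute_command, 0) != 0) {
		ramdisk_execute_command = NULL;
		prepare_namespace();	/* mount the real root fs */
	}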

Further down, kernel_init contains the following code:

	if (ramdisk_execute_command) {
		ret = run_init_process(ramdisk_execute_command);
		if (!ret)
			return 0;
		pr_err("Failed to execute %s (error %d)\n",
		       ramdisk_execute_command, ret);
	}

	if (execute_command) {
		ret = run_init_process(execute_command);
		if (!ret)
			return 0;
		panic("Requested init %s failed (error %d).",
		      execute_command, ret);
	}
	if (!try_to_run_init_process("/sbin/init") ||
	    !try_to_run_init_process("/etc/init") ||
	    !try_to_run_init_process("/bin/init") ||
	    !try_to_run_init_process("/bin/sh"))
		return 0;

Since ramdisk_execute_command is not empty at this point, run_init_process is executed. Internally it calls do_execve, which is the kernel-side implementation of the execve system call and whose job is to execute a program file. So at this point the kernel tries to run the init program from the ramdisk, or /sbin/init, /etc/init, /bin/init or /bin/sh from a regular file system.
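
run_init_process is a thin wrapper around do_execve in init/main.c:

static int run_init_process(const char *init_filename)
{
	argv_init[0] = init_filename;
	return do_execve(getname_kernel(init_filename),
		(const char __user *const __user *)argv_init,
		(const char __user *const __user *)envp_init);
}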

Creating process 2

The second thing rest_init does is call kernel_thread to create the third process in the system, i.e. process 2.

pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);

kernel_thread is used once again to create a process. The kthreadd created here is responsible for scheduling and managing all kernel threads; it is the ancestor of every kernel-mode thread. The core of kthreadd is as follows:

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (list_empty(&kthread_create_list))
			schedule();
		__set_current_state(TASK_RUNNING);

		spin_lock(&kthread_create_lock);
		while (!list_empty(&kthread_create_list)) {
			struct kthread_create_info *create;

			create = list_entry(kthread_create_list.next,
					    struct kthread_create_info, list);
			list_del_init(&create->list);
			spin_unlock(&kthread_create_lock);

			create_kthread(create);

			spin_lock(&kthread_create_lock);
		}
		spin_unlock(&kthread_create_lock);
	}

The core of this function is a for loop containing a while loop. The for loop first sets the thread state to interruptible sleep and then checks whether the kthread creation list is empty; if it is, it calls schedule() once to give up the CPU. Once the list becomes non-empty, the while loop is entered, which eventually reaches the create_kthread function:

create_kthread
		--	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
				--	_do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
						(unsigned long)arg, NULL, NULL, 0);

which ultimately calls _do_fork to complete the creation of the kernel thread.
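
kernel_thread itself (kernel/fork.c) simply forwards the function pointer and argument to _do_fork, the same code path used by fork and clone from user space:

pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
		(unsigned long)arg, NULL, NULL, 0);
}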
