Linux 內核分析——【實驗五:系統調用運行機制】
前文提到系統調用是通過int 0x80來產生的,所以從本質上來說它是一種中斷。那麼什麼是中斷呢?中斷被定義爲一個事件,該事件改變了處理器執行的指令順序。在Linux系統下設置了256個中斷,每個中斷由0~255之間的數來標識,系統調用對應的就是0x80。
首先,我們總結一下系統調用的執行過程:
1、程序調用libc庫中封裝的系統調用函數。
2、調用中斷int 0x80 陷入內核。
3、在內核中執行system_call函數(實際上是一段彙編代碼),將系統調用號(eax)和所有可能用到的相關寄存器保存到內核堆棧中(寄存器的保存由SAVE_ALL完成),然後根據系統調用號在系統調用表中查找到對應的系統調用服務例程。
4、執行該服務例程。
5、執行完畢後,轉入syscall_exit例程(早期內核中名爲ret_from_sys_call),從系統調用返回。
接着,我們通過一個簡單的例子,來了解系統調用函數在進入system_call到iret的執行過程。代碼如下:
運行qemu並使用gdb進行調試,步驟如下:
將斷點設在system_call位置,可以看到system_call在一個entry_32.S的彙編文件中。繼續執行,但gdb並沒有在system_call的位置停止,而是直接運行結束了。
下面,我們直接打開entry_32.S文件,找到system_call的位置進行分析:
# system call handler stub
ENTRY(system_call) #系統調用處理入口(內核態)
RING0_INT_FRAME # can't unwind into user space anyway
ASM_CLAC
pushl_cfi %eax # save orig_eax #保存eax,也就是調用號
SAVE_ALL # 保存寄存器
GET_THREAD_INFO(%ebp) # 將當前進程thread_info結構的地址存入ebp
# system call tracing in operation / emulation
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) # 檢測是否由系統跟蹤
jnz syscall_trace_entry # 有系統跟蹤則先去執行
cmpl $(NR_syscalls), %eax # 比較輸入的系統調用號 是否大於或等於 系統調用總數NR_syscalls
jae syscall_badsys # 大於或等於 則無效,退出
syscall_call:
call *sys_call_table(,%eax,4) # 在系統調用表中調用相應的服務例程,eax爲調用號,表中每項佔4字節,故偏移量爲eax乘以4
syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value # 保存返回值
syscall_exit:
LOCKDEP_SYS_EXIT # 用於調試,只有開啓調試後纔會檢測系統調用深度
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF # 關閉中斷跟蹤
movl TI_flags(%ebp), %ecx # 檢測是否還有其他任務
testl $_TIF_ALLWORK_MASK, %ecx # current->work
jne syscall_exit_work
syscall_exit_work:
testl $_TIF_WORK_SYSCALL_EXIT, %ecx
jz work_pending # 測試退出前是否有系統調用跟蹤相關的工作;如果沒有,則跳轉到work_pending處理其餘待處理的工作
TRACE_IRQS_ON # 開啓系統中斷跟蹤
ENABLE_INTERRUPTS(CLBR_ANY) # 允許中斷 # could let syscall_trace_leave() call
# schedule() instead
movl %esp, %eax
call syscall_trace_leave
jmp resume_userspace # 恢復用戶空間
END(syscall_exit_work)
work_pending:
testb $_TIF_NEED_RESCHED, %cl # 檢測是否需要重新調度
jz work_notifysig # 不需要重新調度,則跳轉到處理信號相關的代碼處
work_resched:
call schedule # 進程調度, 進程調度的時機在這裏處理
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # 是否還有其他工作要處理 # is there any work to be done other
# than syscall tracing?
jz restore_all #如果沒有的話就恢復中斷上下文,也就是恢復進入之前保存的寄存器相關內容
testb $_TIF_NEED_RESCHED, %cl
jnz work_resched
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
restore_all:
TRACE_IRQS_IRET # 恢復中斷跟蹤
restore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
# See comments in process.c:copy_thread() for details.
movb PT_OLDSS(%esp), %ah
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
INTERRUPT_RETURN
系統調用函數的運行流程如下:
注意:系統調用的初始化工作,是在內核啓動之初由/linux-3.18.6/init/main.c文件中的start_kernel()發起的,調用鏈如下:
start_kernel() —> trap_init() —> set_system_trap_gate(SYSCALL_VECTOR, &system_call);
附錄:
查看系統調用號:linux-3.18.6/arch/x86/syscalls/syscall_32.tbl
# 32-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point> <compat entry point>
#
# The abi is always "i386" for this file.
#
0 i386 restart_syscall sys_restart_syscall
1 i386 exit sys_exit
2 i386 fork sys_fork stub32_fork
3 i386 read sys_read
4 i386 write sys_write
5 i386 open sys_open compat_sys_open
6 i386 close sys_close
7 i386 waitpid sys_waitpid sys32_waitpid
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
11 i386 execve sys_execve stub32_execve
12 i386 chdir sys_chdir
13 i386 time sys_time compat_sys_time
14 i386 mknod sys_mknod
15 i386 chmod sys_chmod
16 i386 lchown sys_lchown16
17 i386 break
18 i386 oldstat sys_stat
19 i386 lseek sys_lseek compat_sys_lseek
20 i386 getpid sys_getpid
21 i386 mount sys_mount compat_sys_mount
22 i386 umount sys_oldumount
23 i386 setuid sys_setuid16
24 i386 getuid sys_getuid16
25 i386 stime sys_stime compat_sys_stime
26 i386 ptrace sys_ptrace compat_sys_ptrace
27 i386 alarm sys_alarm
28 i386 oldfstat sys_fstat
29 i386 pause sys_pause
30 i386 utime sys_utime compat_sys_utime
31 i386 stty
32 i386 gtty
33 i386 access sys_access
34 i386 nice sys_nice
35 i386 ftime
36 i386 sync sys_sync
37 i386 kill sys_kill
38 i386 rename sys_rename
39 i386 mkdir sys_mkdir
40 i386 rmdir sys_rmdir
41 i386 dup sys_dup
42 i386 pipe sys_pipe
43 i386 times sys_times compat_sys_times
44 i386 prof
45 i386 brk sys_brk
46 i386 setgid sys_setgid16
47 i386 getgid sys_getgid16
48 i386 signal sys_signal
49 i386 geteuid sys_geteuid16
50 i386 getegid sys_getegid16
51 i386 acct sys_acct
52 i386 umount2 sys_umount
53 i386 lock
54 i386 ioctl sys_ioctl compat_sys_ioctl
55 i386 fcntl sys_fcntl compat_sys_fcntl64
56 i386 mpx
57 i386 setpgid sys_setpgid
58 i386 ulimit
59 i386 oldolduname sys_olduname
60 i386 umask sys_umask
61 i386 chroot sys_chroot
62 i386 ustat sys_ustat compat_sys_ustat
63 i386 dup2 sys_dup2
64 i386 getppid sys_getppid
65 i386 getpgrp sys_getpgrp
66 i386 setsid sys_setsid
67 i386 sigaction
... ...
=========== 王傑 原創作品轉載請註明出處==============
《Linux內核分析》MOOC課程http://mooc.study.163.com/course/USTC-1000029000