本文以ARM64爲例,介紹內核的Oops機制,我們使用grep搜索一下內核中可能會報Oops的地方:
./arch/arm64/kernel/sys_compat.c:142: arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, scno);
./arch/arm64/kernel/traps.c:771: die("Oops - bad mode", regs, 0);
./arch/arm64/kernel/traps.c:929: die("Oops - BUG", regs, 0);
./arch/arm64/mm/fault.c:270: die("Oops", regs, esr);
搜索結果如上所示,一共有這幾個地方定義爲Oops,因此Oops可能包含如下一些場景:
- 32bit 兼容(compat)系統調用發生了錯誤(bad compat syscall),報Oops
- CPU陷入了某種不正常的exception mode,在該exception對應的exception vector entry中直接報Oops
- traps中定義的BUG()函數被調用觸發了Oops
- 內核空間中發生了內存地址相關的訪問異常
本文着重從第4種情況來入手跟蹤Oops的發生過程:
在代碼文件 ./arch/arm64/mm/fault.c 中:
do_translation_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_alignment_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_page_fault --> __do_kernel_fault --> die_kernel_fault
調用路徑如上所示,當內核訪問一個內存地址發生錯誤時會分別調用 do_xxx_fault
該函數最終的目標是 die_kernel_fault:
/*
 * Report a kernel-mode memory fault that cannot be handled: print an
 * alert line with the faulting address, emit diagnostics, raise the Oops
 * through die(), and finally call do_exit(SIGKILL).
 *
 * @msg:  short fault description inserted into the alert line
 * @addr: faulting virtual address
 * @esr:  syndrome value passed to mem_abort_decode() and die()
 * @regs: register state passed on to die()
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
unsigned int esr, struct pt_regs *regs)
{
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
addr);
/* Extra diagnostics derived from the syndrome value and the address. */
mem_abort_decode(esr);
show_pte(addr);
/* Raise the Oops itself. */
die("Oops", regs, esr);
bust_spinlocks(0);
do_exit(SIGKILL);
}
這裏最終會調用 die("Oops", regs, esr) 函數:
/*
 * This function is protected against re-entrancy.
 *
 * Common Oops path: reports the failure through __die(), optionally hands
 * over to crash_kexec(), then either panics or terminates the current
 * task with SIGSEGV.
 *
 * @str:  message passed to __die() (e.g. "Oops")
 * @regs: register state at the time of the failure; may be NULL, in which
 *        case the crash_kexec() step is skipped
 * @err:  error value passed to __die()
 */
void die(const char *str, struct pt_regs *regs, int err)
{
int ret;
unsigned long flags;
/* die_lock is taken with interrupt state saved in flags. */
raw_spin_lock_irqsave(&die_lock, flags);
oops_enter();
console_verbose();
bust_spinlocks(1);
ret = __die(str, err, regs); // the notify_die notification is sent from here
/* Only attempt the kexec crash path when register state is available. */
if (regs && kexec_should_crash(current))
crash_kexec(regs);
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
oops_exit();
/* An Oops in interrupt context always escalates to a panic. */
if (in_interrupt())
panic("Fatal exception in interrupt");
if (panic_on_oops) // decide whether the Oops must escalate to a panic
panic("Fatal exception");
raw_spin_unlock_irqrestore(&die_lock, flags);
/* Unless __die() returned NOTIFY_STOP, exit the current task. */
if (ret != NOTIFY_STOP)
do_exit(SIGSEGV);
}
在die中可以看到,如果配置了panic_on_oops爲1,那麼會直接觸發panic操作;如果沒有配置爲1,則一般不會導致系統panic重啓(但若Oops發生在中斷上下文中,即in_interrupt()爲真,則無論panic_on_oops如何配置都會直接panic)。無論哪種情況,Oops都會打印內核調用棧。
一種手動觸發panic的機制
利用sysrq機制可以觸發kernel crash:
echo c > /proc/sysrq-trigger
這種方式就是利用Oops機制來觸發panic的:
/*
 * SysRq crash handler: forces panic_on_oops to 1 and then deliberately
 * writes through a NULL pointer so that the resulting fault ends in a
 * panic.
 *
 * @key: the SysRq key; not used in this function
 */
static void sysrq_handle_crash(int key)
{
char *killer = NULL;
/* we need to release the RCU read lock here,
* otherwise we get an annoying
* 'BUG: sleeping function called from invalid context'
* complaint from the kernel before the panic.
*/
rcu_read_unlock();
panic_on_oops = 1; /* force panic */ //-------- (1)
/* Write barrier between setting panic_on_oops and the faulting store. */
wmb();
/* killer is NULL, so this store is an intentional NULL-pointer write. */
*killer = 1; //---------------------------(2)
}
- 第(1)步先配置panic_on_oops爲1,使得當內核oops時直接觸發panic操作
- 第(2)步訪問一個內核NULL空地址,觸發oops操作
到這裏可能很多人會有一個疑惑,對一個內核空地址賦值,是如何產生了Oops呢?
查看arm64的異常向量表:
/*
* EL1 mode handlers.
*/
el1_da:
/*
 * Data abort handling
 *
 * Reads FAR_EL1, then calls do_mem_abort with the address in x0 and the
 * stack pointer (struct pt_regs) in x2.
 * NOTE(review): x1 is not written in this snippet; presumably it already
 * holds the ESR value when this label is reached - confirm.
 */
mrs x3, far_el1 // x3 = FAR_EL1
inherit_daif pstate=x23, tmp=x2
clear_address_tag x0, x3 // presumably x0 = x3 with the address tag cleared - confirm macro
mov x2, sp // struct pt_regs
bl do_mem_abort
kernel_exit 1
......
el0_da:
/*
 * Data abort handling
 *
 * Reads FAR_EL1, then calls do_mem_abort with x0 = address, x1 = x25 and
 * x2 = stack pointer, and finally branches to ret_to_user.
 * NOTE(review): x25 is not written in this snippet; presumably it holds
 * the ESR value saved earlier - confirm.
 */
mrs x26, far_el1 // x26 = FAR_EL1
enable_daif
ct_user_exit
clear_address_tag x0, x26 // presumably x0 = x26 with the address tag cleared - confirm macro
mov x1, x25 // second argument of do_mem_abort (esr)
mov x2, sp // third argument of do_mem_abort (regs)
bl do_mem_abort
b ret_to_user
其中el1_da和el0_da中都會調用到do_mem_abort。當CPU在運行時發生data abort異常,就會進入異常向量表中對應的入口,並執行到上面的處理代碼。
/*
 * Dispatch a memory abort to the handler selected from the fault_info
 * table by the syndrome value.
 *
 * @addr: faulting address
 * @esr:  syndrome value; selects the fault_info entry
 * @regs: register state at the time of the abort
 *
 * If the selected handler returns 0, nothing further is done. Otherwise
 * the fault is reported through arm64_notify_die() using the signal,
 * code and name stored in the selected entry.
 */
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
{
const struct fault_info *inf = esr_to_fault_info(esr);
struct siginfo info;
/* A zero return from the handler means there is nothing left to report. */
if (!inf->fn(addr, esr, regs))
return;
/* Extra diagnostics are printed only when the fault is not from user mode. */
if (!user_mode(regs)) {
pr_alert("Unhandled fault at 0x%016lx\n", addr);
mem_abort_decode(esr);
show_pte(addr);
}
/* Build the siginfo from the table entry and the faulting address. */
clear_siginfo(&info);
info.si_signo = inf->sig;
info.si_errno = 0;
info.si_code = inf->code;
info.si_addr = (void __user *)addr;
arm64_notify_die(inf->name, regs, &info, esr);
}
其中對應一個系統錯誤處理列表:
/*
 * Look up the fault_info entry for a syndrome value.
 *
 * Only the low six bits of esr (esr & 63) are used as the index into the
 * fault_info table.
 */
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
	const unsigned int idx = esr & 63;

	return &fault_info[idx];
}
static const struct fault_info fault_info[] = {
{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" },
{ do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 8" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 12" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" },
{ do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" },
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented
{ do_bad, SIGKILL, SI_KERNEL, "unknown 25" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 26" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 27" },
{ do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented
{ do_bad, SIGKILL, SI_KERNEL, "unknown 32" },
{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 34" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 35" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 36" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 37" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 38" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 39" },
{ do_bad, SIGKILL, SI_KERNEL, "unknown 40" },
......
經過這一系列的調用,最終內核會運行對應的錯誤處理函數。