Linux kernel oops

本文以ARM64爲例,介紹內核的Oops機制,我們使用grep搜索一下內核中可能會報Oops的地方:

./arch/arm64/kernel/sys_compat.c:142:	arm64_notify_die("Oops - bad compat syscall(2)", regs, &info, scno);
./arch/arm64/kernel/traps.c:771:	die("Oops - bad mode", regs, 0);
./arch/arm64/kernel/traps.c:929:		die("Oops - BUG", regs, 0);
./arch/arm64/mm/fault.c:270:	die("Oops", regs, esr);

搜索結果如上所示,一共有這幾個地方定義爲Oops,因此Oops可能包含如下一些場景:

  1. 32bit 兼容(compat)系統調用發生了錯誤,報Oops
  2. CPU陷入了某種不正常的exception mode,在該exception對應的exception vector entry中直接報Oops
  3. 內核中的BUG()被觸發,由traps.c中對應的處理代碼報Oops
  4. 內核空間中發生了內存地址相關的訪問異常

本文着重從第4種情況來入手跟蹤Oops的發生過程:

在代碼文件 ./arch/arm64/mm/fault.c 中:

do_translation_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_alignment_fault --> do_bad_area --> __do_kernel_fault --> die_kernel_fault
do_page_fault --> __do_kernel_fault --> die_kernel_fault

調用路徑如上所示,當內核訪問一個內存地址發生錯誤時,會根據錯誤類型分別調用對應的 do_xxx_fault 函數,這些函數最終都會走到 die_kernel_fault:

/*
 * Report a kernel memory fault that cannot be handled and raise an Oops.
 *
 * @msg:  short description of the fault, inserted into the alert line
 * @addr: faulting virtual address (printed and passed to show_pte())
 * @esr:  syndrome value, passed to mem_abort_decode() and die()
 * @regs: register state, passed to die()
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
                 unsigned int esr, struct pt_regs *regs)
{
    bust_spinlocks(1);

    pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
         addr);

    mem_abort_decode(esr);

    show_pte(addr);
    /* Hand over to the common die() path with the "Oops" tag. */
    die("Oops", regs, esr);
    /* Reached only if die() returns. */
    bust_spinlocks(0);
    do_exit(SIGKILL);
}

這裏最終會調用 die("Oops", regs, esr) 函數:

/*
 * This function is protected against re-entrancy.
 *
 * Common Oops path: report the failure through __die(), optionally hand
 * over to kexec crash handling, then either panic or exit the current task.
 *
 * @str:  message tag, forwarded to __die()
 * @regs: register state; checked for NULL before crash_kexec() is called
 * @err:  error code, forwarded to __die()
 */
void die(const char *str, struct pt_regs *regs, int err)
{
    int ret;
    unsigned long flags;

    /* die_lock is held (with IRQ state saved) for the whole report. */
    raw_spin_lock_irqsave(&die_lock, flags);

    oops_enter();

    console_verbose();
    bust_spinlocks(1);
    ret = __die(str, err, regs); // the notify_die notification is sent from inside this call

    if (regs && kexec_should_crash(current))
        crash_kexec(regs);

    bust_spinlocks(0);
    add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
    oops_exit();

    /* An Oops in interrupt context panics regardless of panic_on_oops. */
    if (in_interrupt())
        panic("Fatal exception in interrupt");
    if (panic_on_oops)     // decide whether to escalate the Oops to a panic
        panic("Fatal exception");

    raw_spin_unlock_irqrestore(&die_lock, flags);

    /* do_exit() is skipped only when __die() returned NOTIFY_STOP. */
    if (ret != NOTIFY_STOP)
        do_exit(SIGSEGV);
}

在die中可以看到,如果Oops發生在中斷上下文(in_interrupt()爲真),會直接panic;否則只有配置了panic_on_oops爲1,纔會觸發panic操作,如果沒有配置爲1,並不會導致系統panic重啓,而是通過do_exit(SIGSEGV)結束當前任務(__die返回NOTIFY_STOP時除外)。無論哪種情況,Oops都會打印內核調用棧。

一種手動觸發panic的機制

利用sysrq機制可以觸發kernel crash:

echo c > /proc/sysrq-trigger

這種方式就是利用Oops機制來觸發panic的:

/*
 * SysRq crash handler: sets panic_on_oops and then stores through a NULL
 * pointer so that the resulting fault is escalated to a panic.
 *
 * @key: the SysRq key; not used in this function body
 */
static void sysrq_handle_crash(int key)
{
    char *killer = NULL;

    /* we need to release the RCU read lock here,
     * otherwise we get an annoying
     * 'BUG: sleeping function called from invalid context'
     * complaint from the kernel before the panic.
     */
    rcu_read_unlock();
    panic_on_oops = 1;  /* force panic */  //-------- (1)
    /* Write barrier between setting panic_on_oops and the faulting store. */
    wmb();
    /* Store through the NULL pointer `killer` to provoke the fault. */
    *killer = 1; //---------------------------(2)
}
  • 第(1)步先配置panic_on_oops爲1,使得當內核oops時直接觸發panic操作
  • 第(2)步訪問一個內核NULL空地址,觸發oops操作

到這裏可能很多人會有一個疑惑,對一個內核空地址賦值,是如何產生了Oops呢?

查看arm64的異常向量表:

 /*
  * EL1 mode handlers.
  */

 el1_da:
     /*
      * Data abort handling
      *
      * Reads FAR_EL1, runs it through clear_address_tag into x0, puts sp
      * in x2 and calls do_mem_abort. x1 is not written in this excerpt;
      * presumably it is set up before this label -- confirm in entry.S.
      */
     mrs x3, far_el1
     inherit_daif    pstate=x23, tmp=x2
     clear_address_tag x0, x3
     mov x2, sp              // struct pt_regs
     bl  do_mem_abort

     kernel_exit 1
......

el0_da:
    /*
     * Data abort handling
     *
     * Reads FAR_EL1, runs it through clear_address_tag into x0, copies
     * x25 into x1 and sp into x2, calls do_mem_abort, then branches to
     * ret_to_user. x25 presumably holds the ESR value saved before this
     * label -- confirm in entry.S.
     */
    mrs x26, far_el1
    enable_daif
    ct_user_exit
    clear_address_tag x0, x26
    mov x1, x25
    mov x2, sp
    bl  do_mem_abort
    b   ret_to_user


其中el1_da和el0_da都會調用到do_mem_abort。當CPU在運行時發生data abort異常,就會通過異常向量表進入對應的處理入口(內核態爲el1_da,用戶態爲el0_da),並最終執行do_mem_abort。

/*
 * Dispatch a memory abort to the handler selected by the ESR value.
 *
 * Looks up the fault_info entry for @esr and calls its ->fn handler. A zero
 * return from the handler ends processing. Otherwise the fault is reported:
 * extra diagnostics are printed when the fault did not come from user mode,
 * and a siginfo built from the table entry is passed to arm64_notify_die().
 *
 * @addr: faulting address
 * @esr:  syndrome value used to select the fault_info entry
 * @regs: register state at the time of the abort
 */
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
                     struct pt_regs *regs)
{
    const struct fault_info *inf = esr_to_fault_info(esr);
    struct siginfo info;

    /* A zero return from the handler means nothing more to do here. */
    if (!inf->fn(addr, esr, regs))
        return;

    /* Extra diagnostics only for faults that are not from user mode. */
    if (!user_mode(regs)) {
        pr_alert("Unhandled fault at 0x%016lx\n", addr);
        mem_abort_decode(esr);
        show_pte(addr);
    }

    /* Signal number and code come from the selected fault_info entry. */
    clear_siginfo(&info);
    info.si_signo = inf->sig;
    info.si_errno = 0;
    info.si_code  = inf->code;
    info.si_addr  = (void __user *)addr;
    arm64_notify_die(inf->name, regs, &info, esr);
}

其中esr_to_fault_info會用ESR的低6位(esr & 63)作爲下標,在下面的fault_info錯誤處理列表中查找對應的處理項:

/*
 * Map an ESR value to its fault_info entry: the low 6 bits (esr & 63)
 * are used as the index into the fault_info[] table.
 */
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
    return fault_info + (esr & 63);
}

static const struct fault_info fault_info[] = {
    { do_bad,       SIGKILL, SI_KERNEL, "ttbr address size fault"   },
    { do_bad,       SIGKILL, SI_KERNEL, "level 1 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 2 address size fault"    },
    { do_bad,       SIGKILL, SI_KERNEL, "level 3 address size fault"    },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault" },
    { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 8"         },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault" },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 access flag fault" },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 12"            },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 1 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 2 permission fault"  },
    { do_page_fault,    SIGSEGV, SEGV_ACCERR,   "level 3 permission fault"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous external abort"    },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 17"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 18"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 19"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 (translation table walk)"  },
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 (translation table walk)"  },
    { do_sea,       SIGBUS,  BUS_OBJERR,    "synchronous parity or ECC error" },    // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 25"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 26"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 27"            },
    { do_sea,       SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_sea,       SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },  // Reserved when RAS is implemented
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 32"            },
    { do_alignment_fault,   SIGBUS,  BUS_ADRALN,    "alignment fault"       },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 34"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 35"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 36"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 37"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 38"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 39"            },
    { do_bad,       SIGKILL, SI_KERNEL, "unknown 40"            },
......

經過這一系列的調用,最終內核會運行對應的錯誤處理函數。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章