kdump之kdump分析

都說Kdump是基於kexec機制工作的,但關於Kdump到底是怎麼實現的,
比如將第二個內核怎麼加載到具體的保留位置,第一個內核crash後怎麼傳需要的elfcorehdr和memmap參數給第二個內核,另外第二個內核是怎麼調用makedumpfile來過濾壓縮頁的,網上一些資料給的都太概括了,還沒找到相關分析的,看了下代碼,有了個大概,可能部分理解有誤,歡迎拍磚和探討.

先看一張圖,這個是網上找到的Vivek Goyal的PPT中兩幅圖,這裏合成一張了

KEXEC的設計是用新內核去覆蓋原內核位置;而KDUMP是預留一塊內存來加載第二個內核(和相關數據),Crash後第二個內核就在這塊預留內存中原地運行,不去覆蓋第一個內核(不然就達不到相關目的了),從而收集第一個內核的相關內存信息。在KDUMP中Kexec算是一個引導器,類似GRUB(2)。真正的實現是在kexec-tools中,對於RH系列,相關的kexec-tools RPM包中除了封裝相關程序外,還有個/etc/rc.d/init.d/kdump shell腳本來負責將相關工具粘在一起。

下面來說下大致流程:
1).第一個內核以crashkernel啓動後,內核解析此crashkernel命令行選項並將此選項值放到crashk_res中,並預留相關內存區域

/* crashkernel=size@addr specifies the location to reserve for
 * a crash kernel. By reserving this memory we guarantee
 * that linux never sets it up as a DMA target.
 * Useful for holding code to do something appropriate
 * after a kernel panic.
 */

/* Location of the reserved area for the crash kernel */

struct resource crashk_res = { /* see the definition of crashk_res */

    .name  = "Crash kernel",
    .start = 0,
    .end   = 0,
    .flags = IORESOURCE_BUSY | IORESOURCE_MEM
}; /* Presumably holds the range given on the command line, i.e. what /proc/iomem shows as "0x1000000-0x7ffffff : Crash kernel" */

/*
 * Early-parameter handler for "crashkernel=size@addr".
 *
 * The size is parsed first; only when it is followed by '@' is the base
 * address parsed and the [base, base + size - 1] range stored in
 * crashk_res.  A value without '@' leaves crashk_res untouched.
 *
 * Always returns 0.
 */
static int __init parse_crashkernel(char *arg)
{
    unsigned long length = memparse(arg, &arg);
    unsigned long origin;

    /* No "@addr" part: nothing to record. */
    if (*arg != '@')
        return 0;

    origin = memparse(arg + 1, &arg);
    /* FIXME: Do I want a sanity check
     * to validate the memory range?
     */

    crashk_res.start = origin; /* the range is stored here */
    crashk_res.end = origin + length - 1;
    return 0;
}
early_param("crashkernel", parse_crashkernel); /* ties parse_crashkernel to "crashkernel"; per the article the entry is placed in .init.setup */


/etc/init.d/kdump start啓動時(只摘錄部分相關的)


# Copy /proc/vmcore into a timestamped directory.
#
# The target directory is "<path>/<date>" where <path> comes from the
# "path" line of $KDUMP_CONFIG_FILE, or /var/crash when no such line exists.
# The dump is first written as vmcore-incomplete and renamed to vmcore only
# after cp succeeded, so a partial dump is never mistaken for a full one.
#
# Returns the exit status of the cp command.
function save_core()
{
    local kdump_path
    kdump_path=`grep ^path $KDUMP_CONFIG_FILE | cut -d' ' -f2-`
    # -z: use the default location only when no path is configured.
    if [ -z "$kdump_path" ]; then
        coredir="/var/crash/`date +"%Y-%m-%d-%H:%M"`"
    else
        coredir="${kdump_path}/`date +"%Y-%m-%d-%H:%M"`"
    fi

    # -p: create missing parent directories, no error if it already exists.
    mkdir -p "$coredir"
    cp --sparse=always /proc/vmcore "$coredir/vmcore-incomplete"
    exitcode=$?
    if [ $exitcode == 0 ]; then
        mv "$coredir/vmcore-incomplete" "$coredir/vmcore"
        $LOGGER "saved a vmcore to $coredir"
    else
        $LOGGER "failed to save a vmcore to $coredir"
    fi
    return $exitcode
}
# Load the capture (kdump) kernel into the reserved crash-kernel memory.
#
# Steps:
#   1. Default KDUMP_COMMANDLINE to the running kernel's /proc/cmdline.
#   2. Bail out (return 1) when no crash-kernel memory was reserved.
#   3. On i386/i686 pick --elf32/--elf64-core-headers based on the result of
#      need_64bit_headers, unless the user already forced one in KEXEC_ARGS.
#   4. Strip crashkernel=, mem= and hugepages options from the command line
#      and append the configured extras.
#   5. Invoke $KEXEC with the assembled command line, initrd and kernel.
#
# Returns 0 when kexec loaded the kernel, 1 otherwise.
function load_kdump()
{

    # -z: only fall back to /proc/cmdline when nothing was configured.
    if [ -z "$KDUMP_COMMANDLINE" ]
    then
        KDUMP_COMMANDLINE=`cat /proc/cmdline`
    fi

    ARCH=`uname -m`
    if [ "$ARCH" == "ppc64" ]
    then
        MEM_RESERVED=`grep "crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]" /proc/cmdline`
    else
        # -v: an all-zero "Crash kernel" range means nothing was reserved.
        MEM_RESERVED=`grep "Crash kernel" /proc/iomem | grep -v "00000000-00000000"`
    fi
    if [ -z "$MEM_RESERVED" ]
    then
        $LOGGER "No crashkernel parameter specified for running kernel"
        return 1
    fi

    if [ "$ARCH" == "i686" -o "$ARCH" == "i386" ]
    then

        need_64bit_headers
        if [ $? == 1 ]
        then
            FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf32-core-headers`
            # -n: the user explicitly asked for elf32 headers; warn only.
            if [ -n "$FOUND_ELF_ARGS" ]
            then
                echo -n "Warning: elf32-core-headers overrides correct elf64 setting"
                warning
                echo
            else
                KEXEC_ARGS="$KEXEC_ARGS --elf64-core-headers"
            fi
        else
            FOUND_ELF_ARGS=`echo $KEXEC_ARGS | grep elf64-core-headers`
            # -z: add elf32 headers unless elf64 was explicitly requested.
            if [ -z "$FOUND_ELF_ARGS" ]
            then
                KEXEC_ARGS="$KEXEC_ARGS --elf32-core-headers"
            fi
        fi
    fi

    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e 's/crashkernel=[0-9]\+[MmKkGg]@[0-9]\+[MmGgKk]//'`
    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e 's/mem=[0-9]\+[GMKgmk]* *//'`
    KDUMP_COMMANDLINE=`echo $KDUMP_COMMANDLINE | sed -e 's/hugepages=[0-9]\+ */ /g' -e 's/hugepagesz=[0-9]\+[kKmMgG]* */ /g'`

    KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_COMMANDLINE_APPEND}"
    avoid_cdrom_drive
    KDUMP_COMMANDLINE="${KDUMP_COMMANDLINE} ${KDUMP_IDE_NOPROBE_COMMANDLINE}"

    # This is the key part: hand kernel, initrd and command line to kexec.

    KEXEC_OUTPUT=`$KEXEC $KEXEC_ARGS $standard_kexec_args \
        --command-line="$KDUMP_COMMANDLINE" \
        --initrd=$kdump_initrd $kdump_kernel 2>&1`
    if [ $? == 0 ]; then
        $LOGGER "kexec: loaded kdump kernel"
        return 0
    else
        $LOGGER $KEXEC_OUTPUT
        $LOGGER "kexec: failed to load kdump kernel"
        return 1
    fi
}

# Start the kdump service in the first (production) kernel.
#
# Checks the current state via status (2 = unsupported kernel, 0 = already
# running), validates the configuration with check_config and finally loads
# the capture kernel with load_kdump.
#
# Returns 0 when kdump is already running, 1 on any failure; on a fresh
# successful start the status of the final $LOGGER call is returned.
function start()
{
    #TODO check raw partition for core dump image

    status
    rc=$?
    # echo -n keeps the cursor on the line so success/failure can print
    # the result marker next to the message.
    if [ $rc == 2 ]; then
        echo -n "Kdump is not supported on this kernel"; failure; echo
        return 1;
    else
        if [ $rc == 0 ]; then
            echo -n "Kdump already running"; success; echo
            return 0
        fi
    fi
    check_config
    if [ $? != 0 ]; then
        echo -n "Starting kdump:"; failure; echo
        $LOGGER "failed to start up, config file incorrect"
        return 1
    fi
    load_kdump
    if [ $? != 0 ]; then
        echo -n "Starting kdump:"; failure; echo
        $LOGGER "failed to start up"
        return 1
    fi

    echo -n "Starting kdump:"; success; echo
    $LOGGER "started up"
}

case "$1" in
  start)
    if [ -/proc/vmcore ]; then #第二個內核啓動後走此步!

        run_kdump_pre
        save_core
        run_kdump_post $?
        do_final_action
    else #剛開始走此步!

        start
    fi
    ;;


最後是調用如下形式

kexec --args-linux --elf32(64)-core-headers -p --command-line="$KDUMP_COMMANDLINE" --initrd=$kdump_initrd $kdump_kernel

其中commandline是在配置文件中手動設置的或者從/proc/cmdline得到


這個就到了上次分析kexec的代碼了,注意此處是以-p來調用的


/*
 * Load an x86 ELF kernel image for kexec.
 *
 * Parses the loader-specific options in argv (command line, ramdisk,
 * argument style), loads the ELF image held in buf/len with
 * elf_exec_build_load(), and, unless the argument style is "none", loads
 * the purgatory object and fills in its "entry32_regs" symbol so that eip
 * is the image entry point.
 *
 * When info->kexec_flags has KEXEC_ON_CRASH or KEXEC_PRESERVE_CONTEXT set,
 * a private copy of the command line is made; with the Linux argument
 * style load_crashdump_segments() is then called on that copy and the
 * modified copy is what gets passed to the new kernel.
 *
 * Returns 0 on success, -1 on a usage error or when
 * load_crashdump_segments() fails.
 */
int elf_x86_load(int argc, char **argv, const char *buf, off_t len, 
    struct kexec_info *info)

{
    struct mem_ehdr ehdr;
    const char *command_line;
    char *modified_cmdline;
    int command_line_len;
    int modified_cmdline_len;
    const char *ramdisk;
    unsigned long entry, max_addr;
    int arg_style;
#define ARG_STYLE_ELF 0
#define ARG_STYLE_LINUX 1
#define ARG_STYLE_NONE 2
    int opt;
#define OPT_APPEND        (OPT_ARCH_MAX+0)
#define OPT_REUSE_CMDLINE    (OPT_ARCH_MAX+1)
#define OPT_RAMDISK        (OPT_ARCH_MAX+2)
#define OPT_ARGS_ELF     (OPT_ARCH_MAX+3)
#define OPT_ARGS_LINUX     (OPT_ARCH_MAX+4)
#define OPT_ARGS_NONE     (OPT_ARCH_MAX+5)

    /* Long-option table for getopt_long();
     * see http://xuwenzhang.org/blog/tag/getopt_long/ (noted by peter.guo).
     * NOTE(review): "reuse-cmdline" is declared with has_arg = 1 although
     * its handler below never reads optarg - confirm against upstream.
     */
    static const struct option options[] = {

        KEXEC_ARCH_OPTIONS
        { "command-line",    1, NULL, OPT_APPEND },
        { "append",        1, NULL, OPT_APPEND },
        { "reuse-cmdline",    1, NULL, OPT_REUSE_CMDLINE },
        { "initrd",        1, NULL, OPT_RAMDISK },
        { "ramdisk",        1, NULL, OPT_RAMDISK },
        { "args-elf",        0, NULL, OPT_ARGS_ELF },
        { "args-linux",        0, NULL, OPT_ARGS_LINUX },
        { "args-none",        0, NULL, OPT_ARGS_NONE },
        { 0,             0, NULL, 0 },
    };

    static const char short_options[] = KEXEC_OPT_STR "";

    /*
     * Parse the command line arguments
     */

    arg_style = ARG_STYLE_ELF;
    command_line = 0;
    modified_cmdline = 0;
    modified_cmdline_len = 0;
    ramdisk = 0;
    while((opt = getopt_long(argc, argv, short_options, options, 0)) != -1){
        /* getopt_long belongs to the GNU family of interfaces */
        switch(opt) {
        default:
            /* Ignore core options */
            if (opt < OPT_ARCH_MAX) {
                break;
            }
            /* fallthrough: unknown arch-level option is a usage error */
        case '?':
            usage();
            return -1;
        case OPT_APPEND: /* taken by the kdump invocation (--command-line) */
            command_line = optarg;
            break;
        case OPT_REUSE_CMDLINE:
            command_line = get_command_line();
            break;
        case OPT_RAMDISK: /* taken by the kdump invocation (--initrd) */

            ramdisk = optarg;
            break;
        case OPT_ARGS_ELF: 
            arg_style = ARG_STYLE_ELF;
            break;
        case OPT_ARGS_LINUX: /* taken by the kdump invocation (--args-linux) */

            arg_style = ARG_STYLE_LINUX;
            break;
        case OPT_ARGS_NONE:
#ifdef __i386__
            arg_style = ARG_STYLE_NONE;
#else
            die("--args-none only works on arch i386\n");
#endif
            break;
        }
    }
    /* Length includes the terminating NUL; stays 0 without a command line. */
    command_line_len = 0;
    if (command_line) {
        command_line_len = strlen(command_line) +1;
    }

    /* Need to append some command line parameters internally in case of
     * taking crash dumps.
     */

    if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {
        /* Allocate a fresh buffer to hold the (modifiable) command line. */
        modified_cmdline = xmalloc(COMMAND_LINE_SIZE);

        memset((void *)modified_cmdline, 0, COMMAND_LINE_SIZE);
        if (command_line) {
            strncpy(modified_cmdline, command_line,
                        COMMAND_LINE_SIZE);
            /* strncpy does not terminate on truncation; force it. */
            modified_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
        }
        modified_cmdline_len = strlen(modified_cmdline);
    }

    /* Load the ELF executable */
    elf_exec_build_load(info, &ehdr, buf, len, 0);


    entry = ehdr.e_entry;
    max_addr = elf_max_addr(&ehdr);

    /* Do we want arguments? */
    if (arg_style != ARG_STYLE_NONE) {

        /* Load the setup code (purgatory - worth a closer look) */

        elf_rel_build_load(info, &info->rhdr, (char *) purgatory,purgatory_size,
            0, ULONG_MAX, 1, 0);
    }
    if (arg_style == ARG_STYLE_NONE) {
        /* No purgatory: jump straight to the image entry point. */
        info->entry = (void *)entry;

    }
    else if (arg_style == ARG_STYLE_ELF) {
        unsigned long note_base;
        struct entry32_regs regs;
        uint32_t arg1, arg2;

        /* Setup the ELF boot notes */
        note_base = elf_boot_notes(info, max_addr,
            (unsigned char *) command_line, command_line_len);

        /* Initialize the stack arguments */
        arg2 = 0; /* No return address */
        arg1 = note_base;
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_1", &arg1, sizeof(arg1));
        elf_rel_set_symbol(&info->rhdr, "stack_arg32_2", &arg2, sizeof(arg2));
        
        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.eip = entry; /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_arg32_2");
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));

        if (ramdisk) {
            die("Ramdisks not supported with generic elf arguments");
        }
    }
    else if (arg_style == ARG_STYLE_LINUX) { /* the branch kdump takes */

        struct x86_linux_faked_param_header *hdr;
        unsigned long param_base;
        const unsigned char *ramdisk_buf;
        off_t ramdisk_length;
        struct entry32_regs regs;
        int rc = 0;

        /* Get the linux parameter header */
        hdr = xmalloc(sizeof(*hdr));

        /* Hack: With some ld versions, vmlinux program headers show
         * a gap of two pages between bss segment and data segment
         * but effectively kernel considers it as bss segment and
         * overwrites the any data placed there. Hence bloat the
         * memsz of parameter segment to 16K to avoid being placed
         * in such gaps.
         * This is a makeshift solution until it is fixed in kernel
         */

        param_base = add_buffer(info, hdr, sizeof(*hdr), 16*1024,
            16, 0, max_addr, 1);

        /* Initialize the parameter header */
        memset(hdr, 0, sizeof(*hdr));
        init_linux_parameters(&hdr->hdr);

        /* Add a ramdisk to the current image */
        ramdisk_buf = NULL;
        ramdisk_length = 0;
        if (ramdisk) {
            ramdisk_buf = (unsigned char *) slurp_file(ramdisk,&ramdisk_length);
        }

        /* If panic kernel is being loaded, additional segments need
         * to be created. */

        if (info->kexec_flags & (KEXEC_ON_CRASH|KEXEC_PRESERVE_CONTEXT)) {


            /*
             * Example of a resulting capture-kernel command line:
             *
             * Command line: ro root=LABEL=/ rhgb quiet irqpoll maxcpus=1 reset_devices memmap=exactmap  memmap=640K@0K  memmap=5264K@16384K
             * memmap=125152K@22288K  elfcorehdr=147440K (0x8ffc000)
             * memmap=56K#1834688K  memmap=136K#1834744K memmap=128K#1834880K memmap=1024K$4193280K
             *
             * In the last line '#' marks a range as ACPI data and '$'
             * marks a range as reserved.  (Article author's note: the
             * code producing these two forms was not found.)
             *
             * The memmap and elfcorehdr parameters are obtained here and
             * stored into the new command line.
             */

            rc = load_crashdump_segments(info, modified_cmdline,
                        max_addr, 0);
         if (rc < 0)
                return -1;
            /* Use new command line. */
            command_line = modified_cmdline;
            command_line_len = strlen(modified_cmdline) + 1;
        }

        /* Tell the kernel what is going on */
        setup_linux_bootloader_parameters(info, &hdr->hdr, param_base, 
            offsetof(struct x86_linux_faked_param_header, command_line),
            command_line, command_line_len,
            ramdisk_buf, ramdisk_length);


        /* Fill in the information bios calls would usually provide */
        setup_linux_system_parameters(&hdr->hdr, info->kexec_flags);

        /* Initialize the registers */
        elf_rel_get_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
        regs.ebx = 0;        /* Bootstrap processor */
        regs.esi = param_base;    /* Pointer to the parameters */
        regs.eip = entry;    /* The entry point */
        regs.esp = elf_rel_get_addr(&info->rhdr, "stack_end"); /* Stack, unused */
        elf_rel_set_symbol(&info->rhdr, "entry32_regs", &regs, sizeof(regs));
    }
    else {
        die("Unknown argument style\n");
    }
    return 0;
}



/* Loads additional segments in case of a panic kernel is being loaded.
 * One segment for backup region, another segment for storing elf headers
 * for crash memory image.
 *
 * It also builds the list of memory ranges the panic kernel may use
 * (memmap_p) and appends it, together with the address of the elf header
 * segment, to mod_cmdline via cmdline_add_memmap() and
 * cmdline_add_elfcorehdr().
 *
 * info:        kexec state; segments are added to it with add_buffer().
 * mod_cmdline: writable command line buffer that receives the additions.
 * max_addr:    upper bound passed to add_buffer() for both segments.
 * min_base:    lower bound passed to add_buffer() for the elf header segment.
 *
 * Returns 0 on success, -1 when reading the crash memory ranges, creating
 * the elf headers or updating the memmap list fails.
 */

int load_crashdump_segments(struct kexec_info *info, char* mod_cmdline,
                unsigned long max_addr, unsigned long min_base)

{
    void *tmp;
    unsigned long sz, elfcorehdr;
    int nr_ranges, align = 1024;
    struct memory_range *mem_range, *memmap_p;

    if (get_crash_memory_ranges(&mem_range, &nr_ranges,
                 info->kexec_flags) < 0)
        return -1;

    /*
     * if the core type has not been set on command line, set it here
     * automatically
     */

    if (arch_options.core_header_type == CORE_TYPE_UNDEF) {
        arch_options.core_header_type =
            get_core_type(info, mem_range, nr_ranges);
    }

    /* 1.Memory regions which panic kernel can safely use to boot into */
    sz = (sizeof(struct memory_range) * (KEXEC_MAX_SEGMENTS + 1));
    memmap_p = xmalloc(sz);
    memset(memmap_p, 0, sz);
    add_memmap(memmap_p, BACKUP_SRC_START, BACKUP_SRC_SIZE); /* first region */

    sz = crash_reserved_mem.end - crash_reserved_mem.start +1;
    add_memmap(memmap_p, crash_reserved_mem.start, sz); /* second region */


    /* 2.Create a backup region segment to store backup data*/
    if (!(info->kexec_flags & KEXEC_PRESERVE_CONTEXT)) {
        /* Round the backup size up to a multiple of align. */
        sz = (BACKUP_SRC_SIZE + align - 1) & ~(align - 1);
        tmp = xmalloc(sz);
        memset(tmp, 0, sz);
        info->backup_start = add_buffer(info, tmp, sz, sz, align,
                        0, max_addr, -1);
        dbgprintf("Created backup segment at 0x%lx\n",
             info->backup_start);
        /* The backup segment is no longer usable memory for the panic kernel. */
        if (delete_memmap(memmap_p, info->backup_start, sz) < 0)
            return -1;
    }

    /* 3.Create elf header segment and store crash image data.
     * (Article author's open question: image of the 1st or the 2nd kernel?)
     * NOTE(review): the headers are built from crash_memory_range, which is
     * not declared in this function, rather than from the local mem_range -
     * presumably a file-level array that mem_range points to; confirm.
     */
    if (arch_options.core_header_type == CORE_TYPE_ELF64) {
        if (crash_create_elf64_headers(info, &elf_info64,
                     crash_memory_range, nr_ranges,
                     &tmp, &sz,
                     ELF_CORE_HEADER_ALIGN) < 0)
            return -1;
    }
    else {
        if (crash_create_elf32_headers(info, &elf_info32,
                     crash_memory_range, nr_ranges,
                     &tmp, &sz,
                     ELF_CORE_HEADER_ALIGN) < 0) /* where is this defined? (noted by peter.guo) */

            return -1;
    }

    /* Hack: With some ld versions (GNU ld version 2.14.90.0.4 20030523),
     * vmlinux program headers show a gap of two pages between bss segment
     * and data segment but effectively kernel considers it as bss segment
     * and overwrites the any data placed there. Hence bloat (i.e. inflate) the memsz of
     * elf core header segment to 16K to avoid being placed in such gaps.
     * This is a makeshift solution until it is fixed in kernel.
     */

    elfcorehdr = add_buffer(info, tmp, sz, 16*1024, align, min_base,
                            max_addr, -1);
    dbgprintf("Created elf header segment at 0x%lx\n", elfcorehdr);
    /* Likewise remove the elf header segment from the usable ranges. */
    if (delete_memmap(memmap_p, elfcorehdr, sz) < 0)
        return -1;
    cmdline_add_memmap(mod_cmdline, memmap_p);

    cmdline_add_elfcorehdr(mod_cmdline, elfcorehdr);

    /* Article author's open question: why are no memmap entries of the
     * K# and K$ forms added here? */

    return 0;
}


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章