linux內核exec過程

簡介

本文分析linux內核exec系統調用執行過程中可執行文件的加載過程和棧的設置,內核代碼版本爲2.6.32

分析

\arch\ia64\kernel\process.c中有sys_execve函數的實現,它是exec的系統調用服務例程

/*
 * Service routine for the execve system call.
 *
 * Obtains a kernel copy of the user-supplied path name with getname(),
 * passes it together with argv/envp/regs to do_execve(), and releases the
 * name again with putname().
 *
 * Returns the result of do_execve(), or the error encoded in the pointer
 * returned by getname() when the name could not be obtained.
 */
long
sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp,
        struct pt_regs *regs)
{
    char *kname = getname(filename);
    /* Pre-load the error code; it is only returned if kname is an error pointer. */
    int ret = PTR_ERR(kname);

    if (!IS_ERR(kname)) {
        ret = do_execve(kname, argv, envp, regs);
        putname(kname);
    }

    return ret;
}

\fs\namei.c中有getname函數的實現,在getname中,會從slab分配器中分配空間,然後從用戶空間讀取名字。所以sys_execve的主要工作由do_execve來實現,do_execve實現在\fs\exec.c中,下面分析do_execve的實現

首先是取消共享(unshare)打開文件描述符表

 struct files_struct *displaced;
retval = unshare_files(&displaced);

unshare是linux中名稱空間的控制函數,files_struct是掛靠在進程描述符上的,表示一個進程打開文件的信息,包含打開文件列表等信息。這裏的unshare_files的作用是:如果當前進程的files_struct正與其他進程(線程)共享,就複製一份原打開文件列表供自己私有使用,原來的那份通過displaced返回。所以說,exec之後的新程序仍然保有exec之前已打開的文件描述符(設置了close-on-exec的除外),包括標準輸入、標準輸出和錯誤輸出

struct linux_binprm *bprm;
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);

這裏動態分配了linux_binprm結構,linux_binprm是exec過程中信息的結構

/*
 * This structure is used to hold the arguments that are used when loading binaries.
 * do_execve() allocates one with kzalloc() and passes it on to the
 * binary-format loader (e.g. load_elf_binary()).
 */
struct linux_binprm{
    char buf[BINPRM_BUF_SIZE]; /* start of the file, read in by prepare_binprm() */
#ifdef CONFIG_MMU
    struct vm_area_struct *vma; /* NOTE(review): presumably the vma backing the argument area - confirm */
#else
# define MAX_ARG_PAGES    32
    struct page *page[MAX_ARG_PAGES]; /* pages holding the argument strings */
#endif
    struct mm_struct *mm; /* memory descriptor, set up by bprm_mm_init() */
    unsigned long p; /* current top of mem */
    unsigned int
        cred_prepared:1,/* true if creds already prepared (multiple
                 * preps happen for interpreters) */
        cap_effective:1;/* true if has elevated effective capabilities,
                 * false if not; except for init which inherits
                 * its parent's caps anyway */
#ifdef __alpha__
    unsigned int taso:1;
#endif
    unsigned int recursion_depth;
    struct file * file; /* the opened executable (result of open_exec()) */
    struct cred *cred;    /* new credentials */
    int unsafe;        /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
    unsigned int per_clear;    /* bits to clear in current->personality */
    int argc, envc; /* number of argument / environment strings, from count() */
    char * filename;    /* Name of binary as seen by procps */
    char * interp;        /* Name of the binary really executed. Most
                   of the time same as filename, but could be
                   different for binfmt_{misc,script} */
    unsigned interp_flags;
    unsigned interp_data;
    unsigned long loader, exec; /* exec is set to p right after the file name is copied */
};
linux_binprm結構

接下來的prepare_bprm_creds新建一個cred結構,設置linux_binprm中的cred結構,就是信任狀相關內容,包含gid,uid等信息,經常用來提權

retval = prepare_bprm_creds(bprm);

然後打開文件,並初始化文件相關結構

file = open_exec(filename);
bprm->file = file;
bprm->filename = filename;
bprm->interp = filename;

建立內存管理的mm結構

retval = bprm_mm_init(bprm);
然後初始化一下參數個數和環境變量個數
bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
    goto out;

bprm->envc = count(envp, MAX_ARG_STRINGS);
if ((retval = bprm->envc) < 0)
    goto out;

接着是prepare_binprm函數,prepare_binprm函數檢查了文件是否可以執行,初始化了binprm中cred的幾個字段,然後還從文件中讀取了BINPRM_BUF_SIZE的內容到binprm的buf中

/*
 * Fill in the parts of @bprm that depend on the file being executed.
 *
 * The effective uid/gid in the new credentials are first reset to the
 * caller's current values; set-uid / set-gid bits on the inode are then
 * honoured unless the mount carries MNT_NOSUID. After the security hook
 * has run, bprm->buf is zeroed and filled from the start of the file.
 *
 * Returns -EACCES when the file has no file operations, the non-zero
 * result of security_bprm_set_creds() if it fails, and otherwise whatever
 * kernel_read() returns.
 */
int prepare_binprm(struct linux_binprm *bprm)
{
    struct inode *ino = bprm->file->f_path.dentry->d_inode;
    umode_t perms = ino->i_mode;
    int err;

    if (!bprm->file->f_op)
        return -EACCES;

    /* Drop any set[ug]id state left over from a previously prepared binary. */
    bprm->cred->euid = current_euid();
    bprm->cred->egid = current_egid();

    if ((bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) == 0) {
        /* Set-uid binary: run with the file owner's uid. */
        if ((perms & S_ISUID) != 0) {
            bprm->per_clear |= PER_CLEAR_ON_SETID;
            bprm->cred->euid = ino->i_uid;
        }

        /*
         * Set-gid only counts together with group-execute; set-gid
         * without it marks a mandatory-locking candidate instead of
         * a set-gid executable.
         */
        if ((perms & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
            bprm->per_clear |= PER_CLEAR_ON_SETID;
            bprm->cred->egid = ino->i_gid;
        }
    }

    /* Let the security module fill in its part of the credentials. */
    err = security_bprm_set_creds(bprm);
    if (err != 0)
        return err;
    bprm->cred_prepared = 1;

    /* Zero first so a short read leaves no stale bytes behind. */
    memset(bprm->buf, 0, BINPRM_BUF_SIZE);
    return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
}
View Code

 下面是複製幾個字符串的工作

    retval = copy_strings_kernel(1, &bprm->filename, bprm);
    if (retval < 0)
        goto out;

    bprm->exec = bprm->p;
    retval = copy_strings(bprm->envc, envp, bprm);
    if (retval < 0)
        goto out;

    retval = copy_strings(bprm->argc, argv, bprm);
    if (retval < 0)
        goto out;

其中copy_strings_kernel也是調用copy_strings實現的,只不過是從內核空間中拷貝,具體實現就是使用set_fs設置段限制爲內核數據段。

看copy_strings函數之前,先看看linux_binprm中的兩個字段,page和p,page表示的是存放參數的頁面數組,而p表示的是這些頁面中當前的頂部位置,因爲這些字符串是按照棧的方式存放的,也就是說,先使用地址更高的頁面,向低地址方向增長,p就指向棧頂部

下面copy_strings的實現也就清楚了。

/*
 * Copy argc strings, whose pointers are stored in the argv array, into the
 * argument pages of bprm. Strings are processed from the last one to the
 * first and each string is copied from its end towards its start, so the
 * data grows downward from bprm->p like a stack.
 *
 * Returns 0 on success, -EFAULT if a pointer or string cannot be read (or
 * strnlen_user() reports 0), and -E2BIG if valid_arg_len() rejects the
 * length or no argument page can be obtained.
 */
static int copy_strings(int argc, char __user * __user * argv,
            struct linux_binprm *bprm)
{
    struct page *kmapped_page = NULL;
    char *kaddr = NULL;
    unsigned long kpos = 0;
    int ret;
    /*
    argc is counted down here, so get_user() fetches the argv strings in
    reverse order (last string first)
    */
    while (argc-- > 0) {
        char __user *str;
        int len;
        unsigned long pos;

        if (get_user(str, argv+argc) ||
                !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
            ret = -EFAULT;
            goto out;
        }

        if (!valid_arg_len(bprm, len)) {
            ret = -E2BIG;
            goto out;
        }
        /*
        bprm->p is the current top of the argument area and keeps decreasing
        pos is the position just past the end of where this string will go
        str is advanced to the end of the user-space string
         */
        /* We're going to work our way backwards. */
        pos = bprm->p;
        str += len;
        bprm->p -= len;

        while (len > 0) {
            int offset, bytes_to_copy;
            //offset is the in-page offset just past the end of the chunk to copy
            offset = pos % PAGE_SIZE;
            if (offset == 0)
                offset = PAGE_SIZE;

            bytes_to_copy = offset;
            if (bytes_to_copy > len)
                bytes_to_copy = len;
            //offset now becomes the in-page start of the chunk; the data goes into [offset, offset+bytes_to_copy)
            offset -= bytes_to_copy;
            pos -= bytes_to_copy;
            str -= bytes_to_copy;
            len -= bytes_to_copy;

            if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                struct page *page;
                //get the argument page covering pos; NOTE(review): presumably allocated on demand by get_arg_page() - confirm
                page = get_arg_page(bprm, pos, 1);
                if (!page) {
                    ret = -E2BIG;
                    goto out;
                }
                
                if (kmapped_page) {
                    flush_kernel_dcache_page(kmapped_page);
                    kunmap(kmapped_page);
                    put_arg_page(kmapped_page);
                }
                //map the page into kernel address space; kaddr is the address returned by kmap()
                kmapped_page = page;
                kaddr = kmap(kmapped_page);
                kpos = pos & PAGE_MASK;
                flush_arg_page(bprm, kpos, kmapped_page);
            }
            //copy the chunk from user space into the kernel-mapped argument page
            if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                ret = -EFAULT;
                goto out;
            }
        }
    }
    ret = 0;
out:
    /* release the page that is still mapped, on success and on error alike */
    if (kmapped_page) {
        flush_kernel_dcache_page(kmapped_page);
        kunmap(kmapped_page);
        put_arg_page(kmapped_page);
    }
    return ret;
}

 接着就是尋找可執行文件的過程

retval = search_binary_handler(bprm,regs);

在linux內核中,有一個全局的鏈表formats,表示系統中所有的可執行文件格式,其中鏈上掛接的結構是linux_binfmt結構,表示一個可執行文件格式,包含了3個重要的函數,分別用來加載可執行文件、加載共享庫和生成core_dump核心轉儲文件,search_binary_handler就是依次調用這個鏈上各個格式的load_binary來嘗試加載。

/*
 * Describes one executable file format. Instances are linked on the global
 * formats list and supply the callbacks used to load an executable, load a
 * shared library and write a core dump.
 */
struct linux_binfmt {
    struct list_head lh;    /* list linkage */
    struct module *module;
    /* load an executable described by the linux_binprm */
    int (*load_binary)(struct linux_binprm *, struct  pt_regs * regs);
    /* load a shared library from the given file */
    int (*load_shlib)(struct file *);
    /* write a core dump */
    int (*core_dump)(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
    unsigned long min_coredump;    /* minimal dump size */
    int hasvdso;
};

其中elf的結構定義在\fs\binfmt_elf.c中,如下:

/*
 * The linux_binfmt instance for ELF: wires the ELF loader, shared-library
 * loader and core-dump routine into the generic binary-format interface.
 */
static struct linux_binfmt elf_format = {
        .module        = THIS_MODULE,
        .load_binary    = load_elf_binary,
        .load_shlib    = load_elf_library,
        .core_dump    = elf_core_dump,
        .min_coredump    = ELF_EXEC_PAGESIZE,
        .hasvdso    = 1
};

回頭看do_execve,search_binary_handler下面的內容也沒有什麼了,清除一些分配的結構等等。所以主要的加載實現是在load_elf_binary,這個函數接受了之前初始化的linux_binprm和寄存器上下文。加載可執行文件

下面看load_elf_binary函數

/*
 * load_binary handler for ELF files.
 *
 * Outline of what the code below does, in order:
 *  - validate the ELF header found in bprm->buf and read the program
 *    header table;
 *  - if a PT_INTERP entry exists, read the interpreter path, open it and
 *    read its ELF header;
 *  - inspect PT_GNU_STACK to decide whether the stack is executable;
 *  - call flush_old_exec() and set up the stack with setup_arg_pages();
 *  - map every PT_LOAD segment and track code/data/bss/brk boundaries;
 *  - call set_brk() for bss/brk, load the interpreter if there is one;
 *  - build the ELF tables on the stack with create_elf_tables(), record the
 *    layout in current->mm and call start_thread() at the entry point.
 *
 * Returns 0 on success or a negative errno value.
 */
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
    struct file *interpreter = NULL; /* to shut gcc up */
     unsigned long load_addr = 0, load_bias = 0;
    int load_addr_set = 0;
    char * elf_interpreter = NULL;
    unsigned long error;
    struct elf_phdr *elf_ppnt, *elf_phdata;
    unsigned long elf_bss, elf_brk;
    int retval, i;
    unsigned int size;
    unsigned long elf_entry;
    unsigned long interp_load_addr = 0;
    unsigned long start_code, end_code, start_data, end_data;
    unsigned long reloc_func_desc = 0;
    int executable_stack = EXSTACK_DEFAULT;
    unsigned long def_flags = 0;
    //two ELF headers, one for the executable and one for the dynamic linker;
    //only a pointer is declared here, the storage comes from kmalloc below (not from the stack)
    struct {
        struct elfhdr elf_ex;
        struct elfhdr interp_elf_ex;
    } *loc;
    //allocate memory for the two headers
    loc = kmalloc(sizeof(*loc), GFP_KERNEL);
    if (!loc) {
        retval = -ENOMEM;
        goto out_ret;
    }

    //prepare_binprm() already read the start of the file into bprm->buf; take the ELF header from there
    loc->elf_ex = *((struct elfhdr *)bprm->buf);

    retval = -ENOEXEC;
    //some simple consistency checks
    if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
        goto out;

    if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
        goto out;
    if (!elf_check_arch(&loc->elf_ex))
        goto out;
    if (!bprm->file->f_op||!bprm->file->f_op->mmap)
        goto out;

    /* Now read in all of the header information */
    if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr))
        goto out;
    if (loc->elf_ex.e_phnum < 1 ||
         loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr))
        goto out;
    //size of the program header table
    size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr);
    retval = -ENOMEM;
    //allocate memory for the program header table
    elf_phdata = kmalloc(size, GFP_KERNEL);
    if (!elf_phdata)
        goto out;
    //read in the program header table
    retval = kernel_read(bprm->file, loc->elf_ex.e_phoff,
                 (char *)elf_phdata, size);
    if (retval != size) {
        if (retval >= 0)
            retval = -EIO;
        goto out_free_ph;
    }

    elf_ppnt = elf_phdata;
    elf_bss = 0;
    elf_brk = 0;

    start_code = ~0UL;
    end_code = 0;
    start_data = 0;
    end_data = 0;
    //walk all program headers looking for PT_INTERP, which holds the path of the dynamic linker
    for (i = 0; i < loc->elf_ex.e_phnum; i++) {
        if (elf_ppnt->p_type == PT_INTERP) {
            /* This is the program interpreter used for
             * shared libraries - for now assume that this
             * is an a.out format binary
             */
            retval = -ENOEXEC;
            //check that the length of the interpreter path is acceptable
            if (elf_ppnt->p_filesz > PATH_MAX || 
                elf_ppnt->p_filesz < 2)
                goto out_free_ph;

            retval = -ENOMEM;
            //allocate memory for the interpreter path
            elf_interpreter = kmalloc(elf_ppnt->p_filesz,
                          GFP_KERNEL);
            if (!elf_interpreter)
                goto out_free_ph;
            //read the interpreter path
            retval = kernel_read(bprm->file, elf_ppnt->p_offset,
                         elf_interpreter,
                         elf_ppnt->p_filesz);
            if (retval != elf_ppnt->p_filesz) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_interp;
            }
            /* make sure path is NULL terminated */
            retval = -ENOEXEC;
            //the last byte of the path must be the '\0' character
            if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
                goto out_free_interp;

            /*
             * The early SET_PERSONALITY here is so that the lookup
             * for the interpreter happens in the namespace of the
             * to-be-execed image.  SET_PERSONALITY can select an
             * alternate root.
             *
             * However, SET_PERSONALITY is NOT allowed to switch
             * this task into the new images's memory mapping
             * policy - that is, TASK_SIZE must still evaluate to
             * that which is appropriate to the execing application.
             * This is because exit_mmap() needs to have TASK_SIZE
             * evaluate to the size of the old image.
             *
             * So if (say) a 64-bit application is execing a 32-bit
             * application it is the architecture's responsibility
             * to defer changing the value of TASK_SIZE until the
             * switch really is going to happen - do this in
             * flush_thread().    - akpm
             */
            //presumably related to the execution domain (personality)
            SET_PERSONALITY(loc->elf_ex);
            //open_exec() is the in-kernel helper for opening an executable; it returns a struct file
            interpreter = open_exec(elf_interpreter);
            retval = PTR_ERR(interpreter);
            if (IS_ERR(interpreter))
                goto out_free_interp;

            /*
             * If the binary is not readable then enforce
             * mm->dumpable = 0 regardless of the interpreter's
             * permissions.
             */
            if (file_permission(interpreter, MAY_READ) < 0)
                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
            //bprm->buf is overwritten here with the start of the interpreter file
            retval = kernel_read(interpreter, 0, bprm->buf,
                         BINPRM_BUF_SIZE);
            if (retval != BINPRM_BUF_SIZE) {
                if (retval >= 0)
                    retval = -EIO;
                goto out_free_dentry;
            }

            /* Get the exec headers */
            //keep a copy of the interpreter's ELF header
            loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
            break;
        }
        elf_ppnt++;
    }
    //walk the program headers again looking for PT_GNU_STACK, which says whether the stack should be executable
    elf_ppnt = elf_phdata;
    for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++)
        if (elf_ppnt->p_type == PT_GNU_STACK) {
            if (elf_ppnt->p_flags & PF_X)
                executable_stack = EXSTACK_ENABLE_X;
            else
                executable_stack = EXSTACK_DISABLE_X;
            break;
        }

    //some checks on the dynamic linker
    /* Some simple consistency checks for the interpreter */
    if (elf_interpreter) {
        retval = -ELIBBAD;
        /* Not an ELF interpreter */
        if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
            goto out_free_dentry;
        /* Verify the interpreter has a valid arch */
        if (!elf_check_arch(&loc->interp_elf_ex))
            goto out_free_dentry;
    } else {
        /* Executables without an interpreter also need a personality  */
        SET_PERSONALITY(loc->elf_ex);
    }

    /* Flush all traces of the currently running executable */
    //discard the contents of the old address space
    retval = flush_old_exec(bprm);
    if (retval)
        goto out_free_dentry;

    /* OK, This is the point of no return */
    current->flags &= ~PF_FORKNOEXEC;
    current->mm->def_flags = def_flags;

    /* Do this immediately, since STACK_TOP as used in setup_arg_pages
       may depend on the personality.  */
    SET_PERSONALITY(loc->elf_ex);
    if (elf_read_implies_exec(loc->elf_ex, executable_stack))
        current->personality |= READ_IMPLIES_EXEC;

    if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
        current->flags |= PF_RANDOMIZE;
    //pick an address space layout
    arch_pick_mmap_layout(current->mm);

    /* Do this so that we can load the interpreter, if need be.  We will
       change some of these later */
    current->mm->free_area_cache = current->mm->mmap_base;
    current->mm->cached_hole_size = 0;
    //set up the pages of the stack vma
    retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
                 executable_stack);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out_free_dentry;
    }
    
    current->mm->start_stack = bprm->p;

    /* Now we do a little grungy work by mmaping the ELF image into
       the correct location in memory. */
    //walk the program headers and handle every PT_LOAD segment
    for(i = 0, elf_ppnt = elf_phdata;
        i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
        int elf_prot = 0, elf_flags;
        unsigned long k, vaddr;

        if (elf_ppnt->p_type != PT_LOAD)
            continue;
        /*
        Further down, elf_bss and elf_brk are set to p_vaddr + p_filesz and
        p_vaddr + p_memsz respectively, so this test is true when an earlier
        segment had a memory size larger than its file size, i.e. a segment
        that carries bss (the data segment)
        */
        if (unlikely (elf_brk > elf_bss)) {
            unsigned long nbyte;
                
            /* There was a PT_LOAD segment with p_memsz > p_filesz
               before this one. Map anonymous pages, if needed,
               and clear the area.  */
            //make the bss range valid
            retval = set_brk (elf_bss + load_bias,
                      elf_brk + load_bias);
            if (retval) {
                send_sig(SIGKILL, current, 0);
                goto out_free_dentry;
            }
            //clear the bss bytes from elf_bss up to the next ELF_MIN_ALIGN boundary (at most elf_brk - elf_bss)
            nbyte = ELF_PAGEOFFSET(elf_bss);
            if (nbyte) {
                nbyte = ELF_MIN_ALIGN - nbyte;
                if (nbyte > elf_brk - elf_bss)
                    nbyte = elf_brk - elf_bss;
                if (clear_user((void __user *)elf_bss +
                            load_bias, nbyte)) {
                    /*
                     * This bss-zeroing can fail if the ELF
                     * file specifies odd protections. So
                     * we don't check the return value
                     */
                }
            }
        }

        if (elf_ppnt->p_flags & PF_R)
            elf_prot |= PROT_READ;
        if (elf_ppnt->p_flags & PF_W)
            elf_prot |= PROT_WRITE;
        if (elf_ppnt->p_flags & PF_X)
            elf_prot |= PROT_EXEC;

        elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;

        vaddr = elf_ppnt->p_vaddr;
        //an ordinary executable (ET_EXEC) takes this branch
        if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
            elf_flags |= MAP_FIXED;
        } else if (loc->elf_ex.e_type == ET_DYN) {
            //shared object file; the dynamic linker itself has this type
            /* Try and get dynamic programs out of the way of the
             * default mmap base, as well as whatever program they
             * might try to exec.  This is because the brk will
             * follow the loader, and is not movable.  */
#ifdef CONFIG_X86
            load_bias = 0;
#else
            load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
#endif
        }
        //map this segment; for an executable the mapping address is the segment's virtual address
        error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
                elf_prot, elf_flags, 0);
        if (BAD_ADDR(error)) {
            send_sig(SIGKILL, current, 0);
            retval = IS_ERR((void *)error) ?
                PTR_ERR((void*)error) : -EINVAL;
            goto out_free_dentry;
        }

        if (!load_addr_set) {
            load_addr_set = 1;
            //compute the virtual address at which the file as a whole is loaded
            load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
            if (loc->elf_ex.e_type == ET_DYN) {
                load_bias += error -
                             ELF_PAGESTART(load_bias + vaddr);
                load_addr += load_bias;
                reloc_func_desc = load_bias;
            }
        }
        k = elf_ppnt->p_vaddr;
        if (k < start_code)
            start_code = k;
        if (start_data < k)
            start_data = k;

        /*
         * Check to see if the section's size will overflow the
         * allowed task size. Note that p_filesz must always be
         * <= p_memsz so it is only necessary to check p_memsz.
         */
        if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
            elf_ppnt->p_memsz > TASK_SIZE ||
            TASK_SIZE - elf_ppnt->p_memsz < k) {
            /* set_brk can never work. Avoid overflows. */
            send_sig(SIGKILL, current, 0);
            retval = -EINVAL;
            goto out_free_dentry;
        }

        //virtual address plus size in the file
        k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;

        if (k > elf_bss)
            elf_bss = k;
        if ((elf_ppnt->p_flags & PF_X) && end_code < k)
            end_code = k;
        if (end_data < k)
            end_data = k;
        //virtual address plus size in memory
        k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
        if (k > elf_brk)
            elf_brk = k;
    }

    loc->elf_ex.e_entry += load_bias;
    elf_bss += load_bias;
    elf_brk += load_bias;
    start_code += load_bias;
    end_code += load_bias;
    start_data += load_bias;
    end_data += load_bias;

    /* Calling set_brk effectively mmaps the pages that we need
     * for the bss and break sections.  We must do this before
     * mapping in the interpreter, to make sure it doesn't wind
     * up getting placed where the bss needs to go.
     */
    retval = set_brk(elf_bss, elf_brk);
    if (retval) {
        send_sig(SIGKILL, current, 0);
        goto out_free_dentry;
    }
    if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
        send_sig(SIGSEGV, current, 0);
        retval = -EFAULT; /* Nobody gets to see this, but.. */
        goto out_free_dentry;
    }
    //if there is a dynamic linker, the entry point becomes the dynamic linker's entry point
    if (elf_interpreter) {
        unsigned long uninitialized_var(interp_map_addr);

        elf_entry = load_elf_interp(&loc->interp_elf_ex,
                        interpreter,
                        &interp_map_addr,
                        load_bias);
        if (!IS_ERR((void *)elf_entry)) {
            /*
             * load_elf_interp() returns relocation
             * adjustment
             */
            interp_load_addr = elf_entry;
            elf_entry += loc->interp_elf_ex.e_entry;
        }
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = IS_ERR((void *)elf_entry) ?
                    (int)elf_entry : -EINVAL;
            goto out_free_dentry;
        }
        reloc_func_desc = interp_load_addr;

        allow_write_access(interpreter);
        fput(interpreter);
        kfree(elf_interpreter);
    } else {
        elf_entry = loc->elf_ex.e_entry;
        if (BAD_ADDR(elf_entry)) {
            force_sig(SIGSEGV, current);
            retval = -EINVAL;
            goto out_free_dentry;
        }
    }

    kfree(elf_phdata);

    set_binfmt(&elf_format);

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
    retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

    install_exec_creds(bprm);
    current->flags &= ~PF_FORKNOEXEC;
    /* lay out argc/argv/envp/auxv on the new stack */
    retval = create_elf_tables(bprm, &loc->elf_ex,
              load_addr, interp_load_addr);
    if (retval < 0) {
        send_sig(SIGKILL, current, 0);
        goto out;
    }
    /* N.B. passed_fileno might not be initialized? */
    current->mm->end_code = end_code;
    current->mm->start_code = start_code;
    current->mm->start_data = start_data;
    current->mm->end_data = end_data;
    current->mm->start_stack = bprm->p;

#ifdef arch_randomize_brk
    if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
        current->mm->brk = current->mm->start_brk =
            arch_randomize_brk(current->mm);
#endif

    if (current->personality & MMAP_PAGE_ZERO) {
        /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
           and some applications "depend" upon this behavior.
           Since we do not have the power to recompile these, we
           emulate the SVr4 behavior. Sigh. */
        down_write(&current->mm->mmap_sem);
        error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
                MAP_FIXED | MAP_PRIVATE, 0);
        up_write(&current->mm->mmap_sem);
    }

#ifdef ELF_PLAT_INIT
    /*
     * The ABI may specify that certain registers be set up in special
     * ways (on i386 %edx is the address of a DT_FINI function, for
     * example.  In addition, it may also specify (eg, PowerPC64 ELF)
     * that the e_entry field is the address of the function descriptor
     * for the startup routine, rather than the address of the startup
     * routine itself.  This macro performs whatever initialization to
     * the regs structure is required as well as any relocations to the
     * function descriptor entries when executing dynamically links apps.
     */
    ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

    /* hand the entry point and the stack pointer (bprm->p) to start_thread() */
    start_thread(regs, elf_entry, bprm->p);
    retval = 0;
out:
    kfree(loc);
out_ret:
    return retval;

    /* error cleanup */
out_free_dentry:
    allow_write_access(interpreter);
    if (interpreter)
        fput(interpreter);
out_free_interp:
    kfree(elf_interpreter);
out_free_ph:
    kfree(elf_phdata);
    goto out;
}
View Code

這段代碼比較長,關鍵點有這些

1、加載可執行文件,這部分實現在尋找類型爲PT_INTERP和PT_LOAD的段的循環中,對於可執行文件,它的PT_INTERP段中存放動態鏈接器的路徑,然後遍歷所有的PT_LOAD段,可執行文件主要有兩個段,text段和data段,這兩個段按照虛擬地址進行map。

2、加載動態鏈接器,主要實現在load_elf_interp函數,具體的加載方式也和可執行文件的加載方式十分類似,找到elf文件的所有類型爲PT_LOAD的段,然後map,這裏map的地址就是從2G+段虛擬地址的地址開始map,因爲動態鏈接器的段虛擬地址是從0開始的。

3、設置棧空間,主要實現在create_elf_tables中

在create_elf_tables中對於棧空間的設置有下面這些部分

第一步:

p = arch_align_stack(p);

/*
 * Choose the final stack position for a new program image.
 *
 * Unless the personality has ADDR_NO_RANDOMIZE set or randomize_va_space
 * is zero, the pointer is lowered by a random amount in [0, 8191] bytes.
 * The result is always rounded down to a 16-byte boundary.
 */
unsigned long arch_align_stack(unsigned long sp)
{
    unsigned long aligned = sp;

    if ((current->personality & ADDR_NO_RANDOMIZE) == 0 && randomize_va_space != 0) {
        aligned -= get_random_int() % 8192;
    }

    /* Clear the low four bits: 16-byte alignment. */
    aligned &= ~0xf;
    return aligned;
}

第二步:

if (k_platform) {
    size_t len = strlen(k_platform) + 1;

    u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
    if (__copy_to_user(u_platform, k_platform, len))
        return -EFAULT;
}

第三步:

get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
u_rand_bytes = (elf_addr_t __user *)
              STACK_ALLOC(p, sizeof(k_rand_bytes));
if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
    return -EFAULT;

第四步:

sp = STACK_ADD(p, ei_index);

items = (argc + 1) + (envc + 1) + 1;
bprm->p = STACK_ROUND(sp, items);

第五步:

/* Now, let's put argc (and argv, envp if appropriate) on the stack */
if (__put_user(argc, sp++))
    return -EFAULT;

第六步:

/* Populate argv and envp */
p = current->mm->arg_end = current->mm->arg_start;
while (argc-- > 0) {
    size_t len;
    if (__put_user((elf_addr_t)p, argv++))
        return -EFAULT;
    len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
    if (!len || len > MAX_ARG_STRLEN)
        return -EINVAL;
    p += len;
}
  if (__put_user(0, argv))
  return -EFAULT;

第七步:

while (envc-- > 0) {
    size_t len;
    if (__put_user((elf_addr_t)p, envp++))
        return -EFAULT;
    len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
    if (!len || len > MAX_ARG_STRLEN)
        return -EINVAL;
    p += len;
}
if (__put_user(0, envp))
    return -EFAULT;
current->mm->env_end = p;

第八步:

sp = (elf_addr_t __user *)envp + 1;
if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
    return -EFAULT;

整個棧上的空間分配應該是

position            content                         size(bytes) + comment
-------------------------------------------------------------------------------------------------

stack pointer ->    [argc = number of args]         8
                    [argv[0](pointer)]              8
                    [argv[1](pointer)]              8
                    [argv[...](pointer)]            8 * x
                    [argv[n-1](pointer)]            8
                    [argv[n](pointer)]              8 (=NULL)


                    [envp[0](pointer)]              8
                    [envp[1](pointer)]              8
                    [envp[..](pointer)]             8 * x
                    [envp[term](pointer)]           8 (=NULL)


                    [auxv[0](Elf64_auxv_t)]         16
                    [auxv[1](Elf64_auxv_t)]         16
                    [auxv[..](Elf64_auxv_t)]        16 * x
                    [auxv[term](Elf64_auxv_t)]      16 (=NULL)

                    [padding]                       >= 0

                    [rand bytes]                    16

                    [String identifying platform]   >= 0

                    [padding for align]             >= 0 (sp - (get_random_int() % 8192)) & (~0xf)

                    [argument ASCIIZ strings]       >= 0
                    [environment ASCIIZ str]        >= 0
                    [file name]                     >= 0

標號1(即上面的第一步,下面的標號2~8依次對應第二步到第八步)處將堆棧指針向下移動了x(0 <= x < 8192)字節,然後再向下按16字節對齊, 分配出[padding for align]這部分空間

標號2的代碼爲[String identifying platform]在棧上分配空間並進行賦值操作

標號3的代碼生成16bytes大小的隨機數,然後爲[rand bytes]在棧上分配空間並進行賦值操作

標號4的代碼

sp = STACK_ADD(p, ei_index)

在棧上分配輔助向量(auxiliary vector)所需存儲空間

items = (argc + 1) + (envc + 1) + 1;
bprm->p = STACK_ROUND(sp, items);

在棧上分配argc, argv, environment vector所需存儲空間, 值得注意的是這兩步都只是移動堆棧指針分配空間,並沒有進行賦值初始化數據操作

標號5, 6, 7, 8的代碼做了如下操作
5. 初始化棧上argc的值
6. 初始化棧上argv的值
7. 初始化棧上envp的值
8. 初始化棧上auxv的值

通過閱讀代碼可以看出執行結果與上面的結構圖是相匹配的, 可能會有疑惑的地方就是圖示中[padding]這一區域, 這一塊數據來源如下:

通過上面解釋我們可以看到標號4的代碼在棧上一次性分配了argc, argv, envp auxv所需要的空間,然後再通過堆棧指針按順序向上初始化每一塊數據, 重點在於分配的空間並不剛好等於所需空間, 因爲標號4的分配空間時使用了宏STACK_ROUND, 該宏定義如下

/*
 * Reserve `items` entries below `sp` and round the resulting address down
 * to a 16-byte boundary, yielding an unsigned long.
 *
 * When `sp` is a pointer the subtraction is in units of the pointed-to
 * type; when it is an integer it is a plain byte subtraction.
 *
 * Both arguments are parenthesized so that an expression argument such as
 * `a + b` is subtracted as a whole; without the parentheses
 * STACK_ROUND(sp, a + b) would expand to `sp - a + b`.
 */
#define STACK_ROUND(sp, items) \
    (((unsigned long) ((sp) - (items))) &~ 15UL)

會向下16字節對齊,因此分配的空間可能會比所需空間多,而標號5, 6 ,7 ,8的初始化操作又是從棧頂開始初始化的,所以最後在[auxv[term]]和[rand bytes]這兩塊區域之間會多出[padding]這塊數據

參考

https://github.com/chenpengcong/blog/issues/18

https://www.cnblogs.com/joey-hua/p/5638306.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章