linux內存管理--slab

這個kmem_cache_create()函數是一個和cpu結構有關係的函數，所以在公用函數中找不到（3.10.98內核版本中），我選擇的是 arch/x86/kernel/

說明下參數：

const char *name ：slab的名稱

size_t size ：每個對象的大小

size_t align ：每個對象的對齊

unsigned long flags ：創建cache時使用的SLAB標誌（例如 SLAB_HWCACHE_ALIGN、SLAB_POISON、SLAB_RED_ZONE），用來控制對象的對齊和調試選項

void (*ctor)(void *)：構造函數

/*
 * kmem_cache_create - create a slab cache (3.10.98 variant).
 * @name:  name of the cache
 * @size:  size of each object
 * @align: alignment of each object
 * @flags: SLAB flags
 * @ctor:  constructor for the objects
 *
 * Forwards every argument unchanged to kmem_cache_create_memcg(), passing
 * NULL as both its first and its last argument, and returns whatever that
 * call returns.
 */
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
          unsigned long flags, void (*ctor)(void *))
{
    struct kmem_cache *cache;

    cache = kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
    return cache;
}

看了下 3.10.98的kmem_cache_create()發現有很大的出入，還是先看看2.6.32版本的吧

/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.  If the caller
 * passed %SLAB_PANIC, a failure panics instead of returning NULL.
 * Cannot be called from interrupt context, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * A NULL @name, a call from interrupt context, or a @size outside
 * [BYTES_PER_WORD, KMALLOC_MAX_SIZE] is treated as a usage bug and hits BUG().
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 * (The pointer itself is stored in the cache; the string is not copied.)
 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
 * therefore applications must manage it themselves.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
    unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL, *pc; 
    gfp_t gfp; 

    /*
     * Sanity checks... these are all serious usage bugs.
     */
    if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
        size > KMALLOC_MAX_SIZE) {//reject a NULL name, interrupt context and out-of-range sizes; the code below takes a mutex and may allocate with GFP_KERNEL, so it must not run in interrupt context
        printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
                name);
        BUG();
    }

    /*
     * We use cache_chain_mutex to ensure a consistent view of
     * cpu_online_mask as well.  Please see cpuup_callback
     */
    if (slab_is_available()) {//once the slab allocator is up, take the lock; NOTE(review): presumably early boot runs on a single CPU, which is why no lock is taken before that - confirm
        get_online_cpus();
        mutex_lock(&cache_chain_mutex);
    }

    list_for_each_entry(pc, &cache_chain, next) {//walk every existing cache; each cache is linked on the global cache_chain (see list_add() at the end of this function)
        char tmp;
        int res;

        /*
         * This happens when the module gets unloaded and doesn't
         * destroy its slab cache and no-one else reuses the vmalloc
         * area of the module.  Print a warning.
         */
        res = probe_kernel_address(pc->name, tmp);//probe whether this cache's name pointer can still be read
        if (res) {
            printk(KERN_ERR
                   "SLAB: cache with size %d has lost its name\n",
                   pc->buffer_size);//log the problem and skip this entry
            continue;
        }

        if (!strcmp(pc->name, name)) {//the requested name is already used by a cache on the list: fail
            printk(KERN_ERR
                   "kmem_cache_create: duplicate cache %s\n", name);
            dump_stack();
            goto oops;
        }
    }

#if DEBUG
    WARN_ON(strchr(name, ' ')); /* It confuses parsers */
#if FORCED_DEBUG
    /*
     * Enable redzoning and last user accounting, except for caches with
     * large objects, if the increased size would increase the object size
     * above the next power of two: caches with object sizes just above a
     * power of two have a significant amount of internal fragmentation.
     */
    if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
                        2 * sizeof(unsigned long long)))
        flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
    if (!(flags & SLAB_DESTROY_BY_RCU))
        flags |= SLAB_POISON;
#endif
    if (flags & SLAB_DESTROY_BY_RCU)
        BUG_ON(flags & SLAB_POISON);
#endif
    /*
     * Always checks flags, a caller might be expecting debug support which
     * isn't available.
     */
    BUG_ON(flags & ~CREATE_MASK);

    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     *///rounds size up to a multiple of BYTES_PER_WORD; the if-guard is not strictly needed, because the add-and-mask alone leaves an already aligned size unchanged
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    /* calculate the final buffer alignment: */

    /* 1) arch recommendation: can be overridden for debug */
    if (flags & SLAB_HWCACHE_ALIGN) {//hardware cache line alignment was requested
        /*
         * Default alignment: as specified by the arch code.  Except if
         * an object is really small, then squeeze multiple objects into
         * one cacheline.
         */
        ralign = cache_line_size();//alignment value supplied by the architecture code
        while (size <= ralign / 2)//small object: keep halving the alignment so that several objects share one cache line
            ralign /= 2;
    } else {
        ralign = BYTES_PER_WORD;//default: word alignment
    }

    /*
     * Redzoning and user store require word alignment or possibly larger.
     * Note this will be overridden by architecture or caller mandated
     * alignment if either is greater than BYTES_PER_WORD.
     */
    if (flags & SLAB_STORE_USER)
        ralign = BYTES_PER_WORD;

    if (flags & SLAB_RED_ZONE) {
        ralign = REDZONE_ALIGN;
        /* If redzoning, ensure that the second redzone is suitably
         * aligned, by adjusting the object size accordingly. */
        size += REDZONE_ALIGN - 1;
        size &= ~(REDZONE_ALIGN - 1);
    }//the two checks above only take effect for the debug flags SLAB_STORE_USER / SLAB_RED_ZONE

    /* 2) arch mandated alignment */
    if (ralign < ARCH_SLAB_MINALIGN) {
        ralign = ARCH_SLAB_MINALIGN;
    }
    /* 3) caller mandated alignment */
    if (ralign < align) {//the caller asked for a stricter alignment than computed so far: honour it
        ralign = align;
    }
    /* disable debug if necessary */
    if (ralign > __alignof__(unsigned long long))
        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
    /*
     * 4) Store it.
     */
    align = ralign;

    if (slab_is_available())//slab allocator is up: the allocation below uses GFP_KERNEL
        gfp = GFP_KERNEL;
    else//still in early initialisation: use GFP_NOWAIT so the allocation does not wait
        gfp = GFP_NOWAIT;

    /* Get cache's description obj. *///the struct kmem_cache descriptor itself is allocated from cache_cache
    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        goto oops;

#if DEBUG
    cachep->obj_size = size;

    /*
     * Both debugging options require word-alignment which is calculated
     * into align above.
     */
    if (flags & SLAB_RED_ZONE) {
        /* add space for red zone words */
        cachep->obj_offset += sizeof(unsigned long long);
        size += 2 * sizeof(unsigned long long);
    }
    if (flags & SLAB_STORE_USER) {
        /* user store requires one word storage behind the end of
         * the real object. But if the second red zone needs to be
         * aligned to 64 bits, we must allow that much space.
         */
        if (flags & SLAB_RED_ZONE)
            size += REDZONE_ALIGN;
        else
            size += BYTES_PER_WORD;
    }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
    if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
        && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
        cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
        size = PAGE_SIZE;
    }
#endif
#endif

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on.)
     *///from here on: decide whether the slab management structure lives inside the slab or outside of it
    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)//objects of at least PAGE_SIZE/8 bytes (512 with 4 KiB pages) go off-slab, but never while slab_early_init is set
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;//mark the management structure as off-slab

    size = ALIGN(size, align);//round the object size up to the final alignment

 //pick the slab page order and object count, and get the leftover bytes (see calculate_slab_order())

    left_over = calculate_slab_order(cachep, size, align, flags);

    if (!cachep->num) {//no object count was stored, i.e. no acceptable order was found: fail
        printk(KERN_ERR
               "kmem_cache_create: couldn't create cache %s.\n", name);
        kmem_cache_free(&cache_cache, cachep);
        cachep = NULL;
        goto oops;
    }
    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
              + sizeof(struct slab), align);//aligned size of the management area: struct slab plus one kmem_bufctl_t per object

    /*
     * If the slab has been placed off-slab, and we have enough space then
     * move it on-slab. This is at the expense of any extra colouring.
     *///use the leftover space if it is big enough to hold the management area
    if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {//leftover can hold struct slab plus the kmem_bufctl_t array
        flags &= ~CFLGS_OFF_SLAB;//management becomes on-slab after all
        left_over -= slab_size;//the leftover shrinks by the space just claimed
    }

    if (flags & CFLGS_OFF_SLAB) {
        /* really off slab. No need for manual alignment */
        slab_size =
            cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);//still off-slab: use the unaligned size of the management area

#ifdef CONFIG_PAGE_POISONING
        /* If we're going to use the generic kernel_map_pages()
         * poisoning, then it's going to smash the contents of
         * the redzone and userword anyhow, so switch them off.
         */
        if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
            flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
    }
    //the colour offset starts out as the cache line size
    cachep->colour_off = cache_line_size();
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < align)//raise it to the object alignment if that is larger
        cachep->colour_off = align;
    cachep->colour = left_over / cachep->colour_off;
    cachep->slab_size = slab_size;
    cachep->flags = flags;
    cachep->gfpflags = 0;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->gfpflags |= GFP_DMA;
    cachep->buffer_size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);

    if (flags & CFLGS_OFF_SLAB) {
        cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
        /*
         * This is a possibility for one of the malloc_sizes caches.
         * But since we go off slab only for object size greater than
         * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
         * this should not happen at all.
         * But leave a BUG_ON for some lucky dude.
         */
        BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
    }
    cachep->ctor = ctor;
    cachep->name = name;

    if (setup_cpu_cache(cachep, gfp)) {
        __kmem_cache_destroy(cachep);
        cachep = NULL;
        goto oops;
    }

    /* cache setup completed, link it into the list */
    list_add(&cachep->next, &cache_chain);
oops:
    /* Common exit: reached on success as well as on every failure path. */
    if (!cachep && (flags & SLAB_PANIC))
        panic("kmem_cache_create(): failed to create slab `%s'\n",
              name);
    if (slab_is_available()) {
        mutex_unlock(&cache_chain_mutex);
        put_online_cpus();
    }
    return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);

碎片計算函數分析

    left_over = calculate_slab_order(cachep, size, align, flags);

/**
 * calculate_slab_order - choose the page order used for each slab
 * @cachep: cache being created; receives the chosen ->num and ->gfporder
 * @size: size of one object in this cache
 * @align: required alignment for the objects
 * @flags: slab allocation flags
 *
 * Tries page orders 0..KMALLOC_MAX_ORDER in ascending order and asks
 * cache_estimate() how many objects fit and how many bytes are left over.
 * Each acceptable order is recorded in @cachep; the search stops at the
 * first one that is considered good enough.
 *
 * Returns the left-over bytes of the order that was recorded last.  If no
 * order was recorded, @cachep is not modified and 0 is returned.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    size_t wasted = 0;
    int order = 0;

    while (order <= KMALLOC_MAX_ORDER) {
        unsigned int objs;
        size_t slack;

        cache_estimate(order, size, align, flags, &slack, &objs);

        /* Zero objects at this order: just move on to the next order. */
        if (objs) {
            if (flags & CFLGS_OFF_SLAB) {
                /*
                 * Upper bound on objects per slab for off-slab
                 * management: the number of kmem_bufctl_t entries
                 * that fit into @size bytes next to a struct slab.
                 * Needed to avoid a possible looping condition in
                 * cache_grow().
                 */
                unsigned long max_objs;

                max_objs = size - sizeof(struct slab);
                max_objs /= sizeof(kmem_bufctl_t);

                /* Too many objects: keep the previously saved order. */
                if (objs > max_objs)
                    break;
            }

            /* Acceptable - remember this order. */
            cachep->num = objs;
            cachep->gfporder = order;
            wasted = slack;

            /*
             * Stop searching when any of these holds:
             *  - the cache is SLAB_RECLAIM_ACCOUNT (a VFS-reclaimable
             *    slab tends to allocate with GFP_NOFS, and we do not
             *    want higher-order pages when dcache cannot shrink);
             *  - the order reached slab_break_gfp_order (very large
             *    slabs are currently bad for the gfp()s);
             *  - the left-over space is at most 1/8 of the slab.
             */
            if ((flags & SLAB_RECLAIM_ACCOUNT) ||
                order >= slab_break_gfp_order ||
                wasted * 8 <= (PAGE_SIZE << order))
                break;
        }

        order++;
    }

    return wasted;
}

註釋已經說明了，計算給定的buffer size中有多少碎片

cache_estimate(gfporder, size, align, flags, &remainder, &num);

/*
 * Work out how many objects fit into a slab of 2^gfporder pages and how
 * many bytes remain unused.
 *
 * @gfporder:    page order of the slab
 * @buffer_size: size of one object
 * @align:       alignment used for the on-slab management area
 * @flags:       cache flags; only CFLGS_OFF_SLAB is examined here
 * @left_over:   out: bytes not used by objects or management data
 * @num:         out: number of objects per slab (capped at SLAB_LIMIT)
 *
 * The slab management structure can be either off the slab or on it.
 * When it is on the slab, the slab memory holds:
 *
 * - the struct slab
 * - one kmem_bufctl_t for each object
 * - padding to respect alignment of @align
 * - @buffer_size bytes for each object
 *
 * When it is off the slab, the alignment is already part of
 * @buffer_size, and because slabs are page aligned the objects end up
 * correctly aligned, so no management bytes are charged here.
 */
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
               size_t align, int flags, size_t *left_over,
               unsigned int *num)
{
    size_t total = PAGE_SIZE << gfporder;
    size_t overhead = 0;
    int count;

    if (flags & CFLGS_OFF_SLAB) {
        /* Nothing but objects lives in the slab. */
        count = total / buffer_size;
    } else {
        /*
         * First guess ignores the alignment padding.  The padding is
         * at most @align-1 bytes and @buffer_size is at least @align,
         * so the guess is either exact or one object too many.
         */
        count = (total - sizeof(struct slab)) /
            (buffer_size + sizeof(kmem_bufctl_t));

        /* Re-check with the aligned management size; drop one if needed. */
        if (slab_mgmt_size(count, align) + count*buffer_size > total)
            count--;
    }

    /* Both layouts are limited to SLAB_LIMIT objects per slab. */
    if (count > SLAB_LIMIT)
        count = SLAB_LIMIT;

    /* Only the on-slab layout spends slab bytes on management data. */
    if (!(flags & CFLGS_OFF_SLAB))
        overhead = slab_mgmt_size(count, align);

    *num = count;
    *left_over = total - count*buffer_size - overhead;
}

slab着色問題理解：http://blog.csdn.net/zqy2000zqy/article/details/1137895

linux內存管理--slab

linux panic分析

linux文件系統之讀流程 SYSCALL_DEFINE3(read, xxx)

linux內存管理--slab

linux內存管理--vmalloc

recovery: ASCII cpio archive (SVR4 with no CRC)——cpio格式的recovery解壓縮和重新打包的方

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結