這個kmem_cache_create()函數是一個和cpu結構有關係的函數,所在在公用函數中找不到(3.10.98內核版本中),我選擇的是 arch/x86/kernel/
說明下參數:
const char *name :slab的名稱
size_t size :每個對象的大小
size_t align :每個對象的對齊
unsigned long flags :對象不夠,要申請內存時的標識
void (*ctor)(void *):構造函數
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}
看了下 3.10.98的kmem_cache_create()發現有很大的出入,還是先看看2.6.32版本的吧
/**
* kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
* @flags: SLAB flags
* @ctor: A constructor for the objects.
*
* Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a int, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
* Note that kmem_cache_name() is not guaranteed to return the same pointer,
* therefore applications must manage it themselves.
*
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
* %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
struct kmem_cache *cachep = NULL, *pc;
gfp_t gfp;
/*
* Sanity checks... these are all serious usage bugs.
*/
if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
size > KMALLOC_MAX_SIZE) {//常規檢查,因爲需要爲name分配內存,在/proc/slabinfo顯示,會睡眠,所以不能在中斷上下文
printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
name);
BUG();
}
/*
* We use cache_chain_mutex to ensure a consistent view of
* cpu_online_mask as well. Please see cpuup_callback
*/
if (slab_is_available()) {//如果slab已經有效,就需要上鎖。在前期初始化時,只有一個cpu在初始化slab,可以不加鎖
get_online_cpus();
mutex_lock(&cache_chain_mutex);
}
list_for_each_entry(pc, &cache_chain, next) {//檢查cache_chain上的所有slab,所有slab都會掛在全局變量cache_chain上
char tmp;
int res;
/*
* This happens when the module gets unloaded and doesn't
* destroy its slab cache and no-one else reuses the vmalloc
* area of the module. Print a warning.
*/
res = probe_kernel_address(pc->name, tmp);//檢查是否所有slab都有名字<span style="white-space:pre"> </span>
if (res) {
printk(KERN_ERR
"SLAB: cache with size %d has lost its name\n",
pc->buffer_size);//報錯
continue;
}
if (!strcmp(pc->name, name)) {//檢查下你起得名字是否已經在鏈表中了
printk(KERN_ERR
"kmem_cache_create: duplicate cache %s\n", name);
dump_stack();
goto oops;
}
}
#if DEBUG
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
#if FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
* large objects, if the increased size would increase the object size
* above the next power of two: caches with object sizes just above a
* power of two have a significant amount of internal fragmentation.
*/
if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2 * sizeof(unsigned long long)))
flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
/*
* Always checks flags, a caller might be expecting debug support which
* isn't available.
*/
BUG_ON(flags & ~CREATE_MASK);
/*
* Check that size is in terms of words. This is needed to avoid
* unaligned accesses for some archs when redzoning is used, and makes
* sure any on-slab bufctl's are also correctly aligned.
*///字對齊,爲什麼不直接:size = (size + (BYTES_PER_WORD - 1)) & (~(BYTE_PER_WORD - 1))
if (size & (BYTES_PER_WORD - 1)) {
size += (BYTES_PER_WORD - 1);
size &= ~(BYTES_PER_WORD - 1);
}
/* calculate the final buffer alignment: */
/* 1) arch recommendation: can be overridden for debug */
if (flags & SLAB_HWCACHE_ALIGN) {//高速緩衝行對齊
/*
* Default alignment: as specified by the arch code. Except if
* an object is really small, then squeeze multiple objects into
* one cacheline.
*/
ralign = cache_line_size();//有體系結構提供的函數,對齊值
while (size <= ralign / 2)//對象比較小,則可以多幾個對象填充到緩衝行
ralign /= 2;
} else {
ralign = BYTES_PER_WORD;//默認是字對齊
}
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
* alignment if either is greater than BYTES_PER_WORD.
*/
if (flags & SLAB_STORE_USER)
ralign = BYTES_PER_WORD;
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
* aligned, by adjusting the object size accordingly. */
size += REDZONE_ALIGN - 1;
size &= ~(REDZONE_ALIGN - 1);
}//上面的都是debug
/* 2) arch mandated alignment */
if (ralign < ARCH_SLAB_MINALIGN) {
ralign = ARCH_SLAB_MINALIGN;
}
/* 3) caller mandated alignment */
if (ralign < align) {//體系結構中規定的最小對齊值
ralign = align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
*/
align = ralign;
if (slab_is_available())//如果slab生效了,則可以休眠
gfp = GFP_KERNEL;
else//前期初始化,則不能睡眠
gfp = GFP_NOWAIT;
/* Get cache's description obj. *///從cache_cache 的slab上分配一個cachep,cache_cache的slab就是爲slab分配結構體的
cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
goto oops;
#if DEBUG
cachep->obj_size = size;
/*
* Both debugging options require word-alignment which is calculated
* into align above.
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
cachep->obj_offset += sizeof(unsigned long long);
size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
* the real object. But if the second red zone needs to be
* aligned to 64 bits, we must allow that much space.
*/
if (flags & SLAB_RED_ZONE)
size += REDZONE_ALIGN;
else
size += BYTES_PER_WORD;
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
&& cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
#endif
/*
* Determine if the slab management is 'on' or 'off' slab.
* (bootstrapping cannot cope with offslab caches so don't do
* it too early on.)
*///開始處理slab的頭部結構體了,是存儲在slab上還是在slab外面的其他地方??
if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)//對象比較大(大於512)則外置,從這裏可以看出初始化時是內置的
/*
* Size is large, assume best to place the slab management obj
* off-slab (should allow better packing of objs).
*/
flags |= CFLGS_OFF_SLAB;//表示slab結構體外置
size = ALIGN(size, align);//對齊size
//計算碎片,具體實現看後面的函數分析
left_over = calculate_slab_order(cachep, size, align, flags);
if (!cachep->num) {//空對象,錯誤
printk(KERN_ERR
"kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
cachep = NULL;
goto oops;
}
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), align);//slab頭結構的大小
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*///充分利用碎片,如果可以的話,把slab頭放到slab上
if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {//如果碎片大小大於slab頭結構(包括kmem_bufctl_t)
flags &= ~CFLGS_OFF_SLAB;//變成內置的了
left_over -= slab_size;//改變碎片大小
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
slab_size =
cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);//如果對齊了在slab中還是放不下,那就外置,不需要對齊了
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
* poisoning, then it's going to smash the contents of
* the redzone and userword anyhow, so switch them off.
*/
if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
}
//L1的緩衝行長度
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
if (cachep->colour_off < align)//必須對齊
cachep->colour_off = align;
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
cachep->gfpflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->gfpflags |= GFP_DMA;
cachep->buffer_size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
* PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
cachep->ctor = ctor;
cachep->name = name;
if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
cachep = NULL;
goto oops;
}
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
if (slab_is_available()) {
mutex_unlock(&cache_chain_mutex);
put_online_cpus();
}
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
碎片計算函數分析
left_over = calculate_slab_order(cachep, size, align, flags);
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
* @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
*
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {//0~10
unsigned int num;
size_t remainder;
cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)//對象太大,2^gfporder個內存頁不夠一個對象,所以返回 NULL
continue;
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*///這個網上有很多解釋的,這裏就說說自己看法,就是用一個對象去測試下kmem_bufctl_t看看該數組的大小
offslab_limit = size - sizeof(struct slab);
offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)//對象數目不能太多
break;
}
/* Found something acceptable - save it away */
cachep->num = num;//給各種成員賦值
cachep->gfporder = gfporder;
left_over = remainder;
/*
* A VFS-reclaimable slab tends to have most allocations
* as GFP_NOFS and we really don't want to have to be allocating
* higher-order pages when we are unable to shrink dcache.
*/
if (flags & SLAB_RECLAIM_ACCOUNT)//如果分配的是可以回收的頁面,則不需要做下面的檢查了,大不了被回收
break;
/*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
if (gfporder >= slab_break_gfp_order)//達到最大的order
break;
/*
* Acceptable internal fragmentation?
*///浪費的空間小於1/8的(page << gfporder),退出
if (left_over * 8 <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
}
註釋已經說明了,計算給定的buffer size中有多少碎片
cache_estimate(gfporder, size, align, flags, &remainder, &num);
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
static void cache_estimate(unsigned long gfporder, size_t buffer_size,
size_t align, int flags, size_t *left_over,
unsigned int *num)
{
int nr_objs;
size_t mgmt_size;
size_t slab_size = PAGE_SIZE << gfporder;//分配的內存頁
/*
* The slab management structure can be either off the slab or
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
* - The struct slab
* - One kmem_bufctl_t for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
if (flags & CFLGS_OFF_SLAB) {//slab結構體外掛,這就比較簡單
mgmt_size = 0;
nr_objs = slab_size / buffer_size;//直接整除每個對象的大小
if (nr_objs > SLAB_LIMIT)//對象的限制
nr_objs = SLAB_LIMIT;
} else {//slab結構體內置,會麻煩點
/*
* Ignore padding for the initial guess. The padding
* is at most @align-1 bytes, and @buffer_size is at
* least @align. In the worst case, this result will
* be one greater than the number of objects that fit
* into the memory allocation when taking the padding
* into account.
*///內置,struct slab只有一個,而kmem_bufctl_t 就和對象一樣多了,因爲kmem_bufctl_t 就是用來查看對象是否空閒的
nr_objs = (slab_size - sizeof(struct slab)) /
(buffer_size + sizeof(kmem_bufctl_t));
/*
* This calculated number will be either the right
* amount, or one greater than what we want.
*/
if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
> slab_size)//上面是沒有對齊的比較,這裏對齊後比較下是否越界,越界了就少一個對象
nr_objs--;
if (nr_objs > SLAB_LIMIT)
nr_objs = SLAB_LIMIT;
mgmt_size = slab_mgmt_size(nr_objs, align);//這是對齊後的,struct slab + nr_objs * sizeof(kmem_bufctl_t)的值
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;//所有的大小 - 所有對齊的對象大小 - 對齊對象的結構體和其他值
}
slab着色問題理解:http://blog.csdn.net/zqy2000zqy/article/details/1137895