The cache implementation in the Linux kernel

Copyleft of this document belongs to yfydz. It is released under the GPL and may be freely copied and reposted; please keep the document intact when reposting. Any commercial use is strictly forbidden.
msn: [email protected]
Source: http://yfydz.cublog.cn

1. Introduction

kmem_cache is the fast memory-buffer interface provided by the Linux kernel. The blocks it manages must all be the same size, because a block released through the interface is not actually freed but kept cached, so the next allocation request can reuse it directly, saving the usual initialization and teardown work on each block. Allocation is therefore very fast, and the interface is typically used where large numbers of equal-sized blocks are needed: inodes, skbuff heads, netfilter connections and so on. In fact kmalloc also allocates from kmem_cache, and the current cache statistics can be read directly from /proc/slabinfo.
The kernel code below is from version 2.6.19.2, mainly from mm/slab.c. The basic principles are much the same in 2.4 and 2.6, but the concrete implementation has changed considerably.
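
For reference, each line of /proc/slabinfo describes one cache; in the 2.6 format the columns are roughly as below (the ip_conntrack figures are purely illustrative):

 # name           <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables ...
 ip_conntrack          42           60        304        13             1        : tunables ...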

2. slab and page

Before introducing kmem_cache, the terms page and slab need to be introduced. As is well known, the page is the kernel's basic unit of memory management, and its size is fixed: 4K on x86 machines. A slab is the concrete form that a kmem_cache's memory takes: depending on the size of the cache's objects, each slab spans from 1 page up to at most 32 (128/4) pages. If a cache object is smaller than one page, the slab holds multiple objects so as to use the space as fully as possible.
struct slab {
// list linkage
 struct list_head list;
// offset of the unused (colouring) space
 unsigned long colouroff;
// address of the actual memory buffer
 void *s_mem;  /* including colour offset */
// number of objects in use in this slab
 unsigned int inuse; /* num of objs active in slab */
// index of the first free object
 kmem_bufctl_t free;
 unsigned short nodeid;
};
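
Right behind this descriptor sits an array of kmem_bufctl_t entries, one per object, which chains the slab's free objects together; s_mem points to the first object. The two helpers below show how the kernel gets from the descriptor to these areas (they closely follow mm/slab.c in 2.6.19, but take them as an illustrative sketch):

/* The bufctl array starts immediately after the slab descriptor. */
static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
	return (kmem_bufctl_t *) (slabp + 1);
}

/* Object idx lives at s_mem + idx * buffer_size. */
static inline void *index_to_obj(struct kmem_cache *cache,
				 struct slab *slab, unsigned int idx)
{
	return slab->s_mem + cache->buffer_size * idx;
}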
 
3. Data structures

The kmem_cache structure is not defined in any .h header; the header only carries a type declaration, because code using kmem_cache never needs to know its internals: the interface functions fully encapsulate the structure's contents. This is a common way of doing OO programming in C.

/* include/linux/slab.h */
// here it is only a type declaration
typedef struct kmem_cache kmem_cache_t;

/* mm/slab.c */
// the full definition lives in the C file

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
// per-CPU cache data
struct array_cache {
 unsigned int avail;
 unsigned int limit;
 unsigned int batchcount;
 unsigned int touched;
 spinlock_t lock;
 void *entry[0]; /*
    * Must have this definition in here for the proper
    * alignment of array_cache. Also simplifies accessing
    * the entries.
    * [0] is for gcc 2.95. It should really be [].
    */
};
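
entry[] is used as a LIFO stack of object pointers and avail counts how many are stacked. A minimal sketch of the two fast paths this enables (the real versions appear in ____cache_alloc() and __cache_free() below; the names fast_alloc/fast_free are mine, and interrupts are assumed disabled):

/* Sketch only: pop the most recently freed (cache-warm) object. */
void *fast_alloc(struct array_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];
	return NULL; /* fall back to cache_alloc_refill() */
}

/* Sketch only: push a freed object back, no list or lock work. */
void fast_free(struct array_cache *ac, void *objp)
{
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = objp;
	/* else cache_flusharray() must drain a batch first */
}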

/*
 * The slab lists for all objects.
 */
// the lists of slabs managed by a cache (one kmem_list3 per node)
struct kmem_list3 {
// slabs on this list hold both in-use objects and free objects
 struct list_head slabs_partial; /* partial list first, better asm code */
// all objects in the slabs on this list are in use
 struct list_head slabs_full;
// all objects in the slabs on this list are free
 struct list_head slabs_free;
// number of free objects
 unsigned long free_objects;
// free-object limit; above it, some should be released
 unsigned int free_limit;
 unsigned int colour_next; /* Per-node cache coloring */
 spinlock_t list_lock;
 struct array_cache *shared; /* shared per node */
 struct array_cache **alien; /* on other nodes */
 unsigned long next_reap; /* updated without locking */
 int free_touched;  /* updated without locking */
};
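
A slab migrates between these three lists as its inuse count changes. The rule, pulled out into an illustrative helper (the name refile_slab is hypothetical; the kernel does this inline in cache_alloc_refill() and free_block()):

/* Sketch: re-file a slab after inuse changed; list_lock must be held. */
static void refile_slab(struct kmem_list3 *l3, struct slab *slabp,
			unsigned int objs_per_slab)
{
	list_del(&slabp->list);
	if (slabp->inuse == objs_per_slab)
		list_add(&slabp->list, &l3->slabs_full);
	else if (slabp->inuse == 0)
		list_add(&slabp->list, &l3->slabs_free);
	else
		list_add(&slabp->list, &l3->slabs_partial);
}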

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
// per-CPU cache arrays
 struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
// number of objects handed to a CPU in one batch when it has none free
 unsigned int batchcount;
// number of free objects allowed in the per-CPU pool before half of them are returned to the global pool
 unsigned int limit;
 unsigned int shared;
// the (aligned) size of each object
 unsigned int buffer_size;
/* 3) touched by every alloc & free from the backend */
// MAX_NUMNODES per-node cache lists; MAX_NUMNODES is fixed when the kernel is compiled
 struct kmem_list3 *nodelists[MAX_NUMNODES];
 unsigned int flags;  /* constant flags */
// number of objects per slab
 unsigned int num;  /* # of objs per slab */
/* 4) cache_grow/shrink */
 /* order of pgs per slab (2^n) */
// the page order of a slab: each slab spans 2^gfporder pages; 0 means a single
// 4K page (objects smaller than 4K), while a 32K slab has gfporder 3 (8 pages)
 unsigned int gfporder;
 /* force GFP flags, e.g. GFP_DMA */
 gfp_t gfpflags;
 size_t colour;   /* cache colouring range */
 unsigned int colour_off; /* colour offset */
 struct kmem_cache *slabp_cache;
 unsigned int slab_size;
 unsigned int dflags;  /* dynamic flags */
 /* constructor func */
// object constructor
 void (*ctor) (void *, struct kmem_cache *, unsigned long);
 /* de-constructor func */
// object destructor
 void (*dtor) (void *, struct kmem_cache *, unsigned long);
/* 5) cache creation/removal */
// name of the cache
 const char *name;
// next entry in the cache chain
 struct list_head next;
/* 6) statistics */
#if STATS
 unsigned long num_active;
 unsigned long num_allocations;
 unsigned long high_mark;
 unsigned long grown;
 unsigned long reaped;
 unsigned long errors;
 unsigned long max_freeable;
 unsigned long node_allocs;
 unsigned long node_frees;
 unsigned long node_overflow;
 atomic_t allochit;
 atomic_t allocmiss;
 atomic_t freehit;
 atomic_t freemiss;
#endif
#if DEBUG
 /*
  * If debugging is enabled, then the allocator can add additional
  * fields and/or padding to every object. buffer_size contains the total
  * object size including these internal fields, the following two
  * variables contain the offset to the user object and its size.
  */
 int obj_offset;
 int obj_size;
#endif
};

The kernel's chain of caches is itself managed through a cache, so a static cache structure is defined to act as the head of this cache chain:
/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
 .batchcount = 1,
 .limit = BOOT_CPUCACHE_ENTRIES,
 .shared = 1,
 .buffer_size = sizeof(struct kmem_cache),
 .name = "kmem_cache",
#if DEBUG
 .obj_size = sizeof(struct kmem_cache),
#endif
};
/proc/slabinfo is exactly the basic information of this cache chain.

The relationship between cache, slab and page can be pictured roughly as follows:

    cache <-------------------> cache <--------------------->cache
                                  |
                                  V
                              kmem_list3
                                  |
               +--------------------------------------+
               |                  |                   |
               V                  V                   V
           slabs_full        slabs_partial        slabs_free
               |                  |                   |
               V                  V                   V
             slab               slab                 slab
               |                  |                   |
               V                  V                   V
             page               page                 page
               |                  |                   |
       +-------------+     +--------------+     +-------------+
       |             |     |              |     |             |
       V             V     V              V     V             V
    object  ...  object   object  ...  object object  ...  object
 
4. Operation functions

4.1 Basic usage

To use kmem_cache, first create the cache with the kmem_cache_create function, e.g.:
static kmem_cache_t *ip_conntrack_cachep __read_mostly;
 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
                                         sizeof(struct ip_conntrack), 0,
                                         0, NULL, NULL);
Allocate object space with the kmem_cache_alloc function, e.g.:
 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);

Free an object with the kmem_cache_free function, e.g.:
 kmem_cache_free(ip_conntrack_cachep, conntrack);

When the module exits, destroy the cache with the kmem_cache_destroy function, e.g.:
 kmem_cache_destroy(ip_conntrack_cachep);
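
Putting the four calls together, a minimal module skeleton might look like the sketch below (the names my_obj, my_cachep and my_obj_cache are illustrative, not from any real module):

#include <linux/module.h>
#include <linux/slab.h>

struct my_obj {
	int id;
	char data[60];
};

static kmem_cache_t *my_cachep;

static int __init my_init(void)
{
	/* 2.6.19 signature: name, size, align, flags, ctor, dtor */
	my_cachep = kmem_cache_create("my_obj_cache",
				      sizeof(struct my_obj), 0, 0,
				      NULL, NULL);
	if (!my_cachep)
		return -ENOMEM;
	return 0;
}

static void __exit my_exit(void)
{
	kmem_cache_destroy(my_cachep);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");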
 
4.2 Creating a cache: kmem_cache_create

This function creates the kmem_cache structure. The cache's name and the size of each object must be supplied; the other parameters may be 0 or NULL.
The heart of the function is deriving a suitable, properly aligned slab size from the requested object size.
/* mm/slab.c */
// name is the name of the cache
// size is the size of an object in the cache; normally the other parameters can be 0 or NULL
// align: size is to be aligned to align
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
 unsigned long flags,
 void (*ctor)(void*, struct kmem_cache *, unsigned long),
 void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
 size_t left_over, slab_size, ralign;
 struct kmem_cache *cachep = NULL, *pc;
 /*
  * Sanity checks... these are all serious usage bugs.
  */
// the name must not be empty, allocation must not happen in interrupt context,
// an object must be neither too large nor too small, and a dtor without a ctor is not allowed
 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
     (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
  printk(KERN_ERR "%s: Early error in slab %s/n", __FUNCTION__,
    name);
  BUG();
 }
 /*
  * Prevent CPUs from coming and going.
  * lock_cpu_hotplug() nests outside cache_chain_mutex
  */
 lock_cpu_hotplug();
// lock the cache chain
 mutex_lock(&cache_chain_mutex);
// walk the cache chain; this is the global list
 list_for_each_entry(pc, &cache_chain, next) {
  mm_segment_t old_fs = get_fs();
  char tmp;
  int res;
  /*
   * This happens when the module gets unloaded and doesn't
   * destroy its slab cache and no-one else reuses the vmalloc
   * area of the module.  Print a warning.
   */
// check that the cache is still valid; a module may have been unloaded without destroying its cache
  set_fs(KERNEL_DS);
  res = __get_user(tmp, pc->name);
  set_fs(old_fs);
  if (res) {
   printk("SLAB: cache with size %d has lost its name/n",
          pc->buffer_size);
   continue;
  }
// a cache with this name already exists: error out
  if (!strcmp(pc->name, name)) {
   printk("kmem_cache_create: duplicate cache %s/n", name);
   dump_stack();
   goto oops;
  }
 }
// the DEBUG code can be skipped
#if DEBUG
 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
  /* No constructor, but inital state check requested */
  printk(KERN_ERR "%s: No con, but init state check "
         "requested - %s/n", __FUNCTION__, name);
  flags &= ~SLAB_DEBUG_INITIAL;
 }
#if FORCED_DEBUG
 /*
  * Enable redzoning and last user accounting, except for caches with
  * large objects, if the increased size would increase the object size
  * above the next power of two: caches with object sizes just above a
  * power of two have a significant amount of internal fragmentation.
  */
 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
  flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
 if (!(flags & SLAB_DESTROY_BY_RCU))
  flags |= SLAB_POISON;
#endif
 if (flags & SLAB_DESTROY_BY_RCU)
  BUG_ON(flags & SLAB_POISON);
#endif
 if (flags & SLAB_DESTROY_BY_RCU)
  BUG_ON(dtor);
 /*
  * Always checks flags, a caller might be expecting debug support which
  * isn't available.
  */
 BUG_ON(flags & ~CREATE_MASK);
 /*
  * Check that size is in terms of words.  This is needed to avoid
  * unaligned accesses for some archs when redzoning is used, and makes
  * sure any on-slab bufctl's are also correctly aligned.
  */
// first round the object size up to BYTES_PER_WORD: 4-byte alignment on 32-bit machines
 if (size & (BYTES_PER_WORD - 1)) {
  size += (BYTES_PER_WORD - 1);
  size &= ~(BYTES_PER_WORD - 1);
 }
 /* calculate the final buffer alignment: */
// below, the actual alignment is computed from the flags
 /* 1) arch recommendation: can be overridden for debug */
 if (flags & SLAB_HWCACHE_ALIGN) {
// align to the hardware cache line; alignments are always powers of two
  /*
   * Default alignment: as specified by the arch code.  Except if
   * an object is really small, then squeeze multiple objects into
   * one cacheline.
   */
  ralign = cache_line_size();
  while (size <= ralign / 2)
   ralign /= 2;
 } else {
  ralign = BYTES_PER_WORD;
 }
 /*
  * Redzoning and user store require word alignment. Note this will be
  * overridden by architecture or caller mandated alignment if either
  * is greater than BYTES_PER_WORD.
  */
 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
  ralign = BYTES_PER_WORD;
 /* 2) arch mandated alignment: disables debug if necessary */
 if (ralign < ARCH_SLAB_MINALIGN) {
  ralign = ARCH_SLAB_MINALIGN;
  if (ralign > BYTES_PER_WORD)
   flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 }
 /* 3) caller mandated alignment: disables debug if necessary */
 if (ralign < align) {
// if the alignment computed from the system is smaller than the requested one, use the parameter's value
  ralign = align;
  if (ralign > BYTES_PER_WORD)
   flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 }
 /*
  * 4) Store it.
  */
// the real alignment value
 align = ralign;
 /* Get cache's description obj. */
// allocate the memory for the cache structure itself and zero it; the SLAB_KERNEL flag means the operation may sleep
 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
 if (!cachep)
  goto oops;
#if DEBUG
 cachep->obj_size = size;
 /*
  * Both debugging options require word-alignment which is calculated
  * into align above.
  */
 if (flags & SLAB_RED_ZONE) {
  /* add space for red zone words */
  cachep->obj_offset += BYTES_PER_WORD;
  size += 2 * BYTES_PER_WORD;
 }
 if (flags & SLAB_STORE_USER) {
  /* user store requires one word storage behind the end of
   * the real object.
   */
  size += BYTES_PER_WORD;
 }
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
     && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
  cachep->obj_offset += PAGE_SIZE - size;
  size = PAGE_SIZE;
 }
#endif
#endif
 /*
  * Determine if the slab management is 'on' or 'off' slab.
  * (bootstrapping cannot cope with offslab caches so don't do
  * it too early on.)
  */
 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
// for fairly large objects, set the CFLGS_OFF_SLAB flag
// (PAGE_SIZE >> 3) is 512 on x86
  /*
   * Size is large, assume best to place the slab management obj
   * off-slab (should allow better packing of objs).
   */
  flags |= CFLGS_OFF_SLAB;
// re-align the object size to the computed alignment
 size = ALIGN(size, align);
// compute the page order of the slab for objects of this size; returns the leftover space per slab
 left_over = calculate_slab_order(cachep, size, align, flags);
 if (!cachep->num) {
// cachep->num爲每個slab中的對象數
// 爲0表示找不到合適的內存slab塊大小
  printk("kmem_cache_create: couldn't create cache %s./n", name);
  kmem_cache_free(&cache_cache, cachep);
  cachep = NULL;
  goto oops;
 }
// align the size of the slab management structure: the slab header (struct slab)
// plus cachep->num control entries; kmem_bufctl_t is really an unsigned integer:
// typedef unsigned int kmem_bufctl_t;
 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
     + sizeof(struct slab), align);
 /*
  * If the slab has been placed off-slab, and we have enough space then
  * move it on-slab. This is at the expense of any extra colouring.
  */
// OFF_SLAB was set, but the leftover space is larger than the management structure
 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
// move the slab management data into the leftover space
  flags &= ~CFLGS_OFF_SLAB;
  left_over -= slab_size;
 }
 if (flags & CFLGS_OFF_SLAB) {
  /* really off slab. No need for manual alignment */
  slab_size =
      cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
 }
// fill in the cache's basic information;
// colour_off is determined by the size of a hardware L1 cache line
 cachep->colour_off = cache_line_size();
 /* Offset must be a multiple of the alignment. */
 if (cachep->colour_off < align)
  cachep->colour_off = align;
// colour is the number of usable colour_off offsets within the leftover space,
// i.e. how many whole L1 cache lines fit into it
 cachep->colour = left_over / cachep->colour_off;
// size of the slab management part
 cachep->slab_size = slab_size;
 cachep->flags = flags;
 cachep->gfpflags = 0;
 if (flags & SLAB_CACHE_DMA)
  cachep->gfpflags |= GFP_DMA;
// actual size of an object's buffer
 cachep->buffer_size = size;
 if (flags & CFLGS_OFF_SLAB) {
  cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
  /*
   * This is a possibility for one of the malloc_sizes caches.
   * But since we go off slab only for object size greater than
   * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
   * this should not happen at all.
   * But leave a BUG_ON for some lucky dude.
   */
  BUG_ON(!cachep->slabp_cache);
 }
 cachep->ctor = ctor;
 cachep->dtor = dtor;
 cachep->name = name;
// set up each CPU's own cache data
 if (setup_cpu_cache(cachep)) {
  __kmem_cache_destroy(cachep);
  cachep = NULL;
  goto oops;
 }
 /* cache setup completed, link it into the list */
// link the newly built cache into the cache chain
 list_add(&cachep->next, &cache_chain);
oops:
 if (!cachep && (flags & SLAB_PANIC))
  panic("kmem_cache_create(): failed to create slab `%s'/n",
        name);
 mutex_unlock(&cache_chain_mutex);
 unlock_cpu_hotplug();
 return cachep;
}
// exported, so the function can be used from kernel modules
EXPORT_SYMBOL(kmem_cache_create);
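
calculate_slab_order(), called above but not listed, picks the smallest page order whose slab holds at least one object while keeping the waste acceptable. A condensed sketch of its core loop (the real 2.6.19 function also handles off-slab management and caps the object count; cache_estimate() computes how many objects of the given size fit into 2^gfporder pages):

/* Sketch: smallest order that fits >= 1 object and wastes <= 1/8 of the slab. */
static size_t calculate_slab_order_sketch(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned int gfporder;

	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
		unsigned int num;   /* objects that fit at this order */
		size_t remainder;   /* bytes left over in the slab */

		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		if (!num)
			continue;   /* object does not fit yet, try a bigger slab */

		cachep->num = num;
		cachep->gfporder = gfporder;

		/* accept the order once internal fragmentation is <= 1/8 */
		if (remainder * 8 <= (PAGE_SIZE << gfporder))
			return remainder;
	}
	return 0;
}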

 


4.3 Allocating an object: kmem_cache_(z)alloc()

There are two functions, kmem_cache_alloc() and kmem_cache_zalloc(); the latter only adds the step of zeroing the allocated object. Both return the allocated cache object:
/* mm/slab.c */
/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
 return __cache_alloc(cachep, flags, __builtin_return_address(0));
}
EXPORT_SYMBOL(kmem_cache_alloc);
/**
 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
 * @cache: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache and set the allocated memory to zero.
 * The flags are only relevant if the cache has no available objects.
 */
void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
{
 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
 if (ret)
  memset(ret, 0, obj_size(cache));
 return ret;
}
EXPORT_SYMBOL(kmem_cache_zalloc);
At their core, both functions call __cache_alloc() to do the allocation:
// the cache_alloc with two underscores
// NUMA_BUILD is 1 when the kernel is configured with CONFIG_NUMA, otherwise 0
static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
      gfp_t flags, void *caller)
{
 unsigned long save_flags;
 void *objp = NULL;
 cache_alloc_debugcheck_before(cachep, flags);
 local_irq_save(save_flags);
 if (unlikely(NUMA_BUILD &&
   current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
// this path is rarely taken
  objp = alternate_node_alloc(cachep, flags);
 if (!objp)
// allocation mostly goes through this function, the four-underscore ____cache_alloc
  objp = ____cache_alloc(cachep, flags);
 /*
  * We may just have run out of memory on the local node.
  * __cache_alloc_node() knows how to locate memory on other nodes
  */
  if (NUMA_BUILD && !objp)
   objp = __cache_alloc_node(cachep, flags, numa_node_id());
 local_irq_restore(save_flags);
// effectively objp = objp; does nothing unless DEBUG is enabled
 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
         caller);
 prefetchw(objp);
 return objp;
}

// the heart of it is this cache_alloc with four underscores
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
 void *objp;
 struct array_cache *ac;
 check_irq_off();
// the cache array of this CPU
 ac = cpu_cache_get(cachep);
 if (likely(ac->avail)) {
// the per-CPU array still holds objects: no refill needed, return a cached object
  STATS_INC_ALLOCHIT(cachep);
  ac->touched = 1;
  objp = ac->entry[--ac->avail];
 } else {
// otherwise refill with new objects
  STATS_INC_ALLOCMISS(cachep);
  objp = cache_alloc_refill(cachep, flags);
 }
 return objp;
}

// refill the per-CPU array with objects
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
 int batchcount;
 struct kmem_list3 *l3;
 struct array_cache *ac;
 int node;
// convert the CPU to its node id
 node = numa_node_id();
 check_irq_off();
// the cache array of this CPU
 ac = cpu_cache_get(cachep);
retry:
// the number allocated in one batch; allocating in batches avoids doing this work on every request
 batchcount = ac->batchcount;
 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
  /*
   * If there was little recent activity on this cache, then
   * perform only a partial refill.  Otherwise we could generate
   * refill bouncing.
   */
  batchcount = BATCHREFILL_LIMIT;
 }
// the list3 belonging to this CPU's node
 l3 = cachep->nodelists[node];
 BUG_ON(ac->avail > 0 || !l3);
 spin_lock(&l3->list_lock);
 /* See if we can refill from the shared array */
// space may be obtained from the shared array
 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
  goto alloc_done;
// batch loop
 while (batchcount > 0) {
  struct list_head *entry;
  struct slab *slabp;
  /* Get slab alloc is to come from. */
// take an element from the list of partially used slabs
  entry = l3->slabs_partial.next;
  if (entry == &l3->slabs_partial) {
// reached the list head: the partially used slabs are exhausted,
// so space must come from the free slab list
   l3->free_touched = 1;
   entry = l3->slabs_free.next;
   if (entry == &l3->slabs_free)
// the free slab list is exhausted too: the whole cache must grow
    goto must_grow;
  }
// get the usable slab pointer
  slabp = list_entry(entry, struct slab, list);
  check_slabp(cachep, slabp);
  check_spinlock_acquired(cachep);
// extract a batch of available objects from this slab
  while (slabp->inuse < cachep->num && batchcount--) {
   STATS_INC_ALLOCED(cachep);
   STATS_INC_ACTIVE(cachep);
   STATS_SET_HIGH(cachep);
// avail records the number of objects actually handed out
   ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
           node);
  }
  check_slabp(cachep, slabp);
  /* move slabp to correct slabp list: */
// first unlink this slab from whichever list it is on
  list_del(&slabp->list);
// depending on whether the slab's objects are all used up, put it on the full or the partial list
  if (slabp->free == BUFCTL_END)
   list_add(&slabp->list, &l3->slabs_full);
  else
   list_add(&slabp->list, &l3->slabs_partial);
 }
must_grow:
// some objects have been handed out: decrease the free object count
 l3->free_objects -= ac->avail;
alloc_done:
 spin_unlock(&l3->list_lock);
 if (unlikely(!ac->avail)) {
// avail is 0: no allocatable objects left, the cache must grow
  int x;
// add memory to the cache: increase the number of slabs
  x = cache_grow(cachep, flags, node);
  /* cache_grow can reenable interrupts, then ac could change. */
  ac = cpu_cache_get(cachep);
  if (!x && ac->avail == 0) /* no objects in sight? abort */
   return NULL;
  if (!ac->avail)  /* objects refilled by interrupt? */
   goto retry;
 }
 ac->touched = 1;
// return an object pointer
 return ac->entry[--ac->avail];
}
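
slab_get_obj(), used in the loop above, pops the first free object off the slab's bufctl chain. A sketch close to the 2.6.19 source, with the debug checks stripped:

static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
			  int nodeid)
{
	/* slabp->free is the index of the next free object in this slab */
	void *objp = index_to_obj(cachep, slabp, slabp->free);
	kmem_bufctl_t next;

	slabp->inuse++;
	/* follow the free chain stored in the bufctl array */
	next = slab_bufctl(slabp)[slabp->free];
	slabp->free = next; /* becomes BUFCTL_END when the slab is full */
	return objp;
}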
 
4.4 Freeing an object: kmem_cache_free

This is not a true, complete release; it merely puts the object back into the cache's free pools:
/**
 * kmem_cache_free - Deallocate an object
 * @cachep: The cache the allocation was from.
 * @objp: The previously allocated object.
 *
 * Free an object which was previously allocated from this
 * cache.
 */
// really just a wrapper around __cache_free()
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
 unsigned long flags;
 BUG_ON(virt_to_cache(objp) != cachep);
 local_irq_save(flags);
 __cache_free(cachep, objp);
 local_irq_restore(flags);
}
EXPORT_SYMBOL(kmem_cache_free);

/*
 * Release an obj back to its cache. If the obj has a constructed state, it must
 * be in this state _before_ it is released.  Called with disabled ints.
 */
static inline void __cache_free(struct kmem_cache *cachep, void *objp)
{
 struct array_cache *ac = cpu_cache_get(cachep);
 check_irq_off();
 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 if (cache_free_alien(cachep, objp))
  return;
 if (likely(ac->avail < ac->limit)) {
// the free count is below the limit
  STATS_INC_FREEHIT(cachep);
// simply append the object being freed to the array of free objects;
// incrementing avail means one more object is available
  ac->entry[ac->avail++] = objp;
  return;
 } else {
// the free count has reached the limit
  STATS_INC_FREEMISS(cachep);
// release some objects first
  cache_flusharray(cachep, ac);
// then append the object being freed to the array of free objects;
// incrementing avail means one more object is available
  ac->entry[ac->avail++] = objp;
 }
}
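
cache_flusharray(), called above, drains a batch of objects from the per-CPU array, preferring the node-shared array and otherwise returning them to their slabs via free_block(). A condensed sketch (the _sketch suffix is mine; statistics and debug code are omitted):

static void cache_flusharray_sketch(struct kmem_cache *cachep,
				    struct array_cache *ac)
{
	int batchcount = ac->batchcount;
	struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];

	spin_lock(&l3->list_lock);
	if (l3->shared) {
		struct array_cache *shared = l3->shared;
		int max = shared->limit - shared->avail;
		if (max) {
			/* hand as many objects as fit to the shared array */
			if (batchcount > max)
				batchcount = max;
			memcpy(&shared->entry[shared->avail], ac->entry,
			       sizeof(void *) * batchcount);
			shared->avail += batchcount;
			goto free_done;
		}
	}
	/* otherwise give the objects back to their slabs */
	free_block(cachep, ac->entry, batchcount, numa_node_id());
free_done:
	spin_unlock(&l3->list_lock);
	ac->avail -= batchcount;
	/* slide the surviving pointers down to the bottom of the stack */
	memmove(ac->entry, &ac->entry[batchcount],
		sizeof(void *) * ac->avail);
}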
 
4.5 Destroying a cache

This is normally called from a module's exit path during cleanup; for code compiled into the kernel, this function is basically never called:
/**
 * kmem_cache_destroy - delete a cache
 * @cachep: the cache to destroy
 *
 * Remove a struct kmem_cache object from the slab cache.
 *
 * It is expected this function will be called by a module when it is
 * unloaded.  This will remove the cache completely, and avoid a duplicate
 * cache being allocated each time a module is loaded and unloaded, if the
 * module doesn't have persistent in-kernel storage across loads and unloads.
 *
 * The cache must be empty before calling this function.
 *
 * The caller must guarantee that noone will allocate memory from the cache
 * during the kmem_cache_destroy().
 */
void kmem_cache_destroy(struct kmem_cache *cachep)
{
 BUG_ON(!cachep || in_interrupt());
 /* Don't let CPUs to come and go */
 lock_cpu_hotplug();
 /* Find the cache in the chain of caches. */
 mutex_lock(&cache_chain_mutex);
 /*
  * the chain is never empty, cache_cache is never destroyed
  */
// the chain's first element, cache_cache, is static, so the list is never empty;
// remove this cache from the cache chain
 list_del(&cachep->next);
 mutex_unlock(&cache_chain_mutex);
// release as many of the cache's slab blocks as possible
 if (__cache_shrink(cachep)) {
  slab_error(cachep, "Can't free all objects");
  mutex_lock(&cache_chain_mutex);
  list_add(&cachep->next, &cache_chain);
  mutex_unlock(&cache_chain_mutex);
  unlock_cpu_hotplug();
  return;
 }
 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
  synchronize_rcu();
// free the cache
 __kmem_cache_destroy(cachep);
 unlock_cpu_hotplug();
}
EXPORT_SYMBOL(kmem_cache_destroy);

// the function that really destroys the cache
static void __kmem_cache_destroy(struct kmem_cache *cachep)
{
 int i;
 struct kmem_list3 *l3;
// free the arrays of all CPUs in the cache
 for_each_online_cpu(i)
     kfree(cachep->array[i]);
 /* NUMA: free the list3 structures */
// free all the memory in the list3 structures
 for_each_online_node(i) {
  l3 = cachep->nodelists[i];
  if (l3) {
   kfree(l3->shared);
   free_alien_cache(l3->alien);
   kfree(l3);
  }
 }
// free the cache structure itself
 kmem_cache_free(&cache_cache, cachep);
}

4.6 Shrinking a cache

This function releases as many of the cache's slab blocks as possible; when a cache has too much free space, some memory is released for other parts of the kernel to use.

/**
 * kmem_cache_shrink - Shrink a cache.
 * @cachep: The cache to shrink.
 *
 * Releases as many slabs as possible for a cache.
 * To help debugging, a zero exit status indicates all slabs were released.
 */
// just a wrapper function
int kmem_cache_shrink(struct kmem_cache *cachep)
{
 BUG_ON(!cachep || in_interrupt());
 return __cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
 
static int __cache_shrink(struct kmem_cache *cachep)
{
 int ret = 0, i = 0;
 struct kmem_list3 *l3;
// drain the per-CPU space of the cache
 drain_cpu_caches(cachep);
 check_irq_on();
 for_each_online_node(i) {
// drain each node's list3
  l3 = cachep->nodelists[i];
  if (!l3)
   continue;
// release slabs from the slabs_free list
  drain_freelist(cachep, l3, l3->free_objects);
  ret += !list_empty(&l3->slabs_full) ||
   !list_empty(&l3->slabs_partial);
 }
 return (ret ? 1 : 0);
}

static void drain_cpu_caches(struct kmem_cache *cachep)
{
 struct kmem_list3 *l3;
 int node;
 on_each_cpu(do_drain, cachep, 1, 1);
 check_irq_on();
 for_each_online_node(node) {
  l3 = cachep->nodelists[node];
  if (l3 && l3->alien)
// drain the alien part of the cache's list3
   drain_alien_cache(cachep, l3->alien);
 }
 for_each_online_node(node) {
  l3 = cachep->nodelists[node];
  if (l3)
// drain the list3's shared array
   drain_array(cachep, l3, l3->shared, 1, node);
 }
}
/*
 * Remove slabs from the list of free slabs.
 * Specify the number of slabs to drain in tofree.
 *
 * Returns the actual number of slabs released.
 */
static int drain_freelist(struct kmem_cache *cache,
   struct kmem_list3 *l3, int tofree)
{
 struct list_head *p;
 int nr_freed;
 struct slab *slabp;
 nr_freed = 0;
// release from the slabs_free list
 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
  spin_lock_irq(&l3->list_lock);
  p = l3->slabs_free.prev;
  if (p == &l3->slabs_free) {
   spin_unlock_irq(&l3->list_lock);
   goto out;
  }
// get the slab
  slabp = list_entry(p, struct slab, list);
#if DEBUG
  BUG_ON(slabp->inuse);
#endif
// remove the slab from the list
  list_del(&slabp->list);
  /*
   * Safe to drop the lock. The slab is no longer linked
   * to the cache.
   */
// the free object count drops by one slab's worth of objects
  l3->free_objects -= cache->num;
  spin_unlock_irq(&l3->list_lock);
// destroy the slab
  slab_destroy(cache, slabp);
  nr_freed++;
 }
out:
 return nr_freed;
}
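
slab_destroy(), called above, is what finally hands the pages back to the page allocator. A condensed sketch (the _sketch suffix is mine; the RCU and debug paths of the real function are omitted):

static void slab_destroy_sketch(struct kmem_cache *cachep, struct slab *slabp)
{
	/* start of the slab's pages, i.e. s_mem minus the colour offset */
	void *addr = slabp->s_mem - slabp->colouroff;

	/* return the 2^gfporder pages to the buddy allocator */
	kmem_freepages(cachep, addr);
	/* if management data was kept off-slab, free it from its own cache */
	if (OFF_SLAB(cachep))
		kmem_cache_free(cachep->slabp_cache, slabp);
}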
 
4.7 kmalloc and kfree

kmalloc is also implemented on top of caches; since each kmalloc may request a different size, the allocation is served from different caches:
/* include/linux/slab.h */
// note that kmalloc is defined in the header file
static inline void *kmalloc(size_t size, gfp_t flags)
{
 if (__builtin_constant_p(size)) {
// the following finds a cache whose object size is just >= size
  int i = 0;
#define CACHE(x) \
  if (size <= x) \
   goto found; \
  else \
   i++;
#include "kmalloc_sizes.h"
#undef CACHE
  {
   extern void __you_cannot_kmalloc_that_much(void);
   __you_cannot_kmalloc_that_much();
  }
found:
// in the end the space is still allocated through kmem_cache_alloc, so this too is a cache
  return kmem_cache_alloc((flags & GFP_DMA) ?
   malloc_sizes[i].cs_dmacachep :
   malloc_sizes[i].cs_cachep, flags);
 }
// through this function the space is ultimately also allocated by __cache_alloc()
 return __kmalloc(size, flags);
}
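
Because the dispatch above hinges on __builtin_constant_p, a compile-time-constant size is resolved to the right general cache inline, while a runtime size goes through __kmalloc(). An illustrative fragment (compute_len is a hypothetical function standing in for any runtime value):

 /* 100 is constant: the inline chain picks the size-128 cache directly */
 char *a = kmalloc(100, GFP_KERNEL);

 /* a runtime size takes the __kmalloc() path instead */
 size_t n = compute_len();
 char *b = kmalloc(n, GFP_KERNEL);

 kfree(b);
 kfree(a);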

// this is the content of kmalloc_sizes.h; it simply defines the object sizes available in the CACHE list
// normally the maximum is 128K, which is also the largest amount kmalloc can allocate
#if (PAGE_SIZE == 4096)
 CACHE(32)
#endif
 CACHE(64)
#if L1_CACHE_BYTES < 64
 CACHE(96)
#endif
 CACHE(128)
#if L1_CACHE_BYTES < 128
 CACHE(192)
#endif
 CACHE(256)
 CACHE(512)
 CACHE(1024)
 CACHE(2048)
 CACHE(4096)
 CACHE(8192)
 CACHE(16384)
 CACHE(32768)
 CACHE(65536)
 CACHE(131072)
#if (NR_CPUS > 512) || (MAX_NUMNODES > 256) || !defined(CONFIG_MMU)
 CACHE(262144)
#endif
#ifndef CONFIG_MMU
 CACHE(524288)
 CACHE(1048576)
#ifdef CONFIG_LARGE_ALLOCS
 CACHE(2097152)
 CACHE(4194304)
 CACHE(8388608)
 CACHE(16777216)
 CACHE(33554432)
#endif /* CONFIG_LARGE_ALLOCS */
#endif /* CONFIG_MMU */
/* mm/slab.c */
// kfree in fact also calls __cache_free to release the space
void kfree(const void *objp)
{
 struct kmem_cache *c;
 unsigned long flags;
 if (unlikely(!objp))
  return;
 local_irq_save(flags);
 kfree_debugcheck(objp);
 c = virt_to_cache(objp);
 debug_check_no_locks_freed(objp, obj_size(c));
 __cache_free(c, (void *)objp);
 local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
 
5. Conclusion

Using caches improves efficiency where objects are frequently created and destroyed. It is also why, under normal circumstances, /proc/meminfo shows Linux with little free memory: a great deal of memory is cached rather than truly released.