memcached採用了內存頁面以及內存頁面上的內存塊技術來實現內存管理器;對item的指針則採用hashtable的方法,通過item的key值實現快速定位、查找item指針。這裏詳細剖析一下實現的關鍵代碼。
首先是內存管理,在slabs.c代碼中實現。
(1).void slabs_init()
void slabs_init(const size_t limit, const double factor, const bool prealloc) { int i = POWER_SMALLEST - 1; //POWER_SMALLEST:定義的值是1 //初始化設置時 settings.chunk_size = 48 unsigned int size = sizeof(item) + settings.chunk_size; /* Factor of 2.0 means use the default memcached behavior */ if (factor == 2.0 && size < 128) { size = 128; } mem_limit = limit; if (prealloc) { /* Allocate everything in a big chunk with malloc */ mem_base = malloc(mem_limit); //分配256M的內存 if (mem_base != NULL) { mem_current = mem_base; //當前mem_current指向mem_base mem_avail = mem_limit; //初始化時可得到的內存大小mem_avail } else { fprintf(stderr, "Warning: Failed to allocate requested memory in" " one large chunk./nWill allocate in smaller chunks/n"); } } memset(slabclass, 0, sizeof(slabclass)); //200個slab,每個slab是1M //POWER_BLOCK 設置的值是 1048576=1024*1024=1M //循環計算每個內存集合的內存單元大小和每個內存頁所包含的內存單元數目 //如果當前的內存單元的大小超過0.5M,就好停止擴展,並在最後增加一個內存單元爲1M的集合 //在memcached中內存頁大小的上限是1M) while (++i < POWER_LARGEST && size <= POWER_BLOCK / 2) { /* Make sure items are always n-byte aligned */ //size是8個字節對齊的 CHUNK_ALIGN_BYTES 8 //對於每個內存頁的大小是與sizeof(void*)對齊的,而且內存集合的內存單元大小是以factor因子增加的 if (size % CHUNK_ALIGN_BYTES) //如果不能被8除盡,那麼對size進行修正 { size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); } slabclass[i].size = size; //每個內存集合的內存單元大小 slabclass[i].perslab = POWER_BLOCK / slabclass[i].size; //每個內存頁所包含的內存單元數目 size *= factor; if (settings.verbose > 1) { fprintf(stderr, "slab class %3d: chunk size %6u perslab %5u/n",i, slabclass[i].size, slabclass[i].perslab); } } power_largest = i; slabclass[power_largest].size = POWER_BLOCK; slabclass[power_largest].perslab = 1; /* for the test suite: faking of how much we've already malloc'd */ { char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC"); if (t_initial_malloc) { mem_malloced = (size_t)atol(t_initial_malloc); }
} #ifndef DONT_PREALLOC_SLABS { char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC"); //如果編譯的時候沒有定義DONT_PREALLOC_SLABS而且環境變量中也沒有定義T_MEMD_SLABS_ALLOC,memcached就會進行內存的預分配 if (pre_alloc == NULL || atoi(pre_alloc) != 0) { slabs_preallocate(power_largest); } } #endif } |
(2).void slabs_preallocate():
#ifndef DONT_PREALLOC_SLABS static void slabs_preallocate (const unsigned int maxslabs) { int i; unsigned int prealloc = 0;
/* pre-allocate a 1MB slab in every size class so people don't get confused by non-intuitive "SERVER_ERROR out of memory" messages. this is the most common question on the mailing list. if you really don't want this, you can rebuild without these three lines. */
for (i = POWER_SMALLEST; i <= POWER_LARGEST; i++) { if (++prealloc > maxslabs) return; do_slabs_newslab(i); //預分配 }
} #endif |
(3).int grow_slab_list():
//當一個slab(內存頁面)用光後,又有新的item要插入這個id,那麼它就會重新申請新的slab,申請新的slab時,對應id的slab鏈表就要增長,這個鏈表是成倍增長的, //在函數grow_slab_list函數中,這個鏈的長度從1變成2,從2變成4,從4變成8…… static int grow_slab_list (const unsigned int id) { slabclass_t *p = &slabclass[id]; //p->slabs頁面個數 if (p->slabs == p->list_size) { size_t new_size = (p->list_size != 0) ? p->list_size * 2 : 16; //初始時new_size=16,下一次變爲16*2,在下次就是16*2*2 //原型: extern void *realloc(void *mem_address, unsigned int newsize); //功能: 先釋放原來mem_address所指內存區域,並按照newsize指定的大小重新分配空間,同時將原有數據從頭到尾拷貝到新分配的內存區域, //並返回該內存區域的首地址。即重新分配存儲器塊。 void *new_list = realloc(p->slab_list, new_size * sizeof(void *)); if (new_list == 0) return 0; p->list_size = new_size; p->slab_list = new_list; //用於頁面的指針鏈表 } return 1; } |
(4). int do_slabs_newslab()
//該函數分配一個新的內存頁,每個slabclass_t會有多個頁面 static int do_slabs_newslab(const unsigned int id) { slabclass_t *p = &slabclass[id]; #ifdef ALLOW_SLABS_REASSIGN int len = POWER_BLOCK; #else int len = p->size * p->perslab; //長度=每塊的大小*每個內存頁所包含的內存單元數目 #endif char *ptr;
if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||(grow_slab_list(id) == 0) || ((ptr = memory_allocate((size_t)len)) == 0)) {
MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id); return 0; } //確認malloc成功後初時化它的值爲0(memset) memset(ptr, 0, (size_t)len); //將end_page_ptr指向新的內存頁,並把它加入到內存頁數組中,同時修改對應的計算變量(這個函數是memcached中唯一分配"用戶可用內存"的地方, //"用戶可用"是指set/update/replace指令可以控制的內存) p->end_page_ptr = ptr; //最後一個slab空閒內存起始地址 p->end_page_free = p->perslab; //最後一個slab空閒區能存放的item個數 p->slab_list[p->slabs++] = ptr; //鏈表裏的元素所指向的地址:記錄頁面指向的地址,見代碼194行的頁面地址鏈表 mem_malloced += len; //修改已分配內存大小的值 MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id); return 1; } |
(5). void *do_slabs_alloc()
//內存單元:集合中維護的"邏輯內存塊"的大小,它是以8字節對齊的 //內存頁: slabclass_t分配內存的時候是以perslab個內存單元分配的,這perslab個連續的內存單元就是內存頁 //slabs_alloc是一個宏,對於多線程模式和單線程模式,它會映射到不同的函數 void *do_slabs_alloc(const size_t size, unsigned int id) { slabclass_t *p; void *ret = NULL; //根據需要的大小查找對應的slabclass_t結構 if (id < POWER_SMALLEST || id > power_largest) { MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0); return NULL; } p = &slabclass[id]; //檢查內存單元指針數組slots是否爲空,如果非空-->返回一個空的內存單元 assert(p->sl_curr == 0 || ((item *)p->slots[p->sl_curr - 1])->slabs_clsid == 0); //如果使用系統分配,直接調用操作系統的malloc函數 #ifdef USE_SYSTEM_MALLOC if (mem_limit && mem_malloced + size > mem_limit) { MEMCACHED_SLABS_ALLOCATE_FAILED(size, id); return 0; } mem_malloced += size; ret = malloc(size); MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret); return ret; #endif /* fail unless we have space at the end of a recently allocated page, we have something on our freelist, or we could allocate a new page */ //檢查是否分配了新的內存頁, 如果是-->返回一個"新的"內存單元(沒有加入到slots中),如果新的內存頁爲空,那麼調用do_slabs_newslab從系統分配內存 //(當然do_slabs_alloc還會修改對應的計數變量) //先從本slab中申請,如果沒有內存的話就去slot裏面找,如果還沒有找到的話就要new新的了 //如果這些辦法都失敗了,iteam_alloc就需用動動LRU的腦筋了,它會從尾部循環50次,看看沒有可以釋放的item if (! (p->end_page_ptr != 0 || p->sl_curr != 0 ||do_slabs_newslab(id) != 0)) { /* We don't have more memory available */ ret = NULL; } else if (p->sl_curr != 0) { /* return off our freelist */ //從空閒裏面去取 ret = p->slots[--p->sl_curr]; } else { /* if we recently allocated a whole page, return from that */ assert(p->end_page_ptr != NULL); ret = p->end_page_ptr; //返回新生成的地址 //p->end_page_free:空閒數目減少1個 if (--p->end_page_free != 0) { p->end_page_ptr += p->size; //改id的指針偏移size個單位 } else { p->end_page_ptr = 0; } } if (ret) { MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret); } else { MEMCACHED_SLABS_ALLOCATE_FAILED(size, id); } return ret; } |
(6). void do_slabs_free()
void do_slabs_free(void *ptr, const size_t size, unsigned int id) { slabclass_t *p; assert(((item *)ptr)->slabs_clsid == 0); assert(id >= POWER_SMALLEST && id <= power_largest); if (id < POWER_SMALLEST || id > power_largest) return; MEMCACHED_SLABS_FREE(size, id, ptr); p = &slabclass[id]; #ifdef USE_SYSTEM_MALLOC mem_malloced -= size; //已經分配的內存數減去釋放的內存大小 free(ptr); return; #endif //將釋放的內存加入到內存單元數組中:(可以看到memcached是不真正釋放內存的,而且它的分配與釋放操作都是很簡單的指針賦值操作 //開始釋放的時候都是p->sl_curr= p->sl_total=0,因此執行它,得到空閒鏈表,空閒的內存單元的數目 if (p->sl_curr == p->sl_total) { /* need more space on the free list */ int new_size = (p->sl_total != 0) ? p->sl_total * 2 : 16; /* 16 is arbitrary */ void **new_slots = realloc(p->slots, new_size * sizeof(void *)); if (new_slots == 0) return; p->slots = new_slots; p->sl_total = new_size; } p->slots[p->sl_curr++] = ptr; //p->sl_curr自增加1,並且記錄該指針 return; } |
(7). void *memory_allocate()
//從mem_base中分配size大小的內存 static void *memory_allocate(size_t size) { void *ret; if (mem_base == NULL) { /* We are not using a preallocated large memory chunk */ ret = malloc(size); } else { ret = mem_current; //如果分配大小size大於內存可以得到的大小,直接返回NULL if (size > mem_avail) { return NULL; } /* mem_current pointer _must_ be aligned!!! */ //如果不能被8整除,修正size的大小 if (size % CHUNK_ALIGN_BYTES) { size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES); } mem_current += size; //當前內存指針偏移size if (size < mem_avail) { mem_avail -= size; //修改內存剩餘大小的值 } else { mem_avail = 0; } } return ret; } |
然後是通過item的key值實現快速定位item指針地址的hashtable,在assoc.c中。
(1). void assoc_init()
/*hashtable的初始化,計算hashtable的大小-->分配空間-->初始化空間爲NULL*/ //分配hashtable的所需的內存 void assoc_init(void) { //65535個hashtable /*在內存的動態存儲區中分配n個長度爲size的連續空間,函數返回一個指向分配起始地址的指針;如果分配不成功,返回NULL*/ /*用 法: void *calloc(unsigned n,unsigned size)*/ /*hashpower: 16 */ /*hashsize(n) ((ub4)1<<(n))* 將1左移16位: 16*16*16*16=65535個hashtable */ primary_hashtable = calloc(hashsize(hashpower), sizeof(void *)); if (! primary_hashtable) { fprintf(stderr, "Failed to init hashtable./n"); exit(EXIT_FAILURE); } } |
(2). item *assoc_find()
//根據鍵尋找對應的值 item *assoc_find(const char *key, const size_t nkey) { //根據key和key_len計算hash值 uint32_t hv = hash(key, nkey, 0); item *it; unsigned int oldbucket; //根據hash值和掩碼計算hashtable的下標 //如果當前處於hashtable的擴展過程,並且下標值小於數據遷移的記錄值,那麼就從新的hashtable中獲得該下標對應的item鏈表,否則 //就從原來的hashtable中獲得item鏈表 if (expanding &&(oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket) { it = old_hashtable[oldbucket]; } else { it = primary_hashtable[hv & hashmask(hashpower)]; } //循環對比鏈表中的item的key尋找對應的item item *ret = NULL; int depth = 0; //桶裏面是鏈表 while (it) { if ((nkey == it->nkey) && (memcmp(key, ITEM_key(it), nkey) == 0)) { ret = it; break; } it = it->h_next; ++depth; } MEMCACHED_ASSOC_FIND(key, depth); return ret; } |
(3). item** _hashitem_before()
//尋找key對應的元素的指針變量的地址 static item** _hashitem_before (const char *key, const size_t nkey) { uint32_t hv = hash(key, nkey, 0); item **pos; unsigned int oldbucket; if (expanding && (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket) { pos = &old_hashtable[oldbucket]; } else { pos = &primary_hashtable[hv & hashmask(hashpower)]; } //對桶裏面的鏈表做循環處理 while (*pos && ((nkey != (*pos)->nkey) || memcmp(key, ITEM_key(*pos), nkey))) { pos = &(*pos)->h_next; } return pos; } |
(4). void assoc_expand()
static void assoc_expand(void) { old_hashtable = primary_hashtable; //指針的作用只是指向內存中的一段地址 primary_hashtable = calloc(hashsize(hashpower + 1), sizeof(void *)); //每次都是進行2倍的容量擴展 if (primary_hashtable) { if (settings.verbose > 1) fprintf(stderr, "Hash table expansion starting/n"); hashpower++; expanding = true; expand_bucket = 0; do_assoc_move_next_bucket(); } else { primary_hashtable = old_hashtable; /* Bad news, but we can keep running. */ } } |
(5). void do_assoc_move_next_bucket()
//被static void conn_set_state()調用 void do_assoc_move_next_bucket(void) { item *it, *next; int bucket; //將hashtable中的第一個下標的item列表重新計算hash值並移到新的hashtable中, if (expanding) { //這裏只移動了一個下標的item鏈表do_assoc_move_next_bucket for (it = old_hashtable[expand_bucket]; NULL != it; it = next) { next = it->h_next; //新的散列值 bucket = hash(ITEM_key(it), it->nkey, 0) & hashmask(hashpower); it->h_next = primary_hashtable[bucket]; primary_hashtable[bucket] = it; //擴展後變爲新的了 } old_hashtable[expand_bucket] = NULL; expand_bucket++; //對於其他的元素的遷移會在用戶用戶請求的時候進行移動,這是把時間消耗分散的延遲處理方式,當元素遷移完成後, //就會釋放舊的hashtable佔用的資源free if (expand_bucket == hashsize(hashpower - 1)) { expanding = false; free(old_hashtable); if (settings.verbose > 1) fprintf(stderr, "Hash table expansion done/n"); } } } |
(6). int assoc_insert()
//將item加入到hashtable中 int assoc_insert(item *it) { uint32_t hv; unsigned int oldbucket; //驗證item的key不在hashtable中 assert(assoc_find(ITEM_key(it), it->nkey) == 0); /* shouldn't have duplicately named things defined */ hv = hash(ITEM_key(it), it->nkey, 0); //根據hash值和掩碼計算hashtable的下標 //在擴展中還是使用舊的,因爲擴展的時候做了hashpower++操作,如果下標值大於已經移走的數目,那麼它必沒有被移走 if (expanding &&(oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket) { it->h_next = old_hashtable[oldbucket]; old_hashtable[oldbucket] = it; } else { //會有多個映射到同一個上面,比如第一個item進來,它的it->h_next=NULL,下一步又給它賦值it,第二個item進來,它的h_next就是上一個的item地址值了, //然後又給這個桶賦值爲它自己,以此類推:第三個item->h_next=第二個item 第二個item->h_next=第一個item 當前的是第三個item的地址 it->h_next = primary_hashtable[hv & hashmask(hashpower)]; primary_hashtable[hv & hashmask(hashpower)] = it; } hash_items++; //如果當前不是處於擴展狀態,那麼就檢查hashtable中保存的item數是否超過其大小的1.5倍,如果是就進行2倍的容量擴展assoc_expand() if (!expanding && hash_items > (hashsize(hashpower) * 3) / 2) { assoc_expand(); } MEMCACHED_ASSOC_INSERT(ITEM_key(it), hash_items); return 1; } |
(7). void assoc_delete()
//從hashtable中刪除對應key的item void assoc_delete(const char *key, const size_t nkey) { item **before = _hashitem_before(key, nkey); if (*before) { item *nxt; hash_items--; /* The DTrace probe cannot be triggered as the last instruction * due to possible tail-optimization by the compiler */ MEMCACHED_ASSOC_DELETE(key, hash_items); //修改item的h_next指針,從鏈表中刪除該元素 nxt = (*before)->h_next; (*before)->h_next = 0; /* probably pointless, but whatever. */ *before = nxt; return; } /* Note: we never actually get here. the callers don't delete things they can't find. */ assert(*before != 0); } |