轉載：Linux夥伴系統(三)--分配頁

轉載自： http://blog.csdn.net/vanbreaker/article/details/7621289

前面已經介紹了夥伴系統的原理和Linux夥伴系統的數據結構，現在來看夥伴系統是如何來分配頁面的。實際上，夥伴系統分配頁面的算法並不複雜，但是由於考慮到分配內存時要儘量減少碎片的產生(涉及遷移機制)以及當內存不足時需要採取各種更爲積極的手段，使得內核分配頁面的相關函數完整地分析起來比較複雜龐大。在這裏，我們只關注分配時最一般的情況，而其他情況的處理在以後單獨拿出來討論。

我們從__alloc_pages_nodemask()這個函數開始分析，所有的分配頁面的函數最終都會落到這個函數上面，它是夥伴系統的入口。

[cpp] view plain copy

/*
 * Allocate 2^order pages according to gfp_mask.
 *
 * A first attempt is made through get_page_from_freelist() using the low
 * watermark; if that yields nothing, __alloc_pages_slowpath() is tried.
 *
 * Returns the allocated page, or NULL when fault injection rejects the
 * request, the zonelist has no usable zone, or both attempts fail.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct page *page;
	struct zone *preferred_zone;
	/* Zone index selected by gfp_mask. */
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	/* Migrate type selected by gfp_mask. */
	int migratetype = allocflags_to_migratetype(gfp_mask);

	/* Both values above are derived before gfp_mask is filtered here. */
	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(zonelist->_zonerefs->zone == NULL))
		return NULL;

	/*
	 * Look up the first zone in zonelist usable for high_zoneidx and
	 * nodemask. The preferred zone is used for statistics later.
	 */
	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
	if (preferred_zone == NULL)
		return NULL;

	/* First allocation attempt */
	page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, nodemask,
			order, zonelist, high_zoneidx,
			ALLOC_WMARK_LOW | ALLOC_CPUSET,
			preferred_zone, migratetype);
	if (unlikely(page == NULL)) {
		/*
		 * The first attempt failed: retry through the slow path
		 * (per the original note, this includes waking the page-out
		 * daemon, among other measures).
		 */
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
	}

	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
	return page;
}

首先要做的就是找到指定的分配管理區，管理區的編號保存在high_zoneidx中
然後嘗試第一次分配，流程是：從指定的管理區開始掃描各管理區-->找到空閒內存充足的管理區-->從指定的遷移類型鏈表中分配內存-->如果在指定遷移類型中找不到，則到其他的遷移類型中去尋找
如果上一步（第一次分配）在各個管理區都找不到可以滿足分配的內存，說明這些管理區的內存確實已經不夠了，於是啓用一條慢速路徑(__alloc_pages_slowpath())來分配，包括嘗試換出一些不經常使用的頁等等。內核在這次分配中會表現得更爲積極，其中的細節涉及其他一些複雜的內容，以後再做分析

[cpp] view plain copy

/*
 * Walk the zones of zonelist (restricted by high_zoneidx and nodemask) and
 * try to take 2^order pages from the first zone that passes the cpuset and
 * watermark checks.
 *
 * Returns the page obtained from buffered_rmqueue(), or NULL if no zone in
 * the list satisfied the request. On NUMA builds, a failed scan that used the
 * zonelist cache (zlc) is repeated once with the cache disabled.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
/* Index of the preferred zone; passed to zone_watermark_ok() below. */
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
/*
* Iterate starting from the chosen zone until one with enough free memory is
* found. NOTE(review): per the article author, with ZONE_HIGHMEM the order is
* HIGHMEM --> NORMAL --> DMA, and with ZONE_NORMAL it is NORMAL --> DMA; this
* ordering is defined by the zonelist, not by the code shown here.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
/* With ALLOC_CPUSET set, skip zones rejected by the cpuset check. */
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
goto try_next_zone;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
/*
* The low bits of alloc_flags select which of the zone's watermarks
* applies (presumably min/low/high); the zone must pass
* zone_watermark_ok() against it before it is used.
*/
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/* Watermark check passed: allocate from this zone. */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/* Zone reclaim disabled: treat this zone as full and move on. */
if (zone_reclaim_mode == 0)
goto this_zone_full;
/* Per the original note, the reclaim path below targets NUMA systems. */
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:/* no reclaim was performed */
/* did not scan */
goto try_next_zone;
case ZONE_RECLAIM_FULL: /* no reclaimable pages were found */
/* scanned but unreclaimable */
goto this_zone_full;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto this_zone_full;
/* Otherwise fall out of the switch into try_this_zone. */
}
}
try_this_zone:/* allocate 2^order pages from this zone */
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
try_next_zone:
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup after the first zone is tried but only
* if there are multiple nodes make it worthwhile
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
return page;
}

從指定的管理區開始按照zonelist中定義的順序來遍歷管理區
如果該管理區的水位線正常，則調用buffered_rmqueue()在該管理區中分配
如果管理區的水位線過低：當zone_reclaim_mode爲0時直接把該管理區視爲已滿並嘗試下一個管理區；否則(NUMA架構下)會調用zone_reclaim()申請頁面回收，回收後若仍達不到水位線，同樣轉向下一個管理區

[cpp] view plain copy

/*
 * Take 2^order pages from zone.
 *
 * order == 0 is served from the current CPU's per-cpu page list for the given
 * migratetype (refilled via rmqueue_bulk() when empty); larger orders go to
 * __rmqueue() under zone->lock.
 *
 * Returns the page, or NULL if the per-cpu list could not be refilled or
 * __rmqueue() found nothing. If prep_new_page() returns non-zero the whole
 * allocation is retried from the top.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
int cold = !!(gfp_flags & __GFP_COLD);
int cpu;
again:
cpu = get_cpu();
if (likely(order == 0)) {/* order 0: a single page is requested */
struct per_cpu_pages *pcp;
struct list_head *list;
pcp = &zone_pcp(zone, cpu)->pcp;/* per-cpu page set of the current CPU */
list = &pcp->lists[migratetype];/* list matching the migrate type */
local_irq_save(flags);
/*
* Empty list: refill it through rmqueue_bulk() with order 0 and a count of
* pcp->batch (a page count, not 2^batch); the return value is added to
* pcp->count. If the list is still empty afterwards, give up.
*/
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
goto failed;
}
if (cold)/* cold page requested: take it from the tail of the list */
page = list_entry(list->prev, struct page, lru);
else /* hot page requested: take it from the head of the list */
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
} else {
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
/* Pick a suitable block from the zone's buddy free lists. */
page = __rmqueue(zone, order, migratetype);
/*
* Only the lock is released here; the flags saved by spin_lock_irqsave()
* are restored by local_irq_restore() below (or at the failed label).
*/
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
put_cpu();
VM_BUG_ON(bad_range(zone, page));
/* A non-zero result from prep_new_page() restarts the allocation. */
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
failed:
local_irq_restore(flags);
put_cpu();
return NULL;
}

該函數分兩種情況進行處理，一種是只要求分配單個頁框，另一種是要求分配多個連續頁框
對於單個頁面，內核選擇從每CPU頁框高速緩存中分配，它的核心描述結構也是MIGRATE_TYPES個鏈表，只不過鏈表中的元素都是單個頁。這些頁分爲熱頁和冷頁，所謂熱頁就是還處在CPU高速緩存中的頁，相反，冷頁就是不存在於高速緩存中的頁。對於單個頁框的申請，分配熱頁可以提高效率。需要注意的是，越靠近鏈表頭的頁越熱，越靠近鏈表尾的頁越冷，因爲每次釋放單個頁框的時候，頁框是插入到鏈表的頭部的，也就是說靠近頭部的頁框是最近才釋放的，因此最有可能存在於高速緩存當中
對於連續的頁框分配，通過調用__rmqueue()來完成分配

[cpp] view plain copy

/*
 * Take a block of the given order from zone for migratetype.
 *
 * Tries the requested migrate type first, then __rmqueue_fallback(), and as
 * a last resort repeats the search with MIGRATE_RESERVE. Returns the page, or
 * NULL if all of these fail.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

	/*
	 * A loop (rather than a second call) keeps a single call site for the
	 * inline __rmqueue_smallest().
	 */
	for (;;) {
		page = __rmqueue_smallest(zone, order, migratetype);

		/*
		 * Nothing found and we are not already on MIGRATE_RESERVE
		 * (in which case there is no further type to fall back to).
		 */
		if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
			page = __rmqueue_fallback(zone, order, migratetype);

			/* Use MIGRATE_RESERVE rather than fail an allocation. */
			if (!page) {
				migratetype = MIGRATE_RESERVE;
				continue;
			}
		}
		break;
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}

首先按照指定的遷移類型，調用__rmqueue_smallest()來分配對應的內存塊,該函數是夥伴系統的算法體現
如果分配失敗，則說明指定的遷移類型中沒有充足的內存來滿足分配，這時就要按fallbacks中定義的順序從其他的遷移鏈表中尋找了，__rmqueue_fallback()函數較爲複雜，體現了利用遷移類型來避免碎片的思想，後面單獨拿出來分析

[cpp] view plain copy

/*
 * Search the free lists of zone, from the requested order upwards, for the
 * first non-empty list of the given migratetype, and remove its first block.
 *
 * If the block found is larger than requested, expand() splits it down to
 * the requested order. Returns the first page of the block, or NULL when
 * every order below MAX_ORDER is empty for this migratetype.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int cur;

	/* Find a page of the appropriate size in the preferred list */
	for (cur = order; cur < MAX_ORDER; ++cur) {
		/* free_area for the order currently being examined */
		struct free_area *fa = &(zone->free_area[cur]);

		if (!list_empty(&fa->free_list[migratetype])) {
			/* first block on this order's list for migratetype */
			struct page *found = list_entry(
					fa->free_list[migratetype].next,
					struct page, lru);

			list_del(&found->lru);
			/* presumably clears the order recorded in the page */
			rmv_page_order(found);
			fa->nr_free--;	/* one block fewer at this order */
			/* split the block if cur is larger than order */
			expand(zone, found, order, cur, fa, migratetype);
			return found;
		}
	}

	return NULL;
}

[cpp] view plain copy

/*
 * Split a block of order 'high' starting at page down to order 'low'.
 *
 * On each step the block is halved; the upper half is put on the free list
 * of the next lower order (for migratetype) and the lower half is kept for
 * further splitting. 'area' must point at the free_area of order 'high' on
 * entry. Nothing is done when high == low.
 */
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	/* number of page frames in a block of order 'high' */
	unsigned long nr_pages = 1 << high;

	while (high > low) {
		struct page *upper_half;

		/* step down one order: previous free_area, half the pages */
		--area;
		--high;
		nr_pages >>= 1;

		/* first page descriptor of the half that is given back */
		upper_half = &page[nr_pages];
		VM_BUG_ON(bad_range(zone, upper_half));

		/* link it at the head of the lower order's free list */
		list_add(&upper_half->lru, &area->free_list[migratetype]);
		area->nr_free++;
		/* presumably records 'high' as the block's order in the page */
		set_page_order(upper_half, high);
	}
}

只需要注意一點：一個塊可以由其起始頁對應的描述符和order(size)唯一確定，因此只需要將塊的第一個頁描述符鏈入相應的鏈表就可以了。

轉載：Linux夥伴系統(三)--分配頁

轉載：Linux夥伴系統(三)--分配頁

轉載： Linux夥伴系統(一)--夥伴系統的概述

一些重要struct

轉載：地址空間分佈

轉載：《深入理解LINUX內存管理》學習筆記

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結