First, the parameters of the fast-path page allocation:
gfp_mask: the fast path ORs in __GFP_HARDWALL, which enforces the cpuset hardwall policy, i.e. the allocation must stay within the memory nodes allowed by the current task's cpuset;
nodemask: a bitmap of nodes, telling us on which nodes the allocation may be placed;
order: the allocation order;
zonelist: when preferred_zone has no suitable pages to hand out, the fallback zones in this zonelist are scanned in order and tried one by one;
high_zoneidx: the highest zone this allocation may use; fallback normally runs high -> normal -> dma, the memory becoming more and more precious along the way, so allocation is attempted from high down to dma;
alloc_flags: flags controlling how the allocation is performed;
preferred_zone: the first suitable zone found at or below high_zoneidx; allocation is normally attempted from it first, and if that fails the next suitable zone from the zonelist takes its place;
migratetype: the migration type, used as the index into zone->free_area[].free_list[]; it exists for anti-fragmentation: the old free_area structure gained an array indexed by migrate type, each element holding the list of free pages of that migrate type;
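To make the migratetype point concrete, here is a trimmed sketch (simplified from include/linux/mmzone.h of this era of the kernel, unrelated fields omitted) showing how each order's free_area carries one free list per migrate type, with migratetype selecting the list:

struct free_area {
	struct list_head free_list[MIGRATE_TYPES];	/* one list of free blocks per migrate type */
	unsigned long nr_free;				/* total free blocks at this order */
};

struct zone {
	/* ... */
	struct free_area free_area[MAX_ORDER];		/* indexed by allocation order */
	/* ... */
};

The fast-path call itself: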
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);//index of the preferred zone
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {//this macro picks the suitable zones out of the zonelist->_zonerefs array; a detailed explanation follows below
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))//the node z->zone sits on does not allow allocation, or the zone has already been marked full; either way skip this z
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))//cpuset checking is enabled and this zone's node is not allowed by the current task's cpuset, so skip it;
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))//check whether this zone's dirty pages have exceeded its dirty limit
goto this_zone_full;//over the dirty limit: jump to the end and mark the zone full, which balances dirty pages across the zones
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))//check whether this zone has enough free pages for the allocation; analysed in detail below
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
// zone_watermark_ok() above already tested whether this zone can satisfy the allocation; if it cannot, and reclaim is disabled, or this zone is outside the range we are allowed to reclaim from, all we can do is mark it full so the next scan skips it
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*///zlc_active and friends may have just been set up above, so check the cache once more and skip this zone if it is not worth trying
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);//reaching this point means this zone is allowed to have pages reclaimed, so try it
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default://the two cases above reclaimed nothing; reaching here means some pages were reclaimed
/* did we reclaim enough *///since some pages were just reclaimed, check again whether this zone can now satisfy the allocation
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
/*
* Failed to reclaim enough to meet watermark.
* Only mark the zone full if checking the min
* watermark or if we failed to reclaim just
* 1<<order pages or else the page allocator
* fastpath will prematurely mark zones full
* when the watermark is between the low and
* min watermarks.
*///nothing more can be done; we genuinely tried our best with this zone
if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
continue;
}
}
try_this_zone://the ideal case: go ahead and allocate the memory from this zone
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full://this zone is full; mark it as such
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}//end of the loop that walks the zones and tries to allocate from each
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;//scan the zonelist one more time
}
if (page)
/*
* page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
* necessary to allocate the page. The expectation is
* that the caller is taking steps that will free more
* memory. The caller should avoid the page being used
* for !PFMEMALLOC purposes.
*///if the page was obtained without watermark checks, the zone has barely any pages left to allocate, so pfmemalloc is set to prompt the system to free some memory;
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
return page;
}
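For reference, the alloc_flags bits tested above (ALLOC_WMARK_LOW, ALLOC_NO_WATERMARKS, ALLOC_CPUSET and so on) are defined in mm/internal.h. Roughly, for this era of the kernel they look like the following (exact values can differ between versions), which is why zone->watermark[alloc_flags & ALLOC_WMARK_MASK] directly selects the min/low/high watermark:

/* Sketch of the mm/internal.h definitions; values are illustrative of this kernel era */
#define ALLOC_WMARK_MIN     WMARK_MIN   /* check against pages_min */
#define ALLOC_WMARK_LOW     WMARK_LOW   /* check against pages_low */
#define ALLOC_WMARK_HIGH    WMARK_HIGH  /* check against pages_high */
#define ALLOC_NO_WATERMARKS 0x04        /* do not check watermarks at all */
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS-1) /* isolate the watermark index */
#define ALLOC_HARDER        0x10        /* try to allocate harder */
#define ALLOC_HIGH          0x20        /* caller had __GFP_HIGH set */
#define ALLOC_CPUSET        0x40        /* check for correct cpuset */
#define ALLOC_CMA           0x80        /* allocation may use CMA areas */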
This macro fetches the suitable zones from the zonelist->_zonerefs array:
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
zone is the loop variable usable inside the body; the macro walks the elements of the zonelist->_zonerefs array:
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
zone; \
z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
highest_zoneidx is the highest zone type this allocation can accept; any zoneref above it will not do, so z is advanced (z++) until a zoneref at or below it (equal being the best fit) is found and returned.
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
if (likely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
while (zonelist_zone_idx(z) > highest_zoneidx || (z->zone && !zref_in_nodemask(z, nodes)))
z++;
*zone = zonelist_zone(z);
return z;
}
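The two accessors used in next_zones_zonelist() are trivial inline helpers over struct zoneref (from include/linux/mmzone.h, shown here for completeness):

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
	return zoneref->zone_idx;	/* cached copy of zone_idx(zoneref->zone) */
}

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
	return zoneref->zone;		/* the zone itself; NULL terminates the zonelist */
}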
To understand the zlc_zone_worth_trying() function, first look at a couple of structures:
struct zonelist {
struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
struct zonelist_cache zlcache; // optional ...
#endif
};
struct zonelist_cache {
unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
unsigned long last_full_zap; /* when last zap'd (jiffies) */
};
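zlc_setup(), called at most once per allocation in the scan loop above, refreshes this cache: if the fullzones bitmap has not been cleared for about a second it is zapped, and the returned allowednodes is either the current cpuset's allowed nodes or all nodes that have memory. A sketch based on mm/page_alloc.c of this era (details may vary slightly between versions):

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	nodemask_t *allowednodes;	/* zonelist_cache approximation */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)
		return NULL;

	/* the "full" marks go stale quickly: wipe them roughly once a second */
	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
		zlc->last_full_zap = jiffies;
	}

	/* which nodes the caller may use: its cpuset, or any node with memory */
	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
					&node_states[N_MEMORY];
	return allowednodes;
}

zlc_zone_worth_trying() then consults the cache on every iteration: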
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
zlc = zonelist->zlcache_ptr;//as the structure above shows, zlcache_ptr is the address of zlcache (or NULL)
if (!zlc)//no zlcache: this is a UMA build
return 1;
i = z - zonelist->_zonerefs;//index of z within the _zonerefs array
n = zlc->z_to_n[i];//from that index, the node id (nid) this zone lives on
/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);//worth trying only if the nid is allowed and the zone has not been marked full;
}
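Its counterpart zlc_mark_zone_full(), reached through the this_zone_full label, is equally small: it records the zone's position in the fullzones bitmap so that later scans skip it. A sketch from the same file, with the same caveat:

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
	int i;				/* index of *z in zonelist zones */

	zlc = zonelist->zlcache_ptr;
	if (!zlc)			/* UMA build: nothing to mark */
		return;

	i = z - zonelist->_zonerefs;
	set_bit(i, zlc->fullzones);	/* stays set until the next zlc_setup() zap */
}

Finally, the watermark check used in the scan loop: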
/*
 * Return true if free pages are above 'mark'. This takes into account the
 * order of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];//pages that may only be allocated in an emergency
int o;
long free_cma = 0;
free_pages -= (1 << order) - 1;//subtract the pages this allocation would consume
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
if (free_pages - free_cma <= min + lowmem_reserve)//min + lowmem_reserve is the threshold: at or below it, this zone cannot be used for the allocation
return false;
for (o = 0; o < order; o++) {//loop to discount the free pages sitting at orders lower than the requested order
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;//fewer free pages are required at the higher orders
if (free_pages <= min)
return false;
}
return true;
}
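zone_watermark_ok(), the function actually called in the scan loop, is a thin wrapper that feeds the zone's current free-page count into the helper above (a sketch for this era of the kernel):

bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		       int classzone_idx, int alloc_flags)
{
	/* NR_FREE_PAGES is the zone's running count of free pages */
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
				   zone_page_state(z, NR_FREE_PAGES));
}

So for, say, an order-2 request the check first demands that free_pages minus 3 clear min + lowmem_reserve, then walks the lower orders, discounting the order-0 and order-1 free pages and halving min at each step; this is what ensures the zone really holds enough higher-order blocks rather than just scattered single pages.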