Linux Memory Allocation: alloc_page and __get_free_page Annotated (the Buddy System)

alloc_page and __get_free_page both allocate pages from the buddy allocator; they differ only in their return value: the former returns a struct page pointer, while the latter returns the kernel virtual address of that page.
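
In the kernel source the relationship is explicit: __get_free_pages() (of which __get_free_page() is simply the order-0 form) is a thin wrapper that calls alloc_pages() and converts the resulting struct page into a virtual address with page_address(). A lightly trimmed version from mm/page_alloc.c of this kernel vintage:

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/* a highmem page has no permanent kernel mapping to return */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);//page pointer -> virtual address
}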

Both ultimately call into the core function __alloc_pages_nodemask; its processing flow is detailed below.


struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	struct zone *preferred_zone;
	struct page *page;
	int migratetype = allocflags_to_migratetype(gfp_mask);//gfp flags and migrate types are not in one-to-one correspondence, so convert here (see the sketch after this listing)

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);//if this allocation may wait (sleep), go further and check whether this task should be scheduled out; if so, voluntarily schedule()

	if (should_fail_alloc_page(gfp_mask, order))//with the CONFIG_FAIL_PAGE_ALLOC debug option enabled, this supports allocation-failure injection
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	get_mems_allowed();//pin the allocation policy (mems_allowed) so it cannot change under us
	/* The preferred zone is used for statistics later */
	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);//find the first usable zone and record it in preferred_zone
	if (!preferred_zone) {//no usable zone: release the policy and bail out
		put_mems_allowed();
		return NULL;
	}

	/* First allocation attempt */
	//fast-path allocation: __GFP_HARDWALL enforces cpuset affinity and ALLOC_WMARK_LOW selects
	//the low watermark; walk the zonelist for a suitable zone, check its watermark, start reclaim
	//if the watermark is not met, and finally call buffered_rmqueue to allocate from that zone
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
			preferred_zone, migratetype);
	if (unlikely(!page))//fast path failed: take the slow path, which kicks off reclaim, lowers the watermark, and calls the fast-path allocator again
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);
	put_mems_allowed();//release the policy

	trace_mm_page_alloc(page, order, gfp_mask, migratetype);//tracepoint, for debugging
	return page;
}
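
A side note on the migratetype conversion at the top of the function: since gfp flags and migrate types are not one-to-one, allocflags_to_migratetype() packs the __GFP_MOVABLE and __GFP_RECLAIMABLE bits into a small integer. In kernels of this vintage it reads roughly as follows:

static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* group based on mobility:
	 * 0 = MIGRATE_UNMOVABLE, 1 = MIGRATE_RECLAIMABLE, 2 = MIGRATE_MOVABLE */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}

The result selects which free_list within each free_area the allocation will draw from.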


/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,//iterate over the zonelist
						high_zoneidx, nodemask) {
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))//ask the zonelist cache whether this zone is worth trying at all
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))//check the cpuset and allowed nodes; the function's header comment explains this well
				goto try_next_zone;//requirements not met, check the next zone

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {//a watermark check is required; otherwise jump straight to the actual allocation
			unsigned long mark;
			int ret;

			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			//check whether this zone's watermark is satisfied; the effective watermark is
			//computed differently depending on ALLOC_HIGH and ALLOC_HARDER (see the sketch
			//after this listing); if it is satisfied, jump to the actual allocation,
			//otherwise keep checking
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				goto try_this_zone;

			if (zone_reclaim_mode == 0)//reclaim from this zone is not allowed: mark it full in the zonelist cache to save time on the next pass
				goto this_zone_full;

			ret = zone_reclaim(zone, gfp_mask, order);//start reclaim; if we may not wait, this returns ZONE_RECLAIM_NOSCAN,
			//otherwise, for the local zone or a zone not associated with another processor, __zone_reclaim does the reclaiming
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN://nothing was scanned, go straight to the next zone
				/* did not scan */
				goto try_next_zone;
			case ZONE_RECLAIM_FULL://scanned, but no space was freed up
				/* scanned but unreclaimable */
				goto this_zone_full;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,//reclaim ran; re-check whether the watermark is now satisfied
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:
		page = buffered_rmqueue(preferred_zone, zone, order,//all checks passed: do the actual allocation from this zone
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
try_next_zone:
		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
			/*
			 * we do zlc_setup after the first zone is tried but only
			 * if there are multiple nodes make it worthwhile
			 */
			allowednodes = zlc_setup(zonelist, alloc_flags);//set up/refresh the zonelist cache
			zlc_active = 1;
			did_zlc_setup = 1;
		}
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}
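
For completeness, here is the watermark check referenced above. zone_watermark_ok() starts from the selected watermark, relaxes it for ALLOC_HIGH and ALLOC_HARDER allocations, and additionally requires that enough free pages survive at every order below the requested one. A lightly annotated version, roughly as it appears in kernels of this vintage:

int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
	int o;

	if (alloc_flags & ALLOC_HIGH)//high-priority allocation: accept half the watermark
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)//harder allocation: shave off another quarter
		min -= min / 4;

	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return 0;
	for (o = 0; o < order; o++) {
		/* at the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;

		/* require fewer higher-order pages to be free */
		min >>= 1;

		if (free_pages <= min)
			return 0;
	}
	return 1;
}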

/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {//a single page: take it straight from the per-CPU pageset (pcp)
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);//disable interrupts and save the irq state
		pcp = &this_cpu_ptr(zone->pageset)->pcp;//get the pcp pointer
		list = &pcp->lists[migratetype];//pick the pcp list for the requested migrate type
		if (list_empty(list)) {//the list is empty:
			pcp->count += rmqueue_bulk(zone, 0,//replenish it with batch pages from the buddy free_area
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)//a cold page is unlikely to be in the CPU cache, e.g. one destined for DMA;
			page = list_entry(list->prev, struct page, lru);//cold pages come from the tail of the list
		else//a hot page is likely still cache-hot, which helps performance;
			page = list_entry(list->next, struct page, lru);//hot pages come from the head of the list

		list_del(&page->lru);//unlink the page from the pcp list (linked via its lru list head)
		pcp->count--;//pcp->count tracks how many pages this pcp holds
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);//take the zone lock before touching the buddy lists
		page = __rmqueue(zone, order, migratetype);//the real allocation from the buddy free_area lists; two cases:
		//1. __rmqueue_smallest(): if the migrate type list required by the allocation policy has a free block
		//at this order or above, allocate from the first block that fits and return the remainder to the
		//lower-order lists (see the sketch after this listing)
		//2. __rmqueue_fallback(): if the matching migrate type lists are empty, fall back to other type lists
		//in a fixed fallback order, move some space over to this migrate type, then split and merge just as
		//__rmqueue_smallest does
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);//update this CPU's VM event counters
	zone_statistics(preferred_zone, zone);//update the zone statistics
	local_irq_restore(flags);//re-enable interrupts and restore the irq state

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))//prep_new_page() rejects pages in a bad state (e.g. still mapped or with bad flags); if so, try the allocation again
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
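
The first of the two __rmqueue() cases is short enough to show in full. __rmqueue_smallest() scans upward from the requested order until it finds a non-empty free list of the right migrate type, takes the first block, and lets expand() hand the unused halves back to the lower-order lists. Roughly as it appears in kernels of this vintage:

static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area *area;
	struct page *page;

	/* find a block of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);
		if (list_empty(&area->free_list[migratetype]))
			continue;//nothing at this order, try a bigger block

		page = list_entry(area->free_list[migratetype].next,
							struct page, lru);
		list_del(&page->lru);//take the block off the free list
		rmv_page_order(page);//clear its buddy order marking
		area->nr_free--;
		expand(zone, page, order, current_order, area, migratetype);//give the remainder back to lower orders
		return page;
	}

	return NULL;
}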
