以32位系統爲例
內存分配的函數調用流程:
alloc_pages()
-->alloc_pages_node()
-->__alloc_pages_node()
-->__alloc_pages()
-->__alloc_pages_nodemask()
-->get_page_from_freelist()
在get_page_from_freelist()中,首先會遍歷當前zone,按照HIGHMEM->NORMAL的方向進行遍歷,判斷當前zone是否能夠進行內存分配的條件是首先判斷free memory是否滿足low water mark水位值,如果不滿足則進行一次快速的內存回收操作,然後再次檢測是否滿足low water mark,如果還是不能滿足,相同步驟遍歷下一個zone,如果兩個zone都不滿足,get_page_from_freelist()函數返回NULL。
快速內存回收機制:
node_reclaim()
-->__node_reclaim()-----此處指定每輪進行回收的頁面數上限爲需要回收的頁面數和32兩者中的較大值;快速回收不能進行unmap、writeback操作;回收priority爲4,即最多嘗試調用shrink_node進行回收的次數爲priority值,直到回收到的頁數達到需要分配的內存頁數或者完成4次循環爲止,也就是最多能夠回收128頁
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
const unsigned long nr_pages = 1 << order;
………………
struct scan_control sc = {//內存回收的條件
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = memalloc_noio_flags(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
……………………
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
shrink_node(pgdat, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);//每一輪是否滿足條件
}
}
-->shrink_node()-----這裏會對系統中存在的每一個memcg對應的node進行一次內存回收操作,然後更新這一次vmpressure掃描和回收的值,直到掃描完所有的memcg或者回收到足夠的頁面;在完成所有memcg的掃描或者回收到最多32頁後,會調用vmpressure函數,根據這一輪內存回收掃描的總頁數以及回收到的頁數來計算當前內存的壓力值,再根據掃描的頁數是否大於512個,決定是否將壓力傳到native進程lmkd,以確定是否啓動lmkd進行進程清理操作
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
do {
……………………………………
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
if (memcg)
shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->nr_scanned - scanned,
lru_pages);//這裏會叫lowmemorykiller起來
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);//更新回收時的掃描和已回收的頁面數
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;//回收的頁面數達到標準則跳出循環
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
if (global_reclaim(sc))
shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
sc->nr_scanned - nr_scanned,
node_lru_pages);//這裏當memcg不存在時會叫起來lowmemorykiller
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);//可能會叫起來lmkd
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
-->shrink_node_memcg()-----調用get_scan_count確定匿名頁和文件頁lru掃描的頁面數,按照計算好的頁數掃描各lru鏈表調用shrink_list進行內存回收,每次掃描32個頁面。
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *lru_pages)
{
get_scan_count(lruvec, memcg, sc, nr, lru_pages);
…………………………………………
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);//每次掃描32個頁面
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
……………………………………………
}
-->shrink_list()-----如果是活躍lru,會根據當前active lru與inactive lru頁數的比例關係判斷是否需要掃描活躍lru;如果是非活躍lru,則直接進行掃描
-->shrink_active_list()----分成3個臨時鏈表,從lru上面分離下來的存放在l_hold,要放到inactive lru的存放在l_inactive,而要放回active lru的存放在l_active,函數只會將被引用到的可執行文件頁放回到活躍lru,其他的全部移動到非活躍lru
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
{
………………
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);//分離頁面到l_hold
………………
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);//可執行文件頁放回active
continue;
}
}
ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);//其餘的全部添加到inactive
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);//添加到active lru
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);//添加到inactive lru
free_hot_cold_page_list(&l_hold, true);//剩餘的free掉
}
-->shrink_inactive_list()在fastpath裏面,這裏會將未被映射(unmapped)且非dirty的頁面釋放掉,而slowpath裏面例如kswapd或者direct reclaim則可以進行writeback或者unmap操作,主要調用shrink_page_list()完成
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);//初始化一個page_list
………………
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;//如果不進行unmap,則分離unmapped的頁面
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;//如果不能進行writeback,則分離clean的頁面
………………
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);//分離出指定類型的頁面
………………
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
&nr_dirty, &nr_unqueued_dirty, &nr_congested,
&nr_writeback, &nr_immediate,
false);//對page_list進行掃描,找到適合free的頁面
………………
putback_inactive_pages(lruvec, &page_list);//將pagelist剩餘的頁面按照page類型返回對應的lru
………………
free_hot_cold_page_list(&page_list, true);//free掉page_list中剩餘的頁面
………………
if (nr_writeback && nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);//如果writeback page過多,則置位PGDAT_WRITEBACK
………………
if (nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);//如果dirty page過多,則置位PGDAT_DIRTY
if (nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);//dirty page過多,則kswapd sleep 100ms
………………
return nr_reclaimed;
}
-->shrink_page_list()對符合條件的頁面進行回收,這裏對快速回收和slowpath的回收條件進行了區分。
static unsigned long shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
unsigned long *ret_nr_unqueued_dirty,
unsigned long *ret_nr_congested,
unsigned long *ret_nr_writeback,
unsigned long *ret_nr_immediate,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
………………
while (!list_empty(page_list)) {//逐頁掃描
………………
if (unlikely(!page_evictable(page)))//如果當前page不可回收
goto cull_mlocked;//直接添加到ret_pages list
if (!sc->may_unmap && page_mapped(page))//如果不允許unmap且當前頁面被進程映射
goto keep_locked;//unlock page,直接添加到ret_pages list
………………
page_check_dirty_writeback(page, &dirty, &writeback);//判斷page的狀態
if (dirty || writeback)
nr_dirty++;//根據page狀態,計數器+1
if (dirty && !writeback)
nr_unqueued_dirty++;
………………
if (PageWriteback(page)) {//如果頁面正在進行writeback
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {//如果當前是kswapd進程,且PGDAT_WRITEBACK被置位,則保留此頁面在inactive lru
nr_immediate++;//如果這個值不爲0,則說明當前正在進行大量回寫操作,kswapd sleep 100ms
goto keep_locked;
/* Case 2 above */
} else if (sane_reclaim(sc) ||
!PageReclaim(page) || !may_enter_fs) {
SetPageReclaim(page);
nr_writeback++;//非kswapd,則只是增加writeback頁面數
goto keep_locked;
/* Case 3 above */
}else {
unlock_page(page);
wait_on_page_writeback(page);
/* then go back and try same page again */
list_add_tail(&page->lru, page_list);
continue;
}
}
………………
if (PageDirty(page)) {//當前頁爲dirtypage
………………
if (page_is_file_cache(page) &&//對於file page,kswapd只會進行批量的回寫,而非kswapd不會進行回寫操作,只會把當前頁面保存在inactive lru
(!current_is_kswapd() ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
* except we already have the page isolated
* and know it's dirty
*/
inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto keep_locked;
}
………………
try_to_unmap_flush_dirty();
switch (pageout(page, mapping, sc)) {//對於匿名頁,則將其寫入交換分區,根據writeback操作,對page進行釋放還是保存在inactive&active lru
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
case PAGE_CLEAN:
; /* try to free the page below */
}
}
………………
free_it://將page添加到free_pages list
if (ret == SWAP_LZFREE)
count_vm_event(PGLAZYFREED);
nr_reclaimed++;
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
list_add(&page->lru, &free_pages);
continue;
cull_mlocked://將page添加到ret_pages list
if (PageSwapCache(page))
try_to_free_swap(page);//匿名頁就free掉swap空間,page被放回內存
unlock_page(page);
list_add(&page->lru, &ret_pages);
continue;
activate_locked://設置page爲active
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
pgactivate++;
keep_locked://unlock page
unlock_page(page);
keep://將page返回ret_page list
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}//end while loop
………………
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);//free掉free_page list的page
list_splice(&ret_pages, page_list);//將剩餘的ret_page list拷貝到page_list中
count_vm_events(PGACTIVATE, pgactivate);
}
slowpath:
在執行內存分配的路徑上:
__alloc_pages_nodemask()
-->get_page_from_freelist()-----這裏如果無法分配到內存或者執行fast path仍然無法回收到內存則會進入下一步:slow path
-->__alloc_pages_slowpath()-----內存回收的slowpath,主要的內存回收操作函數
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
……………………
alloc_flags = gfp_to_alloc_flags(gfp_mask);//此時alloc_flag變爲ALLOC_WMARK_MIN
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);//可以進行kswapd內存回收,則啓動kswapd
……………………
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);//kswapd完後,看能否分配到內存
……………………
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);//如果還是分配不到內存,則進行直接回收
……………………
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);//直接回收還是分配不到內存,則進行內存規整
}
kswapd:
kswapd實際調用balance_pgdat()來進行內存回收:
-->balance_pgdat()首先設定回收的參數,然後按照priority的值設定循環回收的次數,最後調用kswapd_shrink_node()完成內存回收
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.priority = DEF_PRIORITY,//priority爲12
.may_writepage = !laptop_mode,//可以進行writeback
.may_unmap = 1,//可以unmap操作
.may_swap = 1,//可以swap
};
………………
do {
………………
for (i = classzone_idx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
if (zone_balanced(zone, sc.order, classzone_idx))
goto out;//判斷當前water Mark是否滿足low watermark的水準,滿足則說明能夠進行內存分配,不進行內存回收,直接退出
}
………………
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);//此處也會進行內存回收,調用mem_cgroup_shrink_node()完成,最多循環100次
………………
if (kswapd_shrink_node(pgdat, &sc))//kswapd主功能函數
raise_priority = false;
………………
if (raise_priority || !nr_reclaimed)//回收不到,則加大掃描粒度
sc.priority--;
} while (sc.priority >= 1);
}
-->kswapd_shrink_node()-----確定每一輪要回收的頁面數最大值爲high watermark,然後調用shrink_node()進行內存回收
static bool kswapd_shrink_node(pg_data_t *pgdat,
struct scan_control *sc)
{
………………
sc->nr_to_reclaim = 0;
for (z = 0; z <= sc->reclaim_idx; z++) {
zone = pgdat->node_zones + z;
if (!managed_zone(zone))
continue;
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);//設置每輪回收的最大數目
}
shrink_node(pgdat, sc);//內存回收主函數
………………
}
-->shrink_node()執行步驟和fast path一致
不再一一贅述。
direct reclaim:
__alloc_pages_direct_reclaim()進行一輪的直接回收後,再嘗試分配內存。
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
………………
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);-----直接回收主函數
………………
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);-----嘗試分配內存
}
__perform_reclaim()->try_to_free_pages()設置直接回收條件,然後調用do_try_to_free_pages()完成直接回收:
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,//每輪回收內存頁數爲32
.gfp_mask = memalloc_noio_flags(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,//priority爲12
.may_writepage = !laptop_mode,//允許writeback/unmap/swap操作
.may_unmap = 1,
.may_swap = 1,
};
………………
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);//完成直接回收
………………
}
-->do_try_to_free_pages()-----主要先update當前內存壓力值爲critical,然後再調用shrink_zones()進行內存回收:
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
………………
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);//update當前壓力爲critical
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);//遍歷zone,回收內存
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;//回收到了足夠內存,則跳出循環
………………
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {//當scan的頁面多於48頁時會進行writeback操作
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
} while (--sc->priority >= 0);
}
shrink_zones()
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
for_each_zone_zonelist_nodemask(zone, z, zonelist,
sc->reclaim_idx, sc->nodemask) {---遍歷每個zone
………………
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
sc->order, sc->gfp_mask,
&nr_soft_scanned);//和之前的函數一致
………………
shrink_node(zone->zone_pgdat, sc);-----和之前的shrink_node流程一致
}
}
下面對fast path,kswapd以及direct reclaim區別進行總結:
1. fast path的priority值爲4,priority每減小1,回收的內存頁數最大爲32,即最大能回收128頁;kswapd的priority值爲12,priority每減少1,需要回收的頁數爲各個zone的high watermark值之和;direct reclaim的priority值爲12,priority每減少1,需要回收的頁數爲32,即最大能回收的頁數爲384。
2. fast path不會進行writeback,unmap操作,kswapd和direct reclaim會。
3. kswapd會進行批量的writeback操作,direct reclaim在每一輪掃描的頁面大於48時會喚醒flush線程進行回寫操作,fast path不會進行writeback操作。
4. 系統在內存回收過程中,在kswapd或者直接回收完成後都會嘗試調用get_page_from_freelist()分配內存;這時如果還是沒有滿足watermark min,則會啓動fast path,對之前啓動清理進程釋放出的大量unmapped、clean的頁面進行回收;回收完成後再次判斷,如果發現free memory還是低於watermark min,則啓動direct reclaim。這也就是低於watermark low,啓動kswapd,低於watermark min,啓動direct reclaim的由來。
5. 另由於最近memcg打開,如果系統當前memcg個數增多,也會嚴重增加kswapd的運行時間。