以32位系统为例
内存分配流程调用流程:
alloc_pages()
-->alloc_pages_node()
-->__alloc_pages_node()
-->__alloc_pages()
-->__alloc_pages_nodemask()
-->get_page_from_freelist()
在get_page_from_freelist()中,首先会遍历当前zone,按照HIGHMEM->NORMAL的方向进行遍历,判断当前zone是否能够进行内存分配的条件是首先判断free memory是否满足low water mark水位值,如果不满足则进行一次快速的内存回收操作,然后再次检测是否满足low water mark,如果还是不能满足,相同步骤遍历下一个zone,如果两个zone都不满足,get_page_from_freelist()函数返回NULL。
快速内存回收机制:
node_reclaim()
-->__node_reclaim()-----此处指定每轮进行回收的页面最大值为取需要回收的页面数和32的最大值,快速回收不能进行unmap,writeback操作,回收priority为4,即最多尝试调用shrink_node进行回收的次数为priority值,直到回收到的页数达到需要分配的内存页数或者完成4次循环为止,也就是最多能够回收128页
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
const unsigned long nr_pages = 1 << order;
………………
struct scan_control sc = {//内存回收的条件
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = memalloc_noio_flags(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
};
……………………
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do {
shrink_node(pgdat, &sc);
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);//每一轮是否满足条件
}
}
-->shrink_node()-----这里会对系统中存在的每一个memcg对应的node进行一次内存回收操作,然后更新这一次vmpressure扫描和回收的值,直到扫描完所有的memcg或者回收到足够的页面数,在完成所有的memcg的扫描或者回收到最多32页后会调用vmpressure函数根据这一轮内存回收扫描的总页数以及回收到的页数来计算当前内存的压力值,再根据扫描的页数是否大于512个,决定是否将压力传到native进程lmkd确定是否启动lmkd进行进程清理操作
static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
do {
……………………………………
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
if (memcg)
shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->nr_scanned - scanned,
lru_pages);//这里会叫lowmemorykiller起来
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);//更新回收时的扫描和已回收的页面数
if (!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;//回收的页面数达到标准则跳出循环
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
if (global_reclaim(sc))
shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
sc->nr_scanned - nr_scanned,
node_lru_pages);//这里当memcg不存在时会叫起来lowmemorykiller
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);//可能会叫起来lmkd
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
-->shrink_node_memcg()-----调用get_scan_count确定匿名页和文件页lru扫描的页面数,按照计算好的页数扫描各lru链表调用shrink_list进行内存回收,每次扫描32个页面。
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *lru_pages)
{
get_scan_count(lruvec, memcg, sc, nr, lru_pages);
…………………………………………
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);//每次扫描32个页面
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
……………………………………………
}
-->shrink_list()-----如果是活跃lru会进行判断,当前active lru的页数和inactive lru的比例关系进行判断是否需要扫描活跃,如果是非活跃lru,则直接进行扫描
-->shrink_active_list()----分成3个临时链表,从lru上面分离下来的存放在l_hold,要放到inactive lru的存放在l_inactive,而要放回active lru的存放在l_active,函数只会将被引用到的可执行文件页放回到活跃lru,其他的全部移动到非活跃lru
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
{
………………
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);//分离页面到l_hold
………………
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);//可执行文件页放回active
continue;
}
}
ClearPageActive(page); /* we are de-activating */
list_add(&page->lru, &l_inactive);//其余的全部添加到inactive
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);//添加到active lru
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);//添加到inactive lru
free_hot_cold_page_list(&l_hold, true);//剩余的free掉
}
-->shrink_inactive_list()在fastpath里面,这里会将没有mapping,非dirty页面释放掉,而slowpath里面例如kswapd或者direct reclaim则可以进行writeback或者unmap操作,主要调用shrink_page_list()完成
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);//初始化一个page_list
………………
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;//如果不进行unmap,则分离unmapped的页面
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;//如果不能进行writeback,则分离clean的页面
………………
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);//分离出指定类型的页面
………………
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
&nr_dirty, &nr_unqueued_dirty, &nr_congested,
&nr_writeback, &nr_immediate,
false);//对page_list进行扫描,找到适合free的页面
………………
putback_inactive_pages(lruvec, &page_list);//将pagelist剩余的页面按照page类型返回对应的lru
………………
free_hot_cold_page_list(&page_list, true);//free掉剩余的页面
………………
if (nr_writeback && nr_writeback == nr_taken)
set_bit(PGDAT_WRITEBACK, &pgdat->flags);//如果writeback page过多,则置位PGDAT_WRITEBACK
………………
if (nr_unqueued_dirty == nr_taken)
set_bit(PGDAT_DIRTY, &pgdat->flags);//如果dirty page过多,则置位PGDAT_DIRTY
if (nr_immediate && current_may_throttle())
congestion_wait(BLK_RW_ASYNC, HZ/10);//dirty page过多,则kswapd sleep 100ms
………………
return nr_reclaimed;
}
-->shrink_page_list()对符合条件的页面进行回收,这里对快速回收和slowpath的回收条件进行了区分。
static unsigned long shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
unsigned long *ret_nr_unqueued_dirty,
unsigned long *ret_nr_congested,
unsigned long *ret_nr_writeback,
unsigned long *ret_nr_immediate,
bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
………………
while (!list_empty(page_list)) {//逐页扫描
………………
if (unlikely(!page_evictable(page)))//如果当前page不可回收
goto cull_mlocked;//直接添加到ret_pages list
if (!sc->may_unmap && page_mapped(page))//如果当前被进程引用,
goto keep_locked;//unlock page,直接添加到ret_pages list
………………
page_check_dirty_writeback(page, &dirty, &writeback);//判断page的状态
if (dirty || writeback)
nr_dirty++;//根据page状态,计数器+1
if (dirty && !writeback)
nr_unqueued_dirty++;
………………
if (PageWriteback(page)) {//如果页面正在进行writeback
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {//如果当前是kswapd进程,且PGDAT_WRITEBACK被置位,则保留此页面在inactive lru
nr_immediate++;//如果这个值不为0,则说明当前正在进行大量回写操作,kswapd sleep 100ms
goto keep_locked;
/* Case 2 above */
} else if (sane_reclaim(sc) ||
!PageReclaim(page) || !may_enter_fs) {
SetPageReclaim(page);
nr_writeback++;//非kswapd,则只是增加writeback页面数
goto keep_locked;
/* Case 3 above */
}else {
unlock_page(page);
wait_on_page_writeback(page);
/* then go back and try same page again */
list_add_tail(&page->lru, page_list);
continue;
}
}
………………
if (PageDirty(page)) {//当前页为dirtypage
………………
if (page_is_file_cache(page) &&//对于file page,kswapd只会进行批量的回写,而非kswapd不会进行回写操作,只会把当前页面保存在inactive lru
(!current_is_kswapd() ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
* except we already have the page isolated
* and know it's dirty
*/
inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
SetPageReclaim(page);
goto keep_locked;
}
………………
try_to_unmap_flush_dirty();
switch (pageout(page, mapping, sc)) {//对于匿名页,则将其写入交换分区,根据writeback操作,对page进行释放还是保存在inactive&active lru
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
*/
if (!trylock_page(page))
goto keep;
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
case PAGE_CLEAN:
; /* try to free the page below */
}
}
………………
free_it://将page添加到free_pages list
if (ret == SWAP_LZFREE)
count_vm_event(PGLAZYFREED);
nr_reclaimed++;
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
list_add(&page->lru, &free_pages);
continue;
cull_mlocked://将page添加到ret_pages list
if (PageSwapCache(page))
try_to_free_swap(page);//匿名页就free掉swap空间,page被放回内存
unlock_page(page);
list_add(&page->lru, &ret_pages);
continue;
activate_locked://设置page为active
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
pgactivate++;
keep_locked://unlock page
unlock_page(page);
keep://将page返回ret_page list
list_add(&page->lru, &ret_pages);
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}//end while loop
………………
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);//free掉free_page list的page
list_splice(&ret_pages, page_list);//将剩余的ret_page list拷贝到page_list中
count_vm_events(PGACTIVATE, pgactivate);
}
slowpath:
在执行内存分配的路径上:
__alloc_pages_nodemask()
-->get_page_from_freelist()-----这里如果无法分配到内存或者执行fast path仍然无法回收到内存则会进入下一步:slow path
-->__alloc_pages_slowpath()-----内存回收的slowpath,主要的内存回收操作函数
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
……………………
alloc_flags = gfp_to_alloc_flags(gfp_mask);//此时alloc_flag变为ALLOC_WMARK_MIN
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);//可以进行kswapd内存回收,则启动kswapd
……………………
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);//kswapd完后,看能否分配到内存
……………………
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);//如果还是分配不到内存,则进行直接回收
……………………
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);//直接回收还是分配不到内存,则进行内存规整
}
kswapd:
kswapd实际调用balance_pgdat()来进行内存回收:
-->balance_pgdat()首先设定回收的参数,然后按照priority的值设定循环回收的次数,最后调用kswapd_shrink_node()完成内存回收
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.priority = DEF_PRIORITY,//priority为12
.may_writepage = !laptop_mode,//可以进行writeback
.may_unmap = 1,//可以unmap操作
.may_swap = 1,//可以swap
};
………………
do {
………………
for (i = classzone_idx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
if (zone_balanced(zone, sc.order, classzone_idx))
goto out;//判断当前watermark是否满足low watermark的水准,满足则说明能够进行内存分配,不进行内存回收,直接退出
}
………………
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
sc.gfp_mask, &nr_soft_scanned);//此处也会进行内存回收,调用mem_cgroup_shrink_node()完成,最多循环100次
………………
if (kswapd_shrink_node(pgdat, &sc))//kswapd主功能函数
raise_priority = false;
………………
if (raise_priority || !nr_reclaimed)//回收不到,则加大扫描粒度
sc.priority--;
} while (sc.priority >= 1);
}
-->kswapd_shrink_node()-----确定每一轮要回收的页面数最大值为high watermark,然后调用shrink_node()进行内存回收
static bool kswapd_shrink_node(pg_data_t *pgdat,
struct scan_control *sc)
{
………………
sc->nr_to_reclaim = 0;
for (z = 0; z <= sc->reclaim_idx; z++) {
zone = pgdat->node_zones + z;
if (!managed_zone(zone))
continue;
sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);//设置每轮回收的最大数目
}
shrink_node(pgdat, sc);//内存回收主函数
………………
}
-->shrink_node()执行步骤和fast path一致
不再一一赘述。
direct reclaim:
__alloc_pages_direct_reclaim()进行一轮的直接回收后,再尝试分配内存。
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
………………
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);//直接回收主函数
………………
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);//尝试分配内存
}
__perform_reclaim()->try_to_free_pages()设置直接回收条件,然后调用do_try_to_free_pages()完成直接回收:
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,//每轮回收内存页数为32
.gfp_mask = memalloc_noio_flags(gfp_mask),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,//priority为12
.may_writepage = !laptop_mode,//允许writeback/unmap/swap操作
.may_unmap = 1,
.may_swap = 1,
};
………………
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);//完成直接回收
………………
}
-->do_try_to_free_pages()-----主要先update当前内存压力值为critical,然后再进行调用shrink_zones()进行内存回收:
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
………………
do {
vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
sc->priority);//update当前压力为critical
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);//遍历zone,回收内存
total_scanned += sc->nr_scanned;
if (sc->nr_reclaimed >= sc->nr_to_reclaim)
break;//会收到了足够内存,则跳出循环
………………
writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
if (total_scanned > writeback_threshold) {//当scan的页面多于48页时会进行writeback操作
wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
} while (--sc->priority >= 0);
}
shrink_zones()
static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
for_each_zone_zonelist_nodemask(zone, z, zonelist,
sc->reclaim_idx, sc->nodemask) {//遍历每个zone
………………
nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
sc->order, sc->gfp_mask,
&nr_soft_scanned);//和之前的函数一致
………………
shrink_node(zone->zone_pgdat, sc);//和之前的shrink_node流程一致
}
}
下面对fast path,kswapd以及direct reclaim区别进行总结:
1. fast path设置priority值每减小1,回收的内存数最大为32,priority的值为4,即最大能回收128个,kswapd priority每减少1,需要回收的页数为每个zone的high watermark值,priority的值为12,direct reclaim priority每减少1,需要回收的页数为32,priority的值为12,即最大能回收的页数为384。
2. fast path不会进行writeback,unmap操作,kswapd和direct reclaim会。
3. kswapd会进行批量的writeback操作,direct reclaim在每一轮扫描的页面大于48时会唤醒flush线程进行回写操作,fast path不会进行writeback操作。
4. 系统在内存回收过程中在kswapd或者直接回收完成后都会尝试get_page_from_freelist()分配内存,这时如果还是没有满足watermark min,则会启动fast path对之前启动清理进程释放的大量的unmap,clean的页面进行回收,回收完成后再次判断,发现空闲内存还是低于watermark min,则启动direct reclaim。这也就是低于watermark low,启动kswapd,低于watermark min,启动direct reclaim的由来。
5. 另由于最近memcg打开,如果系统当前memcg个数增多,也会严重增加kswapd的运行时间。