日期 | 內核版本 | 架構 | 作者 | GitHub | CSDN |
---|---|---|---|---|---|
2017-07-04 | Linux-4.12 | X86 | lwhuq | LinuxMemoryStudy | Linux內存管理 |
1 zone 結構
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
/*
 * Watermarks used by page reclaim. When free pages are above
 * watermark[WMARK_HIGH] the zone is in good shape. When free pages
 * drop below watermark[WMARK_LOW], the kswapd daemon is woken to
 * start reclaiming pages. Below watermark[WMARK_MIN] reclaim
 * pressure is severe and the kernel must take immediate measures
 * to relieve it.
 */
unsigned long watermark[NR_WMARK];
unsigned long nr_reserved_highatomic;
/*
 * We don't know if the memory that we're going to allocate will be
 * freeable or/and it will be released eventually, so to avoid totally
 * wasting several GB of ram we must reserve some of the lower zone
 * memory (otherwise we risk to run OOM on the lower zones despite
 * there being tons of freeable ram on the higher zones). This array is
 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
 * changes.
 */
/*
 * Number of page frames this zone must keep back for each higher
 * zone when memory runs critically low; also backs atomic allocation
 * requests issued from interrupt or critical context (allocations
 * that must not block).
 */
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat; //descriptor of the parent node this zone belongs to
struct per_cpu_pageset __percpu *pageset; //per-CPU hot/cold page cache
#ifndef CONFIG_SPARSEMEM
/*
 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
 * In SPARSEMEM, this map is stored in struct mem_section
 */
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn; //page frame number of the zone's first page frame
/*
 * spanned_pages is the total pages spanned by the zone, including
 * holes, which is calculated as:
 * spanned_pages = zone_end_pfn - zone_start_pfn;
 *
 * present_pages is physical pages existing within the zone, which
 * is calculated as:
 * present_pages = spanned_pages - absent_pages(pages in holes);
 *
 * managed_pages is present pages managed by the buddy system, which
 * is calculated as (reserved_pages includes pages allocated by the
 * bootmem allocator):
 * managed_pages = present_pages - reserved_pages;
 *
 * So present_pages may be used by memory hotplug or memory power
 * management logic to figure out unmanaged pages by checking
 * (present_pages - managed_pages). And managed_pages should be used
 * by page allocator and vm scanner to calculate all kinds of watermarks
 * and thresholds.
 *
 * Locking rules:
 *
 * zone_start_pfn and spanned_pages are protected by span_seqlock.
 * It is a seqlock because it has to be read outside of zone->lock,
 * and it is done in the main allocator path. But, it is written
 * quite infrequently.
 *
 * The span_seq lock is declared along with zone->lock because it is
 * frequently read in proximity to zone->lock. It's good to
 * give them a chance of being in the same cacheline.
 *
 * Write access to present_pages at runtime should be protected by
 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
 * present_pages should get_online_mems() to get a stable value.
 *
 * Read access to managed_pages should be safe because it's unsigned
 * long. Write access to zone->managed_pages and totalram_pages are
 * protected by managed_page_count_lock at runtime. Idealy only
 * adjust_managed_page_count() should be used instead of directly
 * touching zone->managed_pages and totalram_pages.
 */
unsigned long managed_pages; //pages managed by the buddy allocator: present_pages - reserved_pages
unsigned long spanned_pages; //total page frames spanned by the zone, holes included
unsigned long present_pages; //physical page frames actually present in the zone, holes excluded
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
 * Number of isolated pageblock. It is used to solve incorrect
 * freepage counting problem due to racy retrieving migratetype
 * of pageblock. Protected by zone->lock.
 */
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
/* buddy allocator free lists: each array element tracks free blocks of one fixed order of contiguous page frames */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags; //current state of the zone
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
 * When free pages are below this point, additional steps are taken
 * when reading the number of free pages to avoid per-cpu counter
 * drift allowing watermarks to be breached
 */
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
/*
 * On compaction failure, 1<<compact_defer_shift compactions
 * are skipped before trying again. The number attempted since
 * last failure is tracked with compact_considered.
 */
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
1.1 ZONE_PADDING
該結構比較特殊的地方是它由ZONE_PADDING分隔的幾個部分。這個是爲了讓一些有lock要求的數據處於不同的cache line。
zone結構內有些數據是需要鎖的,即它們可能被多個CPU同時訪問。zone結構內有兩鎖,一個是自旋鎖zone->lock,另一個是順序鎖span_seqlock。
多處理系統cache還有一個cache一致性問題。當一個CPU對某個內存的cache line做出修改時,這個內存在別的CPU的cache line上面的數據就變成invalid的了,需要同步update。
當兩個鎖要保護的數據在同一個cache line時,兩個CPU通過zone的兩個鎖去獲得了對zone結構的訪問權限,然後都對數據做修改。當他們要操作的數據在一個cache line裏面的時候,每個CPU的cache line都會有一份該數據的緩存,當每一個CPU對數據做修改時都會導致另一個CPU cache裏面的數據invalid而需要同步update。而zone結構的訪問是非常頻繁的,cache line的同步更新會對性能有很大的影響。
即便只有一個鎖,鎖保護的數據也最好位於同一個cache line裏面,這樣需要同步update的數據只佔一個cache line;而如果鎖保護的數據跨越了cache line,需要同步update的工作量就會翻番。
對這個問題的解決辦法就是通過ZONE_PADDING把不同鎖要保護的數據放在不同的cache line。儘量減少不必要的cache line 同步update。
ZONE_PADDING的定義在include/linux/mmzone.h#L112
/*
 * Zero-size padding member: on SMP builds it pushes the fields that
 * follow it in struct zone onto a new cache line (via the alignment
 * attribute below); on UP builds it compiles away entirely.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name) struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif
內核還用了____cacheline_internodealigned_in_smp來實現最優的高速緩存行對齊方式,其定義在include/linux/cache.h#L70
/*
 * Align the annotated object to the internode cache-line size on SMP
 * so it starts on its own cache line; expands to nothing on UP builds.
 */
#if !defined(____cacheline_internodealigned_in_smp)
#if defined(CONFIG_SMP)
#define ____cacheline_internodealigned_in_smp \
__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))
#else
#define ____cacheline_internodealigned_in_smp
#endif
#endif
2 水印
水印的類型zone_watermarks定義在include/linux/mmzone.h#L255
/* Indexes into zone->watermark[] */
enum zone_watermarks {
WMARK_MIN, /* below this, reclaim pressure is critical */
WMARK_LOW, /* below this, kswapd is woken for background reclaim */
WMARK_HIGH, /* above this, the zone has plenty of free pages */
NR_WMARK
};
可以把物理內存想象成一個水池。它有三個水印刻度,當水池的水達到最高水印刻度watermark[WMARK_HIGH],水池水很充足不用擔心沒水的問題。當水池的水少於watermark[WMARK_LOW]時,得趕緊打開水龍頭放點水進來。當水少於watermark[WMARK_MIN]時,開一個水龍頭放水都不管用了,得想更多的辦法,再拖個水管過來。系統進程kswapd就是這個水龍頭,用於頁框交換回收。
/* Accessor macros for the per-zone reclaim watermarks */
#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
2.1 水印初始化
內存域結構內的水印(watermark數組)由setup_per_zone_wmarks初始化,該函數在內核啓動或者min_free_kbytes被修改時期被系統調用。其定義在mm/page_alloc.c#L6945
/*
 * Recompute every zone's watermark[] values; called at boot and
 * whenever min_free_kbytes changes. The mutex serializes updaters so
 * only one thread modifies the watermarks at a time.
 */
void setup_per_zone_wmarks(void)
{
mutex_lock(&zonelists_mutex);
__setup_per_zone_wmarks();
mutex_unlock(&zonelists_mutex);
}
- 互斥量保證同一時間只有一個線程可以修改水印參數
- setup_per_zone_wmarks真正實現是在__setup_per_zone_wmarks中。
/*
 * Distribute the global min_free_kbytes budget across all zones and
 * derive each zone's WMARK_MIN/LOW/HIGH from its share.
 */
static void __setup_per_zone_wmarks(void)
{
/*
 * min_free_kbytes is a global sysctl expressed in KB;
 * pages_min converts it to page-frame units.
 */
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
/* Usable pages of all non-highmem zones: excludes holes and bootmem-reserved pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags); //take the zone lock before updating shared watermark state
tmp = (u64)pages_min * zone->managed_pages;
/*
 * tmp = tmp/lowmem_pages
 * = (u64)pages_min * zone->managed_pages / (zone->managed_pages + otherzone->managed_pages)
 * i.e. hand out the global min_free_kbytes budget to each zone in
 * proportion to its managed_pages.
 * (The original transcription was missing this comment's closing
 * terminator, which broke compilation.)
 */
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
 * need highmem pages, so cap pages_min to a small
 * value here.
 *
 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
 * deltas control asynch page reclaim, and so should
 * not be capped for highmem.
 */
unsigned long min_pages;
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
 * If it's a lowmem zone, reserve a number of pages
 * proportionate to the zone's size.
 */
zone->watermark[WMARK_MIN] = tmp;
}
/*
 * Set the kswapd watermarks distance according to the
 * scale factor in proportion to available memory, but
 * ensure a minimum size on small systems.
 */
tmp = max_t(u64, tmp >> 2,
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
- 對於ZONE_HIGHMEM內存域,水印不是特別重要,把watermark[WMARK_MIN]設置成一個較小的值即可。並限制在32個頁框到128個頁框之間。
#define SWAP_CLUSTER_MAX 32UL
- 對於非ZONE_HIGHMEM內存域,watermark[WMARK_MIN]就是min_free_kbytes按各內存域比例,並把單位轉換成頁框。
- watermark[WMARK_LOW] = watermark[WMARK_MIN] + max(各內存域min_free份額/4, managed_pages × watermark_scale_factor / 10000)
- watermark[WMARK_HIGH] = watermark[WMARK_MIN] + 2 × max(各內存域min_free份額/4, managed_pages × watermark_scale_factor / 10000)
- calculate_totalreserve_pages用於更新每個結點的totalreserve_pages參數。
- setup_zone_migrate_reserve函數被刪除了?
2.2 lowmem_reserve
kernel在分配內存時,如果失敗就會嘗試下一個低級的zone(這裏的低級僅僅指zone內存的位置,實際上低地址zone是更稀缺的資源)。我們可以想像應用進程通過內存映射申請Highmem 並且加mlock分配,如果此時Highmem zone無法滿足分配,則會嘗試從Normal進行分配。這就有一個問題,來自Highmem的請求可能會耗盡Normal zone的內存,而且由於mlock又無法回收,最終的結果就是Normal zone無內存提供給kernel的正常分配,而Highmem有大把的可回收內存無法有效利用。因此針對這個case,使得Normal zone在碰到來自Highmem的分配請求時,可以通過lowmem_reserve聲明:可以使用我的內存,但是必須要保留lowmem_reserve[NORMAL]給我自己使用。同樣當從Normal失敗後,會嘗試從zonelist中的DMA申請分配,通過lowmem_reserve[DMA],限制來自HIGHMEM和Normal的分配請求。
內存域結構內的lowmem_reserve數組由setup_per_zone_lowmem_reserve初始化,該函數在內核啓動或者系統調用時期被調用。其定義在mm/page_alloc.c#L6844
/*
 * setup_per_zone_lowmem_reserve - called whenever
 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
 * has a correct pages reserved value, so an adequate number of
 * pages are left in the zone after a successful __alloc_pages().
 */
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type j, idx;
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long managed_pages = zone->managed_pages;
/* a zone never reserves pages against allocations from itself */
zone->lowmem_reserve[j] = 0;
idx = j;
/* walk every zone below j, accumulating managed pages of all zones above it */
while (idx) {
struct zone *lower_zone;
idx--;
/* a ratio below 1 would be a division hazard/over-reserve; clamp it */
if (sysctl_lowmem_reserve_ratio[idx] < 1)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
/* pages zone idx keeps back from allocations that fall back from zone j */
lower_zone->lowmem_reserve[j] = managed_pages /
sysctl_lowmem_reserve_ratio[idx];
managed_pages += lower_zone->managed_pages;
}
}
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
- lowmem_reserve計算的基本算法是根據高級別內存域的可用頁框數除以一個ratio。例如lowmem_reserve[ZONE_DMA] = (managed_pages sum of ZONE_NORMAL and ZONE_HIGH, etc) / sysctl_lowmem_reserve_ratio[ZONE_DMA]
- calculate_totalreserve_pages用於更新每個結點的totalreserve_pages參數
/*
 * Default per-zone divisors for lowmem_reserve:
 * reserve = (sum of higher zones' managed_pages) / ratio.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
256,
#endif
#ifdef CONFIG_ZONE_DMA32
256,
#endif
#ifdef CONFIG_HIGHMEM
32,
#endif
32,
};
2.3 冷熱頁-每CPU頁高速緩存
結構zone中的pageset成員用於實現冷熱分配器(Hot-N-Cold Pages),也叫做每CPU頁框高速緩存。內核說頁是熱的,意味着頁已經加載到CPU高速緩存;相反,冷頁則不在高速緩存中。在SMP系統中每個CPU都有一個或多個高速緩存,因此每個CPU的管理都是獨立的。
struct zone {
struct per_cpu_pageset __percpu *pageset; //per-CPU page-frame cache
};
pageset是一個指針,指向的類型是per_cpu_pageset數組,定義在include/linux/mmzone.h#L275
struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
其中struct per_cpu_pages是核心結構體,定義在include/linux/mmzone.h#L266
struct per_cpu_pages {
/* total number of pages on the lists below */
int count; /* number of pages in the list */
/* watermark: when count exceeds high, pages are drained back to the buddy system */
int high; /* high watermark, emptying needed */
/* number of pages moved to/from the buddy system in one batch */
int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */
/* each migrate type has its own hot/cold list: hot pages at the head, cold pages at the tail */
struct list_head lists[MIGRATE_PCPTYPES];
};
2.4 zonelist
當頁框申請沒有辦法在本內存域得到滿足時,內核需要考慮從其他結點或其他內存域去申請。每個結點pg_data_t內的node_zonelists結構數組定義了這個嘗試申請的順序。它按照一種方式定義了不同內存域的內存的優先級關係。
typedef struct pglist_data {
struct zonelist node_zonelists[MAX_ZONELISTS]; /* fallback order of zones to try for this node */
}pg_data_t;
內核定義了內存的一個層次結構, 首先試圖分配”廉價的”內存. 如果失敗, 則根據訪問速度和容量, 逐漸嘗試分配”更昂貴的”內存.
- 高端內存是最廉價的,因爲內核沒有任何部份依賴於從該內存域分配的內存。如果高端內存域用盡,對內核沒有任何副作用,這也是優先分配高端內存的原因。
- 其次是普通內存域, 這種情況有所不同。 許多內核數據結構必須保存在該內存域,而不能放置到高端內存域。因此如果普通內存完全用盡, 那麼內核會面臨緊急情況。所以只要高端內存域的內存沒有用盡,都不會從普通內存域分配內存。
- 最昂貴的是DMA內存域,因爲它用於外設和系統之間的數據傳輸。因此從該內存域分配內存是最後一招。
2.4.1 zonelist類型
/* Indexes into pg_data_t.node_zonelists[] */
enum {
ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
/*
 * The NUMA zonelists are doubled because we need zonelists that
 * restrict the allocations to a single node for __GFP_THISNODE.
 */
ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
MAX_ZONELISTS
};
- ZONELIST_FALLBACK是備用列表
- ZONELIST_NOFALLBACK是當前結點的備用列表
2.4.2 zonelist結構
zonelist結構定義在linux/mmzone.h#L580
/* Ordered list of candidate zones to try for an allocation */
struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; /* +1 presumably for a terminating entry — verify in kernel source */
};
對於每個結點內的每個內存域都有一個結構zoneref在zonelist內。
/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
結構zoneref定義在linux/mmzone.h#L561
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
2.4.3 zonelist order
/*
 * zonelist_order:
 * 0 = automatic detection of better ordering.
 * 1 = order by ([node] distance, -zonetype)
 * 2 = order by (-zonetype, [node] distance)
 *
 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 * the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT 0 /* auto-detect the better ordering */
#define ZONELIST_ORDER_NODE 1 /* node distance first, then zone type */
#define ZONELIST_ORDER_ZONE 2 /* zone type first, then node distance */
2.4.4 zonelist建立
函數build_all_zonelists負責初始化zonelist。定義在mm/page_alloc.c#L5206
/*
 * Called with zonelists_mutex held always
 * unless system_state == SYSTEM_BOOTING.
 *
 * __ref due to (1) call of __meminit annotated setup_zone_pageset
 * [we're only called with non-NULL zone through __meminit paths] and
 * (2) call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order(); //select the zonelist ordering policy
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init(); //build the zonelists at kernel boot
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
setup_zone_pageset(zone); //set up the pageset for a newly hot-plugged zone
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
 * Disable grouping by mobility if the number of pages in the
 * system is too low to allow the mechanism to work. It would be
 * more accurate, but expensive to check per-zone. This check is
 * made on memory-hotadd so a system can start with mobility
 * disabled and enable it later
 */
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}