日期 | 内核版本 | 架构 | 作者 | GitHub | CSDN |
---|---|---|---|---|---|
2017-07-04 | Linux-4.12 | X86 | lwhuq | LinuxMemoryStudy | Linux内存管理 |
1 zone 结构
/*
 * Per-zone descriptor (quoted from Linux 4.12). ZONE_PADDING markers split
 * the fields into read-mostly data, allocator-hot data, compaction/vmstat
 * data and the statistics array.
 */
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
/*
 * Watermarks used by page reclaim, indexed by enum zone_watermarks.
 * With more free page frames than the high watermark the zone is in its
 * ideal state. Below the low watermark the kswapd daemon has to be woken
 * to reclaim pages. Below the min watermark reclaim pressure is severe and
 * the kernel must take immediate action to relieve it.
 */
unsigned long watermark[NR_WMARK];
unsigned long nr_reserved_highatomic;
/*
 * We don't know if the memory that we're going to allocate will be
 * freeable or/and it will be released eventually, so to avoid totally
 * wasting several GB of ram we must reserve some of the lower zone
 * memory (otherwise we risk to run OOM on the lower zones despite
 * there being tons of freeable ram on the higher zones). This array is
 * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
 * changes.
 */
/*
 * Number of page frames this zone holds back when an allocation aimed at
 * a higher zone falls back to it; indexed by the zone type the request
 * was originally aimed at.
 */
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat; // descriptor of the node this zone belongs to
struct per_cpu_pageset __percpu *pageset; // per-CPU hot/cold page sets
#ifndef CONFIG_SPARSEMEM
/*
 * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
 * In SPARSEMEM, this map is stored in struct mem_section
 */
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn; // page frame number of the first page frame in this zone
/*
 * spanned_pages is the total pages spanned by the zone, including
 * holes, which is calculated as:
 * spanned_pages = zone_end_pfn - zone_start_pfn;
 *
 * present_pages is physical pages existing within the zone, which
 * is calculated as:
 * present_pages = spanned_pages - absent_pages(pages in holes);
 *
 * managed_pages is present pages managed by the buddy system, which
 * is calculated as (reserved_pages includes pages allocated by the
 * bootmem allocator):
 * managed_pages = present_pages - reserved_pages;
 *
 * So present_pages may be used by memory hotplug or memory power
 * management logic to figure out unmanaged pages by checking
 * (present_pages - managed_pages). And managed_pages should be used
 * by page allocator and vm scanner to calculate all kinds of watermarks
 * and thresholds.
 *
 * Locking rules:
 *
 * zone_start_pfn and spanned_pages are protected by span_seqlock.
 * It is a seqlock because it has to be read outside of zone->lock,
 * and it is done in the main allocator path. But, it is written
 * quite infrequently.
 *
 * The span_seq lock is declared along with zone->lock because it is
 * frequently read in proximity to zone->lock. It's good to
 * give them a chance of being in the same cacheline.
 *
 * Write access to present_pages at runtime should be protected by
 * mem_hotplug_begin/end(). Any reader who can't tolerant drift of
 * present_pages should get_online_mems() to get a stable value.
 *
 * Read access to managed_pages should be safe because it's unsigned
 * long. Write access to zone->managed_pages and totalram_pages are
 * protected by managed_page_count_lock at runtime. Idealy only
 * adjust_managed_page_count() should be used instead of directly
 * touching zone->managed_pages and totalram_pages.
 */
unsigned long managed_pages; // pages managed by the buddy system: present_pages - reserved_pages
unsigned long spanned_pages; // all page frames spanned by the zone, holes included
unsigned long present_pages; // page frames physically present: spanned_pages - pages in holes
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
 * Number of isolated pageblock. It is used to solve incorrect
 * freepage counting problem due to racy retrieving migratetype
 * of pageblock. Protected by zone->lock.
 */
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
/*
 * Buddy allocator free lists: one array element per order, each tracking
 * free runs of contiguous page frames of one fixed length.
 */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags; // describes the current state of the zone
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
 * When free pages are below this point, additional steps are taken
 * when reading the number of free pages to avoid per-cpu counter
 * drift allowing watermarks to be breached
 */
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
/*
 * On compaction failure, 1<<compact_defer_shift compactions
 * are skipped before trying again. The number attempted since
 * last failure is tracked with compact_considered.
 */
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
1.1 ZONE_PADDING
该结构比较特殊的地方是它由ZONE_PADDING分隔的几个部分。这个是为了让一些有lock要求的数据处于不同的cache line。
zone结构内有些数据是需要锁的,即它们可能被多个CPU同时访问。zone结构内有两个锁,一个是自旋锁zone->lock,另一个是顺序锁span_seqlock。
多处理系统cache还有一个cache一致性问题。当一个CPU对某个内存的cache line做出修改时,这个内存在别的CPU的cache line上面的数据就变成invalid的了,需要同步update。
当两个锁要保护的数据在同一个cache line时,两个CPU通过zone的两个锁去获得了对zone结构的访问权限,然后都对数据做修改。当他们要操作的数据在一个cache line里面的时候,每个CPU的cache line都会有一份该数据的缓存,当每一个CPU对数据做修改时都会导致另一个CPU cache里面的数据invalid而需要同步update。而zone结构的访问是非常频繁的,cache line的同步更新会对性能有很大的影响。
即便只有一个锁时,要锁保护的数据也最好是在一个cache line里面,这样需要同步update的数据只有一个cache line;而如果要锁保护的数据跨越了cache line的话,需要同步update的工作量就会翻番。
对这个问题的解决办法就是通过ZONE_PADDING把不同锁要保护的数据放在不同的cache line。尽量减少不必要的cache line 同步update。
ZONE_PADDING的定义在include/linux/mmzone.h#L112
/*
 * ZONE_PADDING(name) declares a member of type struct zone_padding.
 * The struct holds only a zero-length array and carries the
 * ____cacheline_internodealigned_in_smp alignment attribute, so the
 * member exists purely for its alignment requirement.
 * Without CONFIG_SMP the macro expands to nothing.
 */
#if defined(CONFIG_SMP)
struct zone_padding {
char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name) struct zone_padding name;
#else
#define ZONE_PADDING(name)
#endif
内核还用了____cacheline_internodealigned_in_smp,来实现最优的高速缓存行对齐方式,其定义在include/linux/cache.h#L70
/*
 * Alignment attribute: with CONFIG_SMP the annotated object is aligned to
 * (1 << INTERNODE_CACHE_SHIFT) bytes; otherwise the macro is empty.
 * The outer #if keeps any definition that already exists.
 */
#if !defined(____cacheline_internodealigned_in_smp)
#if defined(CONFIG_SMP)
#define ____cacheline_internodealigned_in_smp \
__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT))))
#else
#define ____cacheline_internodealigned_in_smp
#endif
#endif
2 水印
水印的类型zone_watermarks定义在include/linux/mmzone.h#L255
/*
 * Indices into zone->watermark[]. NR_WMARK is not a watermark itself;
 * it is the number of levels and sizes the array.
 */
enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
};
可以把物理内存想象成一个水池。它有三个水印刻度,当水池的水达到最高水印刻度watermark[WMARK_HIGH],水池水很充足不用担心没水的问题。当水池的水少于watermark[WMARK_LOW]时,得赶紧打开水龙头放点水进来。当水少于watermark[WMARK_MIN]时,开一个水龙头放水都不管用了,得想更多的办法再拖个水管过来了。系统进程kswapd就是这个水龙头,用于页框交换回收。
/*
 * Accessors for the three watermarks of a zone. z is a struct zone pointer;
 * it is not parenthesized in the expansion, so pass a simple expression.
 */
#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
2.1 水印初始化
内存域结构内的水印(watermark数组)由setup_per_zone_wmarks初始化,该函数在内核启动或者min_free_kbytes被修改时期被系统调用。其定义在mm/page_alloc.c#L6945
/*
 * Recompute the watermarks of every zone. This is only a locking wrapper:
 * it runs __setup_per_zone_wmarks() with zonelists_mutex held.
 */
void setup_per_zone_wmarks(void)
{
mutex_lock(&zonelists_mutex);
__setup_per_zone_wmarks();
mutex_unlock(&zonelists_mutex);
}
- 互斥量保证同一时刻只有一个线程可以修改水印参数
- setup_per_zone_wmarks真正实现是在__setup_per_zone_wmarks中。
/*
 * Compute watermark[WMARK_MIN/LOW/HIGH] for every zone from the global
 * min_free_kbytes setting, then refresh totalreserve_pages.
 * Called by setup_per_zone_wmarks() with zonelists_mutex held.
 */
static void __setup_per_zone_wmarks(void)
{
/*
 * min_free_kbytes is a global system parameter expressed in KB;
 * pages_min is the same amount expressed in page frames.
 */
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/*
 * Calculate total number of !ZONE_HIGHMEM pages. managed_pages is used,
 * so neither holes nor reserved pages are counted.
 */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags); // take the zone lock before updating its watermarks
tmp = (u64)pages_min * zone->managed_pages;
/*
 * tmp = tmp / lowmem_pages
 *     = pages_min * zone->managed_pages / (sum of managed_pages over all lowmem zones)
 * i.e. the global min_free_kbytes is shared out among the zones in
 * proportion to each zone's managed_pages.
 */
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
 * need highmem pages, so cap pages_min to a small
 * value here.
 *
 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
 * deltas control asynch page reclaim, and so should
 * not be capped for highmem.
 */
unsigned long min_pages;
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
 * If it's a lowmem zone, reserve a number of pages
 * proportionate to the zone's size.
 */
zone->watermark[WMARK_MIN] = tmp;
}
/*
 * Set the kswapd watermarks distance according to the
 * scale factor in proportion to available memory, but
 * ensure a minimum size on small systems.
 */
tmp = max_t(u64, tmp >> 2,
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
/* LOW and HIGH sit one and two "distances" above MIN respectively. */
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
- 对于ZONE_HIGHMEM内存域,水印不是特别重要,把watermark[WMARK_MIN]设置成一个较小的值即可。并限制在32个页框到128个页框之间。
#define SWAP_CLUSTER_MAX 32UL
- 对于非ZONE_HIGHMEM内存域,watermark[WMARK_MIN]就是min_free_kbytes按各内存域比例,并把单位转换成页框。
- watermark[WMARK_LOW] = watermark[WMARK_MIN] + delta
- watermark[WMARK_HIGH] = watermark[WMARK_MIN] + 2 * delta
- 其中 delta = max(tmp / 4, managed_pages * watermark_scale_factor / 10000),tmp 是该内存域按managed_pages比例分得的min_free_kbytes(单位为页框)。watermark_scale_factor取默认值10时,第二项即 managed_pages/1000。
- calculate_totalreserve_pages用于更新每个结点的totalreserve_pages参数。
- setup_zone_migrate_reserve函数已被删除:MIGRATE_RESERVE迁移类型在Linux 4.4中被移除,取而代之的是按需为高阶原子分配保留页块的机制(对应zone结构中的nr_reserved_highatomic)。
2.2 lowmem_reserve
kernel在分配内存时,如果失败就会尝试下一个低级的zone(这里的低级仅仅指zone内存的位置,实际上低地址zone是更稀缺的资源)。我们可以想像应用进程通过内存映射申请Highmem 并且加mlock分配,如果此时Highmem zone无法满足分配,则会尝试从Normal进行分配。这就有一个问题,来自Highmem的请求可能会耗尽Normal zone的内存,而且由于mlock又无法回收,最终的结果就是Normal zone无内存提供给kernel的正常分配,而Highmem有大把的可回收内存无法有效利用。因此针对这个case,使得Normal zone在碰到来自Highmem的分配请求时,可以通过lowmem_reserve声明:可以使用我的内存,但是必须要保留lowmem_reserve[NORMAL]给我自己使用。同样当从Normal失败后,会尝试从zonelist中的DMA申请分配,通过lowmem_reserve[DMA],限制来自HIGHMEM和Normal的分配请求。
内存域结构内的lowmem_reserve数组由 setup_per_zone_lowmem_reserve初始化,该函数在内核启动时或sysctl_lowmem_reserve_ratio被修改时调用。其定义在mm/page_alloc.c#L6844
/*
 * setup_per_zone_lowmem_reserve - called whenever
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
enum zone_type j, idx;
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
/* Running total: starts as zone j's own managed pages. */
unsigned long managed_pages = zone->managed_pages;
/* A zone reserves nothing against requests aimed at itself. */
zone->lowmem_reserve[j] = 0;
idx = j;
/* Visit every zone below j, from j-1 down to 0. */
while (idx) {
struct zone *lower_zone;
idx--;
/*
 * Clamp the ratio to at least 1 before dividing by it.
 * Note the clamped value is written back to the sysctl array.
 */
if (sysctl_lowmem_reserve_ratio[idx] < 1)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
/*
 * Reserve in lower_zone against requests aimed at zone j:
 * managed pages of zones idx+1 .. j divided by lower_zone's ratio.
 */
lower_zone->lowmem_reserve[j] = managed_pages /
sysctl_lowmem_reserve_ratio[idx];
/* Add lower_zone so the next lower zone sees all zones above it. */
managed_pages += lower_zone->managed_pages;
}
}
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
- lowmem_reserve计算的基本算法是根据高级别内存域的可用页框数除以一个ratio。例如ZONE_DMA内存域的lowmem_reserve[ZONE_HIGHMEM] = (ZONE_NORMAL与ZONE_HIGHMEM的managed_pages之和) / sysctl_lowmem_reserve_ratio[ZONE_DMA]
- calculate_totalreserve_pages用于更新每个结点的totalreserve_pages参数
/*
 * Divisors used by setup_per_zone_lowmem_reserve(), indexed by the index of
 * the lower (protecting) zone. A larger ratio gives a smaller reserve.
 * The array has one entry fewer than there are zone types; which entries
 * exist depends on the zone config options below.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
#ifdef CONFIG_ZONE_DMA
256,
#endif
#ifdef CONFIG_ZONE_DMA32
256,
#endif
#ifdef CONFIG_HIGHMEM
32,
#endif
32,
};
2.3 冷热页-每CPU页高速缓存
结构zone中的pageset成员用于实现冷热分配器(Hot-N-Cold Pages),也叫做每CPU页框高速缓存。 内核说页是热的,意味着页已经加载到CPU高速缓存。相反,冷页则不在高速缓存中。在SMP系统中每个CPU都有一个或多个高速缓存,因此每个CPU的管理都是独立的。
/* Excerpt of struct zone showing only the pageset member. */
struct zone {
struct per_cpu_pageset __percpu *pageset; // per-CPU page-frame cache
};
pageset是一个指针,指向的类型是per_cpu_pageset数组,定义在include/linux/mmzone.h#L275
struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
其中struct per_cpu_pages是核心结构体,定义在include/linux/mmzone.h#L266
/* Page lists held in a per-CPU pageset, with their fill-level bookkeeping. */
struct per_cpu_pages {
/* total number of page frames currently on the lists */
int count; /* number of pages in the list */
/* watermark: once count exceeds high, the lists need to be emptied */
int high; /* high watermark, emptying needed */
/* number of page frames moved to or from the buddy system in one go */
int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */
/*
 * Each migrate type has its own list. Per the article author, hot pages
 * sit at the list head and cold pages at the tail (the list handling code
 * is not shown here).
 */
struct list_head lists[MIGRATE_PCPTYPES];
};
2.4 zonelist
当页框申请情况没有办法在本内存域得到满足时,内核需要考虑从其他结点或其他内存域去申请。每个结点pg_data_t内的node_zonelists结构数组定义了这个尝试申请的顺序。它按照一种方式定义了不同内存域的内存的优先级关系。
/* Excerpt of the node descriptor showing only the zonelist array. */
typedef struct pglist_data {
struct zonelist node_zonelists[MAX_ZONELISTS]; // one zonelist per zonelist type
}pg_data_t;
内核定义了内存的一个层次结构, 首先试图分配”廉价的”内存. 如果失败, 则根据访问速度和容量, 逐渐尝试分配”更昂贵的”内存.
- 高端内存是最廉价的,因为内核没有任何部份依赖于从该内存域分配的内存。如果高端内存域用尽,对内核没有任何副作用,这也是优先分配高端内存的原因。
- 其次是普通内存域, 这种情况有所不同。 许多内核数据结构必须保存在该内存域,而不能放置到高端内存域。因此如果普通内存完全用尽, 那么内核会面临紧急情况。所以只要高端内存域的内存没有用尽,都不会从普通内存域分配内存。
- 最昂贵的是DMA内存域,因为它用于外设和系统之间的数据传输。因此从该内存域分配内存是最后一招。
2.4.1 zonelist类型
/*
 * Zonelist types. MAX_ZONELISTS is the number of types and sizes
 * pglist_data.node_zonelists[]; ZONELIST_NOFALLBACK exists only with
 * CONFIG_NUMA.
 */
enum {
ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
/*
 * The NUMA zonelists are doubled because we need zonelists that
 * restrict the allocations to a single node for __GFP_THISNODE.
 */
ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
MAX_ZONELISTS
};
- ZONELIST_FALLBACK是带回退的备用列表:当前结点无法满足分配时,可以按列表顺序回退到其他结点的内存域
- ZONELIST_NOFALLBACK是不带回退的列表(仅NUMA下存在):把分配限制在单个结点内,用于__GFP_THISNODE分配
2.4.2 zonelist结构
zonelist结构定义在linux/mmzone.h#L580
/*
 * A zonelist is an array of zone references.
 * NOTE(review): the extra "+ 1" slot is presumably a terminating entry -
 * confirm against the iteration helpers, which are not shown here.
 */
struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
对于每个结点内的每个内存域都有一个结构zoneref在zonelist内。
/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
结构zoneref定义在linux/mmzone.h#L561
struct zoneref {
struct zone *zone; /* Pointer to actual zone */
int zone_idx; /* zone_idx(zoneref->zone) */
};
2.4.3 zonelist order
/*
 * zonelist_order:
 * 0 = automatic detection of better ordering.
 * 1 = order by ([node] distance, -zonetype)
 * 2 = order by (-zonetype, [node] distance)
 *
 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 * the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT 0 /* automatic detection */
#define ZONELIST_ORDER_NODE 1 /* node distance first, then zone type */
#define ZONELIST_ORDER_ZONE 2 /* zone type first, then node distance */
2.4.4 zonelist建立
函数build_all_zonelists负责初始化zonelist。定义在mm/page_alloc.c#L5206
/*
 * Called with zonelists_mutex held always
 * unless system_state == SYSTEM_BOOTING.
 *
 * __ref due to (1) call of __meminit annotated setup_zone_pageset
 * [we're only called with non-NULL zone through __meminit paths] and
 * (2) call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order(); // select the zonelist ordering policy
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init(); // boot path: build the zonelists during kernel init
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory-hotplug path, taken only when a zone was passed in.
 * NOTE(review): going by its name, this helper sets up the zone's
 * pageset rather than a zonelist - confirm against its definition.
 */
if (zone)
setup_zone_pageset(zone);
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
 * Disable grouping by mobility if the number of pages in the
 * system is too low to allow the mechanism to work. It would be
 * more accurate, but expensive to check per-zone. This check is
 * made on memory-hotadd so a system can start with mobility
 * disabled and enable it later
 */
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
/* Log the outcome: node count, ordering, mobility grouping state, page total. */
pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}