node - Linux內存管理(2)



日期 內核版本 架構 作者 GitHub CSDN
2017-07-04 Linux-4.12 X86 lwhuq LinuxMemoryStudy Linux內存管理


  在NUMA多CPU架構下,每個CPU都掛載有本地內存,CPU之間通過總線連接。每個CPU訪問本地內存的速度都比訪問遠程內存快。Linux系統把每個CPU的本地內存資源用一個結點(node)表示。


1 pg_data_t結構

  pg_data_t的定義在include/linux/mmzone.h#L601

/*
 * Per-node memory descriptor. One pg_data_t describes the memory that
 * belongs to a single node: its zones, its page frame range, and the
 * per-node reclaim/compaction state.
 */
typedef struct pglist_data {
	/* Array holding the zone descriptors (struct zone) of this node. */
	struct zone node_zones[MAX_NR_ZONES];
	/*
	 * Lists of fallback nodes and their zones, so that memory can be
	 * allocated from a fallback node when this node has no free space.
	 */
	struct zonelist node_zonelists[MAX_ZONELISTS];
	/* Number of zones in this node. */
	int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	/*
	 * Points to the struct page of the node's first page frame; that
	 * struct page lives somewhere inside the global mem_map.
	 */
	struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
	/* Boot-time memory allocator data. */
	struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
	/*
	 * Must be held any time you expect node_start_pfn, node_present_pages
	 * or node_spanned_pages stay constant.  Holding this will also
	 * guarantee that any pfn_valid() stays that way.
	 *
	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
	 *
	 * Nests above zone->lock and zone->span_seqlock
	 */
	spinlock_t node_size_lock;
#endif
	/* First page frame number of the node. */
	unsigned long node_start_pfn;
	/* Total number of page frames in the node, holes excluded. */
	unsigned long node_present_pages; /* total number of physical pages */
	/* Total number of page frames in the node, holes included. */
	unsigned long node_spanned_pages; /* total size of physical page
					     range, including holes */
	/* Node id. */
	int node_id;
	/* Wait queue of the swap daemon (kswapd). */
	wait_queue_head_t kswapd_wait;
	/*
	 * NOTE(review): purpose not shown in this file; presumably the queue
	 * on which allocating tasks are throttled -- confirm.
	 */
	wait_queue_head_t pfmemalloc_wait;
	/* The swap daemon task serving this node. */
	struct task_struct *kswapd;	/* Protected by
					   mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;

	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
	int kcompactd_max_order;
	enum zone_type kcompactd_classzone_idx;
	wait_queue_head_t kcompactd_wait;
	struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
	/* Lock serializing the migrate rate limiting window */
	spinlock_t numabalancing_migrate_lock;

	/* Rate limiting time interval */
	unsigned long numabalancing_migrate_next_window;

	/* Number of pages migrated during the rate limiting time interval */
	unsigned long numabalancing_migrate_nr_pages;
#endif
	/*
	 * This is a per-node reserve of pages that are not available
	 * to userspace allocations.
	 */
	unsigned long		totalreserve_pages;

#ifdef CONFIG_NUMA
	/*
	 * zone reclaim becomes active if more unmapped pages exist.
	 */
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
#endif /* CONFIG_NUMA */

	/* Write-intensive fields used by page reclaim */
	ZONE_PADDING(_pad1_)
	spinlock_t		lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	/*
	 * If memory initialisation on large machines is deferred then this
	 * is the first PFN that needs to be initialised.
	 */
	unsigned long first_deferred_pfn;
	unsigned long static_init_size;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	spinlock_t split_queue_lock;
	struct list_head split_queue;
	unsigned long split_queue_len;
#endif

	/* Fields commonly accessed by the page reclaim scanner */
	struct lruvec		lruvec;

	/*
	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
	 * this node's LRU.  Maintained by the pageout code.
	 */
	unsigned int inactive_ratio;

	unsigned long		flags;

	ZONE_PADDING(_pad2_)

	/* Per-node vmstats */
	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;

1.1 結點的內存域

  結點管理的內存再細分成內存域。
/* Excerpt of pg_data_t: the fields describing the node's zones. */
typedef struct pglist_data {
	/* Array holding the zone descriptors (struct zone) of this node. */
	struct zone node_zones[MAX_NR_ZONES];
	/*
	 * Lists of fallback nodes and their zones, so that memory can be
	 * allocated from a fallback node when this node has no free space.
	 */
	struct zonelist node_zonelists[MAX_ZONELISTS];
	/* Number of zones in this node. */
	int nr_zones;
} pg_data_t;

  • node_zones[MAX_NR_ZONES]管理着本地內存的最多MAX_NR_ZONES個內存域
  • node_zonelists[MAX_ZONELISTS]指定了備用結點及內存域的列表。可以想象這些備用結點及內存域都是遠程內存
  • nr_zones結點內存域的個數

1.2 結點的內存頁

/* Excerpt of pg_data_t: the fields describing the node's page frames. */
typedef struct pglist_data {
#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
	struct page *node_mem_map;  /* struct page of the node's first page frame; located somewhere inside the global mem_map */
#ifdef CONFIG_PAGE_EXTENSION
	struct page_ext *node_page_ext;
#endif
#endif
 	/* First page frame number of the node. */
 	unsigned long node_start_pfn;
        /* Total number of page frames in the node, holes excluded. */
 	unsigned long node_present_pages; /* total number of physical pages */
        /* Total number of page frames in the node, holes included. */
 	unsigned long node_spanned_pages; /* total size of physical page range, including holes */
} pg_data_t;

  在每個結點的結構pg_data_t內有一個指向頁結構page的指針node_mem_map。pg_data_t->node_mem_map指向本結點所管理的第一個物理頁框對應的page結構。

/* Excerpt of pg_data_t: the pointer to the node's page descriptors. */
typedef struct pglist_data {
	/*
	 * Points to the struct page of the node's first page frame; that
	 * struct page lives somewhere inside the global mem_map.
	 */
	struct page *node_mem_map;
} pg_data_t;
  pg_data_t->node_mem_map的初始化在alloc_node_mem_map中完成,定義在mm/page_alloc.c#L6096
/*
 * alloc_node_mem_map - initialise pgdat->node_mem_map (abridged excerpt).
 * @pgdat: node descriptor; node_start_pfn, node_id and node_mem_map are
 *         read, node_mem_map is written when it was not already set.
 *
 * When node_mem_map is still unset, an array of struct page is allocated
 * that covers the node's PFN range widened to MAX_ORDER_NR_PAGES boundaries
 * at both ends, and node_mem_map is pointed at the entry that corresponds to
 * node_start_pfn.
 */
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
	/* The abridged excerpt used these two locals without declaring them. */
	unsigned long start, offset;

	/*
	 * Round the first PFN down to a MAX_ORDER_NR_PAGES boundary (the mask
	 * form assumes MAX_ORDER_NR_PAGES is a power of two) and remember how
	 * far the real first PFN lies beyond that boundary.
	 */
	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
	offset = pgdat->node_start_pfn - start;
	/* ia64 gets its own node_mem_map, before this, without bootmem */
	if (!pgdat->node_mem_map) {
		unsigned long size, end;
		struct page *map;

		/*
		 * The zone's endpoints aren't required to be MAX_ORDER
		 * aligned but the node_mem_map endpoints must be in order
		 * for the buddy allocator to function correctly.
		 */
		end = pgdat_end_pfn(pgdat);
		end = ALIGN(end, MAX_ORDER_NR_PAGES);
		size =  (end - start) * sizeof(struct page);
		/* Try alloc_remap() first, fall back to memblock. */
		map = alloc_remap(pgdat->node_id, size);
		if (!map)
			map = memblock_virt_alloc_node_nopanic(size,
							       pgdat->node_id);
		/*
		 * Skip the alignment padding so that node_mem_map refers to
		 * the page at node_start_pfn.
		 * NOTE(review): map is not checked again here; confirm what
		 * callers expect if the fallback allocation also fails.
		 */
		pgdat->node_mem_map = map + offset;
	}
}

1.3 交換守護進程

/* Excerpt of pg_data_t: the fields used by the per-node swap daemon. */
typedef struct pglist_data {
	/* Wait queue of the swap daemon (kswapd). */
	wait_queue_head_t kswapd_wait;
	wait_queue_head_t pfmemalloc_wait;
	/* The swap daemon task serving this node. */
	struct task_struct *kswapd;	/* Protected by mem_hotplug_begin/end() */
	int kswapd_order;
	enum zone_type kswapd_classzone_idx;
	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
} pg_data_t;

2 結點狀態

  當系統中有超過一個結點時,內核會維護一組位圖node_states用以提供各個結點的狀態信息,每種狀態由枚舉enum node_states中的一項表示,其定義在include/linux/nodemask.h#L381
/*
 * States a node can be in. Each enumerator is used as an index into the
 * node_states[] array of node bitmaps; NR_NODE_STATES is the number of
 * distinct states and therefore the size of that array.
 */
enum node_states {
	N_POSSIBLE,		/* The node could become online at some point */
	N_ONLINE,		/* The node is online */
	N_NORMAL_MEMORY,	/* The node has regular memory */
#ifdef CONFIG_HIGHMEM
	N_HIGH_MEMORY,		/* The node has regular or high memory */
#else
	/* Without CONFIG_HIGHMEM this is an alias of N_NORMAL_MEMORY. */
	N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
#ifdef CONFIG_MOVABLE_NODE
	N_MEMORY,		/* The node has memory(regular, high, movable) */
#else
	/* Without CONFIG_MOVABLE_NODE this is an alias of N_HIGH_MEMORY. */
	N_MEMORY = N_HIGH_MEMORY,
#endif
	N_CPU,		/* The node has one or more cpus */
	NR_NODE_STATES
};
  結點位圖的實例node_states定義在mm/page_alloc.c#L122。當某個node處在某個狀態時,該狀態對應的位圖中此node的位就會被置位。
/*
 * One node bitmap per node state, indexed by enum node_states.
 *
 * Initial contents: every bit of N_POSSIBLE is set (NODE_MASK_ALL) and only
 * the lowest bit of N_ONLINE is set. When CONFIG_NUMA is not defined, the
 * memory and CPU states are initialised with that same lowest bit as well.
 * The CONFIG_HIGHMEM / CONFIG_MOVABLE_NODE guards mirror the enum, where the
 * corresponding states are aliases if the option is off.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
	[N_MEMORY] = { { [0] = 1UL } },
#endif
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

  • N_POSSIBLE, N_ONLINE和N_CPU用於CPU和內存的熱插拔
  • N_NORMAL_MEMORY, N_HIGH_MEMORY用於普通內存管理
  • N_MEMORY表示有物理內存的結點
  幾個輔助函數用於查詢、設置或清除某個狀態位圖中特定結點對應的bit,以及統計處於某個狀態的結點數目。定義在include/linux/nodemask.h#L407

/*
 * node_state - test whether @node is marked in the bitmap of @state.
 * Returns the result of node_isset() on node_states[@state].
 */
static inline int node_state(int node, enum node_states state)
{
	return node_isset(node, node_states[state]);
}
/* node_set_state - mark @node in the bitmap of @state via __node_set(). */
static inline void node_set_state(int node, enum node_states state)
{
	__node_set(node, &node_states[state]);
}
/* node_clear_state - unmark @node in the bitmap of @state via __node_clear(). */
static inline void node_clear_state(int node, enum node_states state)
{
	__node_clear(node, &node_states[state]);
}
/*
 * num_node_state - number of nodes currently marked in the bitmap of @state,
 * as reported by nodes_weight().
 */
static inline int num_node_state(enum node_states state)
{
	return nodes_weight(node_states[state]);
}

3 查找內存結點

  內存結點的實例爲node_data[MAX_NUMNODES],定義在arch/x86/mm/numa.c#L26
/*
 * Table of per-node descriptors, indexed by node id and sized for
 * MAX_NUMNODES entries; looked up through NODE_DATA(nid).
 */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

  內存結點最大數目由MAX_NUMNODES決定,定義在include/linux/numa.h#L11

/*
 * NODES_SHIFT is the base-2 logarithm of the maximum number of nodes. It is
 * taken from CONFIG_NODES_SHIFT when that is configured and defaults to 0
 * (a single node) otherwise.
 */
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT     CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT     0
#endif

/*
 * Maximum number of nodes. Defined outside the conditional above so that it
 * exists in both configurations.
 */
#define MAX_NUMNODES    (1 << NODES_SHIFT)

  宏NODE_DATA(nid)可以根據node id在node_data數組中找到對應的pg_data_t結構實例,定義在arch/x86/include/asm/mmzone_32.h#L13和arch/x86/include/asm/mmzone_64.h#L14
/* NODE_DATA - the node_data[] entry (per-node descriptor pointer) for @nid. */
#define NODE_DATA(nid)		(node_data[nid])

3.1 查找node id

  宏first_online_node用於得到第一個online的node,定義在include/linux/nodemask.h#L430
/* Result of first_node() on the N_ONLINE bitmap: the first online node id. */
#define first_online_node	first_node(node_states[N_ONLINE])
  宏 first_memory_node得到第一個有memory的node,定義在include/linux/nodemask.h#L431
/* Result of first_node() on the N_MEMORY bitmap: the first node id that has memory. */
#define first_memory_node	first_node(node_states[N_MEMORY])
  宏next_node(n, src)得到某個node state狀態src的下一個被置起的node id,定義在include/linux/nodemask.h#L258
/*
 * next_node - wrapper that passes the address of bitmap @src to
 * __next_node(); @src must therefore be an lvalue.
 */
#define next_node(n, src) __next_node((n), &(src))
/*
 * __next_node - find the next marked node after @n in the bitmap @srcp.
 *
 * The search starts at bit @n + 1 and is limited to MAX_NUMNODES bits; the
 * result is clamped to MAX_NUMNODES, which callers in this file treat as
 * "no further node".
 */
static inline int __next_node(int n, const nodemask_t *srcp)
{
	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}
  函數next_online_node得到下一個online的node,定義在include/linux/nodemask.h#L432
/* next_online_node - next_node() applied to the N_ONLINE bitmap, starting after @nid. */
static inline int next_online_node(int nid)
{
	return next_node(nid, node_states[N_ONLINE]);
}
  函數next_memory_node得到下一個有memory的node,定義在include/linux/nodemask.h#L436
/* next_memory_node - next_node() applied to the N_MEMORY bitmap, starting after @nid. */
static inline int next_memory_node(int nid)
{
	return next_node(nid, node_states[N_MEMORY]);
}

3.2 node id的遍歷

  宏for_each_node_state(__node, __state)用來遍歷處於特定狀態的所有結點,定義在include/linux/nodemask.h#L427
/*
 * for_each_node_state - loop header built on for_each_node_mask(): iterates
 * @__node over the bitmap node_states[@__state].
 */
#define for_each_node_state(__node, __state) \
	for_each_node_mask((__node), node_states[__state])
  宏for_each_node(node)用來迭代處於N_POSSIBLE狀態的所有結點,定義在include/linux/nodemask.h#L507
/* for_each_node - iterate @node over all nodes in the N_POSSIBLE state. */
#define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
  宏for_each_online_node(node)用來遍歷處於N_ONLINE所有結點,定義在include/linux/nodemask.h#L508
/* for_each_online_node - iterate @node over all nodes in the N_ONLINE state. */
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

3.3 查找pg_data_t結構

  函數first_online_pgdat得到第一個online的pg_data結構的指針,定義在mm/mmzone.c#L12
/* first_online_pgdat - descriptor of the first online node, via NODE_DATA(). */
struct pglist_data *first_online_pgdat(void)
{
	return NODE_DATA(first_online_node);
}
  函數next_online_pgdat(pgdat)得到下一個online的pg_data結構的指針,定義在mm/mmzone.c#L17
/*
 * next_online_pgdat - descriptor of the online node that follows @pgdat.
 *
 * Returns NULL when next_online_node() reports MAX_NUMNODES, i.e. there is
 * no further online node after @pgdat->node_id.
 */
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
	int nid = next_online_node(pgdat->node_id);


	if (nid == MAX_NUMNODES)
		return NULL;
	return NODE_DATA(nid);
}

3.4 pg_data_t結構的遍歷

  宏for_each_online_pgdat(pgdat)用來遍歷所有online的pg_data_t結構指針,定義在include/linux/mmzone.h#L908
/*
 * for_each_online_pgdat - loop header that walks @pgdat from
 * first_online_pgdat() through next_online_pgdat() until it becomes NULL.
 */
#define for_each_online_pgdat(pgdat)			\
	for (pgdat = first_online_pgdat();		\
	     pgdat;					\
	     pgdat = next_online_pgdat(pgdat))






發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章