源碼爲 2.6.37內核,x86_64架構,內存模型爲Sparse Memory
paging_init函數在setup_arch函數中被調用,用於初始化所有節點的pg_data_t結構,以及節點對應的管理區zone結構,和page結構。
調用的大致過程爲:
start_kernel()
--> setup_arch()
--> paging_init()
--> free_area_init_nodes()
--> free_area_init_node()
--> free_area_init_core()
--> memmap_init()
具體如下:
paging_init()在setup_arch()中被調用,定義爲:
- void __init paging_init(void) /* sets the per-zone PFN limits, builds the sparse mem_sections, then initialises every node */
- {
- unsigned long max_zone_pfns[MAX_NR_ZONES]; /* MAX_NR_ZONES = 4 */
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; /* 16M */
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; /* 4G */
- max_zone_pfns[ZONE_NORMAL] = max_pfn; /* highest PFN of physical memory */
- /* Build the mem_sections for the physical pages owned by all nodes.
- * The pages of every node's memory regions recorded in early_node_map are
- * registered in a separate array of mem_section entries.
- * The purpose of mem_section is presumably to make memory management more efficient.
- * In the kernel one mem_section covers a fixed number of physically contiguous pages.
- * Given a physical address, both the page it belongs to and the index of
- * its mem_section can be derived.
- * sparse_memory_present_with_active_regions() sets up the mem_sections for the
- * physical pages owned by the given node; here it is invoked for all nodes.
- */
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
- sparse_init();
- /*
- * clear the default setting with node 0
- * note: don't use nodes_clear here, that is really clearing when
- * numa support is not compiled in, and later node_set_state
- * will not set it back.
- */
- node_clear_state(0, N_NORMAL_MEMORY);
- /* Initialise the pg_data_t, zone and page data of all nodes */
- free_area_init_nodes(max_zone_pfns);
- }
paging_init()調用了free_area_init_nodes函數初始化所有結點的pg_data_t和zone、page的數據,並打印了管理區信息:
- /**
- * free_area_init_nodes - Initialise all pg_data_t and zone data
- * @max_zone_pfn: an array of max PFNs for each zone
- *
- * This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by add_active_range(), the size of each
- * zone in each node and their holes is calculated. If the maximum PFN
- * between two adjacent zones match, it is assumed that the zone is empty.
- * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
- * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
- * starts where the previous one ended. For example, ZONE_DMA32 starts
- * at arch_max_dma_pfn.
- */
- /* Initialise the pg_data_t, zone and page data of every node */
- void __init free_area_init_nodes(unsigned long *max_zone_pfn)
- {
- unsigned long nid;
- int i;
- /* Sort early_node_map as initialisation assumes it is sorted */
- sort_node_map();
- /*
- * The code below sets arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn to the boundaries of each zone,
- * i.e. arch_zone_lowest_possible_pfn = {lowest pfn - 0 or 64K, 16M , 4G , 0 }
- * arch_zone_highest_possible_pfn = { 16M , 4G , max_pfn, 0 }
- * { DMA , DMA_32, NORMAL , MOVABLE}
- */
- /* Record where the zone boundaries are */
- memset(arch_zone_lowest_possible_pfn, 0,
- sizeof(arch_zone_lowest_possible_pfn));
- memset(arch_zone_highest_possible_pfn, 0,
- sizeof(arch_zone_highest_possible_pfn));
- /* Find the lowest pfn recorded in early_node_map */
- arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
- arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; /* max_zone_pfn = {16M, 4G, max_pfn} */
- for (i = 1; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- arch_zone_lowest_possible_pfn[i] =
- arch_zone_highest_possible_pfn[i-1];
- arch_zone_highest_possible_pfn[i] =
- max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
- }
- arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; /* the ZONE_MOVABLE entries are left empty for now */
- arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
- /* Find the PFNs that ZONE_MOVABLE begins at in each node */
- /* Fill in zone_movable_pfn for every node.
- * ZONE_MOVABLE is sized from a parameter passed on the kernel command
- * line at boot (per the original note; not visible in this excerpt).
- * It designates which pages are allowed to be moved; the remaining
- * kernel pages are called "kernel core" and cannot be moved.
- * find_zone_movable_pfns_for_nodes() walks early_node_map and, based on
- * each node's memory layout, works out the PFN at which ZONE_MOVABLE
- * starts in every node.
- */
- memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
- find_zone_movable_pfns_for_nodes(zone_movable_pfn);
- /* Print the ranges of the DMA, DMA32 and NORMAL zones */
- /* Print out the zone ranges */
- printk("Zone PFN ranges:\n");
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
- continue;
- printk(" %-8s ", zone_names[i]);
- if (arch_zone_lowest_possible_pfn[i] ==
- arch_zone_highest_possible_pfn[i])
- printk("empty\n");
- else
- printk("%0#10lx -> %0#10lx\n",
- arch_zone_lowest_possible_pfn[i],
- arch_zone_highest_possible_pfn[i]);
- }
- /* Print the ZONE_MOVABLE information */
- /* Print out the PFNs ZONE_MOVABLE begins at in each node */
- printk("Movable zone start PFN for each node\n");
- for (i = 0; i < MAX_NUMNODES; i++) {
- if (zone_movable_pfn[i])
- printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
- }
- /* Print the contents of the early_node_map array */
- /* Print out the early_node_map[] */
- printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
- for (i = 0; i < nr_nodemap_entries; i++)
- printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
- early_node_map[i].start_pfn,
- early_node_map[i].end_pfn);
- /* Initialise every node */
- mminit_verify_pageflags_layout();
- setup_nr_node_ids();
- /* Loop over all online nodes */
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid); /* pg_data_t structure of this node */
- /* Call free_area_init_node() to initialise the pg_data_t, zone and page data of node nid */
- free_area_init_node(nid, NULL,
- find_min_pfn_for_node(nid), NULL); /* find_min_pfn_for_node() looks up
- the lowest pfn of this node in the early_node_map array */
- /* Any memory on that node */
- if (pgdat->node_present_pages)
- node_set_state(nid, N_HIGH_MEMORY); /* record that this node has memory */
- check_for_regular_memory(pgdat); /* empty function when CONFIG_HIGHMEM is not defined */
- }
- }
free_area_init_nodes函數通過循環遍歷各個在線節點,並在循環中調用free_area_init_node函數初始化該節點對應的pg_data_t和zone、page的數據:
- /* Initialise the pg_data_t, zone and page data of node nid.
- * @ nid: node identifier
- * @ zones_size: NULL when called from free_area_init_nodes()
- * @ node_start_pfn: first pfn of node nid
- * @ zholes_size: NULL when called from free_area_init_nodes()
- */
- void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn, unsigned long *zholes_size)
- {
- pg_data_t *pgdat = NODE_DATA(nid); /* pg_data_t structure of this node */
- pgdat->node_id = nid; /* record the node identifier */
- pgdat->node_start_pfn = node_start_pfn; /* record the index of the node's first page frame */
- /* Work out how many pages this node has:
- * sets pg_data_t.node_spanned_pages to the node's total page frames, holes included,
- * and node_present_pages to the total page frames, holes excluded
- */
- calculate_node_totalpages(pgdat, zones_size, zholes_size);
- /* Has no effect when CONFIG_FLAT_NODE_MEM_MAP is not defined */
- alloc_node_mem_map(pgdat);
- #ifdef CONFIG_FLAT_NODE_MEM_MAP
- printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
- nid, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
- #endif
- /* Call free_area_init_core() to finish initialising pg_data_t and to initialise the zone and page structures */
- free_area_init_core(pgdat, zones_size, zholes_size);
- }
free_area_init_node函數接着調用free_area_init_core函數,進一步初始化該節點的pg_data_t結構,並初始化zone以及page結構。
free_area_init_core函數是初始化zone的核心:
- /*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- */
- /* Finish initialising the node's pg_data_t and initialise its zone and page structures.
- * pgdat: node descriptor being initialised
- * zones_size / zholes_size: forwarded unchanged to zone_spanned_pages_in_node()
- * and zone_absent_pages_in_node()
- */
- static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
- {
- enum zone_type j;
- int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
- int ret;
- pgdat_resize_init(pgdat); /* initialise the pgdat->node_size_lock spinlock */
- pgdat->nr_zones = 0;
- init_waitqueue_head(&pgdat->kswapd_wait); /* initialise the pgdat->kswapd_wait wait queue */
- pgdat->kswapd_max_order = 0; /* order of the free block the page-out daemon should create, i.e. 2^kswapd_max_order pages */
- pgdat_page_cgroup_init(pgdat); /* empty function in this configuration */
- /* Walk every zone of the node */
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, memmap_pages;
- enum lru_list l;
- /* size: number of page frames in this zone, holes included */
- size = zone_spanned_pages_in_node(nid, j, zones_size);
- /* realsize: number of page frames in this zone, holes excluded */
- realsize = size - zone_absent_pages_in_node(nid, j,
- zholes_size);
- /*
- * Adjust realsize so that it accounts for how much memory
- * is used by this zone for memmap. This affects the watermark
- * and per-cpu initialisations
- */
- /* Adjust realsize by subtracting the memory taken by the struct page entries */
- memmap_pages = /* pages needed to hold the struct page entries of all page frames, holes included */
- PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
- if (realsize >= memmap_pages) {
- realsize -= memmap_pages;
- if (memmap_pages)
- printk(KERN_DEBUG
- " %s zone: %lu pages used for memmap\n",
- zone_names[j], memmap_pages);
- } else /* realsize is too small to hold the struct page entries */
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds realsize %lu\n",
- zone_names[j], memmap_pages, realsize);
- /* Adjust realsize by subtracting the DMA reserved pages (first zone only) */
- /* Account for reserved pages */
- if (j == 0 && realsize > dma_reserve) {
- realsize -= dma_reserve;
- printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
- zone_names[0], dma_reserve);
- }
- if (!is_highmem_idx(j))
- nr_kernel_pages += realsize;
- nr_all_pages += realsize;
- zone->spanned_pages = size; /* zone->spanned_pages: page frames including holes */
- zone->present_pages = realsize; /* zone->present_pages: page frames excluding holes, less the memmap and reserved pages subtracted above */
- #ifdef CONFIG_NUMA
- zone->node = nid; /* node identifier stored in the zone */
- /* threshold computed as sysctl_min_unmapped_ratio percent of realsize */
- zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
- / 100;
- /* threshold computed as sysctl_min_slab_ratio percent of realsize */
- zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
- #endif
- zone->name = zone_names[j]; /* name of the zone */
- /* Initialise the various locks */
- spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
- zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat; /* pg_data_t of the node this zone belongs to */
- zone_pcp_init(zone); /* initialise the per-cpu page cache */
- /* Initialise the LRU-related members */
- for_each_lru(l) {
- INIT_LIST_HEAD(&zone->lru[l].list);
- zone->reclaim_stat.nr_saved_scan[l] = 0;
- }
- zone->reclaim_stat.recent_rotated[0] = 0;
- zone->reclaim_stat.recent_rotated[1] = 0;
- zone->reclaim_stat.recent_scanned[0] = 0;
- zone->reclaim_stat.recent_scanned[1] = 0;
- zap_zone_vm_stats(zone); /* reset zone->vm_stat to 0 */
- zone->flags = 0;
- if (!size)
- continue;
- set_pageblock_order(pageblock_default_order()); /* pageblock_default_order() returns 9 in this configuration (per the original note) */
- setup_usemap(pgdat, zone, size); /* empty function when CONFIG_SPARSEMEM is defined */
- /* Sets the pgdat->nr_zones and zone->zone_start_pfn members,
- * initialises the zone->free_area member and
- * initialises the zone->wait_table related members
- */
- ret = init_currently_empty_zone(zone, zone_start_pfn,
- size, MEMMAP_EARLY);
- BUG_ON(ret);
- memmap_init(size, nid, j, zone_start_pfn); /* initialise the struct page entries of this zone */
- zone_start_pfn += size; /* advance zone_start_pfn to the first page of the next zone */
- }
- }
free_area_init_core函數調用memmap_init函數來初始化page結構:
- #define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) /* wrapper that always passes the MEMMAP_EARLY context */
- /*
- * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
- * done. Non-atomic initialization, single-pass.
- */
- /* Initialise the struct page entries that belong to one zone.
- * size: number of page frames to walk, starting at start_pfn
- * nid: node the pages belong to
- * zone: index of the zone within the node's node_zones[] array
- * start_pfn: first pfn to initialise
- * context: with MEMMAP_EARLY, pfns that are invalid or not in nid are skipped
- */
- void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context)
- {
- struct page *page;
- unsigned long end_pfn = start_pfn + size;
- unsigned long pfn;
- struct zone *z;
- if (highest_memmap_pfn < end_pfn - 1) /* raise the highest pfn known to have a memmap entry */
- highest_memmap_pfn = end_pfn - 1;
- z = &NODE_DATA(nid)->node_zones[zone]; /* pointer to the zone being initialised */
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- /*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
- */
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- }
- page = pfn_to_page(pfn); /* struct page for this pfn; its contents are not initialised yet */
- set_page_links(page, zone, nid, pfn);/* set the zone, node and section bits in page->flags */
- mminit_verify_page_links(page, zone, nid, pfn);/* debug-only check, can be ignored */
- init_page_count(page); /* set the page->_count reference count to 1 */
- reset_page_mapcount(page); /* set page->_mapcount to -1 */
- SetPageReserved(page); /* mark the page reserved; all pages start out reserved, as the header comment says */
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made. Later some blocks near
- * the start are marked MIGRATE_RESERVE by
- * setup_zone_migrate_reserve()
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- */
- if ((z->zone_start_pfn <= pfn)
- && (pfn < z->zone_start_pfn + z->spanned_pages)
- && !(pfn & (pageblock_nr_pages - 1)))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- INIT_LIST_HEAD(&page->lru); /* initialise the lru list head */
- #ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- /* set page->virtual to the virtual address of the page frame */
- set_page_address(page, __va(pfn << PAGE_SHIFT));
- #endif
- }
- }
這樣經過paging_init函數,pg_data_t、zone、page等結構完成了初始化。