前面說到kmalloc是基於slab分配器來實現的,其分配的物理內存是連續的,但是kmalloc一次分配的內存不能太大。現在說vmalloc:vmalloc分配的虛擬內存是連續的,其分配的區間爲內存初始化時分配的從VMALLOC_START到VMALLOC_END區間,分配的虛擬內存是以PAGE_SIZE對齊的:
/*
* vmalloc - entry point of the allocation path discussed in this article.
* @size: requested size in bytes
*
* Forwards the request to __vmalloc_node_flags() with NUMA_NO_NODE (no
* node preference) and GFP_KERNEL | __GFP_HIGHMEM, and returns whatever
* that call returns.
*/
void *vmalloc(unsigned long size)
{
return __vmalloc_node_flags(size, NUMA_NO_NODE,
GFP_KERNEL | __GFP_HIGHMEM);
}
vmalloc函數的調用關係爲:
vmalloc()->__vmalloc_node_flags()->__vmalloc_node()->__vmalloc_node_range():
/*
* __vmalloc_node_range - reserve a virtual range and back it with pages.
* @size: requested size in bytes (rounded up to a page multiple here)
* @align, @start, @end, @node, @caller: passed through to
*   __get_vm_area_node()
* @gfp_mask: passed to both helpers and to the failure warning
* @prot: passed to __vmalloc_area_node()
* @vm_flags: OR-ed with VM_ALLOC | VM_UNINITIALIZED for the new area
*
* Returns the address handed back by __vmalloc_area_node(), or NULL.
*
* NOTE: the trailing "-----(n)" tags on some lines are annotation markers
* of the surrounding article, not part of the C code.
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller)
{
struct vm_struct *area;
void *addr;
/* Keep the caller's original size for kmemleak and the warning message. */
unsigned long real_size = size;
size = PAGE_ALIGN(size);------------------(1)
/* Reject a zero-sized request or one needing more pages than totalram_pages. */
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
goto fail;
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);-------(2)
if (!area)
goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node);--------------(4)
/*
* No "goto fail" here: the failure paths of __vmalloc_area_node() shown
* in this file already release the area, and its page-allocation and
* mapping failures already emit their own warning.
*/
if (!addr)
return NULL;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
clear_vm_uninitialized_flag(area);
/*
* A ref_count = 2 is needed because vm_struct allocated in
* __get_vm_area_node() contains a reference to the virtual address of
* the vmalloc'ed block.
*/
kmemleak_alloc(addr, real_size, 2, gfp_mask);
return addr;
fail:
/* Reached only before any area exists, so there is nothing to free. */
warn_alloc_failed(gfp_mask, 0,
"vmalloc: allocation failure: %lu bytes\n",
real_size);
return NULL;
}
(1)從這裏可以看出vmalloc分配內存是以頁大小對齊來分配的,即使只分配10Byte大小內存,實際也會分配一頁。
(2)vmalloc的核心函數,主要用於初始化vm_struct結構體,後面將會講到
(4)vmalloc核心函數,主要負責分配頁面,並建立從虛擬地址到物理地址的映射
先看__get_vm_area_node():
/*
* __get_vm_area_node - allocate a vm_struct and reserve a virtual range.
* @size: size in bytes; page-aligned here, plus one extra page unless
*        VM_NO_GUARD is set in @flags
* @align: alignment; overridden for VM_IOREMAP requests
* @flags: VM_* flags, passed on to setup_vmalloc_vm()
* @start, @end: bounds handed to alloc_vmap_area()
* @node, @gfp_mask, @caller: passed to the allocation/setup helpers
*
* Returns the new vm_struct, or NULL if the size is zero after alignment
* or either allocation fails.
*
* NOTE: the trailing "-----(n)" tags on some lines are annotation markers
* of the surrounding article, not part of the C code.
*/
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
/* Calling this from interrupt context is treated as a bug. */
BUG_ON(in_interrupt());
/*
* For ioremap areas the alignment is derived from the size:
* 2^fls(size), clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER] bits.
*/
if (flags & VM_IOREMAP)
align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
size = PAGE_ALIGN(size);--------------(1)
if (unlikely(!size))
return NULL;
/* Zeroed vm_struct descriptor; only the GFP_RECLAIM_MASK bits are kept. */
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);--------(2)
if (unlikely(!area))
return NULL;
/* One extra page is reserved unless the caller opted out (presumably a guard page, going by the flag name). */
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);-----------------(3)
/* alloc_vmap_area() reports failure via ERR_PTR(), not NULL. */
if (IS_ERR(va)) {
kfree(area);
return NULL;
}
setup_vmalloc_vm(area, va, flags, caller);------------(4)
return area;
}
(1)再次確認分配size爲頁對齊
(2)分配一個vm_struct結構體
(3)此函數比較複雜,涉及到紅黑樹等數據結構,主要用來從vmalloc area中去尋找一塊合適的內存用於內存分配
(4)設置vm_struct以及vmap_area結構體
alloc_vmap_area():
/*
* alloc_vmap_area - find and reserve a free virtual address range.
* @size: length of the range; must be non-zero and page-aligned
* @align: required alignment; must be a power of two
* @vstart, @vend: the range must lie within [vstart, vend]
* @node, @gfp_mask: used for allocating the vmap_area descriptor
*
* Returns the new vmap_area (already inserted via __insert_vmap_area()),
* ERR_PTR(-ENOMEM) if the descriptor cannot be allocated, or
* ERR_PTR(-EBUSY) if no suitable range is found even after one
* purge_vmap_area_lazy() + retry.
*
* NOTE: the trailing "-----(n)" tags on some lines are annotation markers
* of the surrounding article, not part of the C code.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
struct rb_node *n;
unsigned long addr;
/* Set once purge_vmap_area_lazy() has been tried, so we retry only once. */
int purged = 0;
struct vmap_area *first;
/* Argument sanity checks. */
BUG_ON(!size);
BUG_ON(size & ~PAGE_MASK);
BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);----------(1)
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
/*
* Only scan the relevant parts containing pointers to other objects
* to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
retry:
/* The whole search and the insertion run under vmap_area_lock. */
spin_lock(&vmap_area_lock);
/*
* Invalidate cache if we have more permissive parameters.
* cached_hole_size notes the largest hole noticed _below_
* the vmap_area cached in free_vmap_cache: if size fits
* into that hole, we want to scan from vstart to reuse
* the hole instead of allocating above free_vmap_cache.
* Note that __free_vmap_area may update free_vmap_cache
* without updating cached_hole_size or cached_align.
*/
if (!free_vmap_cache ||
size < cached_hole_size ||
vstart < cached_vstart ||
align < cached_align) {
/*
* Also entered by "goto nocache" from the cached-start branch below;
* after the reset, control falls through and the search restarts
* from vstart via the else branch.
*/
nocache:
cached_hole_size = 0;
free_vmap_cache = NULL;
}
/* record if we encounter less permissive parameters */
cached_vstart = vstart;
cached_align = align;
/* find starting point for our search */
if (free_vmap_cache) {
/* Start right after the cached area. */
first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
/* addr + size wrapped around the address space. */
if (addr + size < addr)
goto overflow;
} else {
addr = ALIGN(vstart, align);-------------(2)
if (addr + size < addr)
goto overflow;
n = vmap_area_root.rb_node;---------(3)
first = NULL;
/*
* Tree walk: remember every area that ends at or after addr and
* keep descending left for a lower one; stop early when an area
* also starts at or before addr. Areas ending below addr send
* the walk to the right.
*/
while (n) {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
if (tmp->va_end >= addr) {----------(4)
first = tmp;
if (tmp->va_start <= addr)
break;
n = n->rb_left;
} else
n = n->rb_right;
}
/* No existing area ends at or after addr: addr itself is usable. */
if (!first)
goto found;
}
/* from the starting point, walk areas until a suitable hole is found */
while (addr + size > first->va_start && addr + size <= vend) {-----------(5)
/* Track the largest gap skipped in front of an area. */
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
/* Candidate collides with "first": move just past it. */
addr = ALIGN(first->va_end, align);
if (addr + size < addr)
goto overflow;
if (list_is_last(&first->list, &vmap_area_list))------------(6)
goto found;
first = list_entry(first->list.next,
struct vmap_area, list);
}
found:
/* The candidate must still end at or before vend. */
if (addr + size > vend)
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
__insert_vmap_area(va);-------------------(7)
/* The next search may start from the area just inserted. */
free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
/* Post-conditions of a successful search. */
BUG_ON(va->va_start & (align-1));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
return va;
overflow:
spin_unlock(&vmap_area_lock);
/* First failure: purge lazily-freed areas and search once more. */
if (!purged) {
purge_vmap_area_lazy();
purged = 1;
goto retry;
}
if (printk_ratelimit())
printk(KERN_WARNING
"vmap allocation for size %lu failed: "
"use vmalloc=<size> to increase size.\n", size);
#ifdef CONFIG_HTC_DEBUG_VMALLOC_DUMP
/*
* Config-gated debug dump: runs on the first failure, or once
* last_dump_jiffies + DUMP_VMALLOC_INTERVAL lies in the past.
*/
if((last_dump_jiffies == 0) || time_is_before_jiffies(last_dump_jiffies + DUMP_VMALLOC_INTERVAL)) {
dump_vmallocinfo();
last_dump_jiffies = jiffies;
}
#endif
kfree(va);
return ERR_PTR(-EBUSY);
}
(1)分配一個vmap_area結構體
(2)將初始地址設置爲VMALLOC_START對齊到align後的地址
(3)從紅黑樹vmap_area_root開始搜索整個紅黑樹,找到滿足要求的內存塊,如果此紅黑樹沒有節點,說明系統沒有使用vmalloc分配內存空間。
(4)找到起始地址最小的內存塊,由於此時addr爲VMALLOC_START值,故此if語句會一直滿足條件,直到找到最小的內存塊且其左子節點爲空,此時就找到滿足要求的子節點。示意圖如下:
(5)while循環負責檢查前面已經分配了的各vmalloc區間之間的縫隙中是否有合適的size可以滿足當前內存分配要求,如果有則分配,如果沒有則一直走到所有已分配內存塊的末尾,在其後分配size大小的內存。
(6)判斷當前的vmalloc節點是否爲vmap_area_list的最後一個節點,如果是則跳出循環,在此內存塊後面分配內存。管理vmalloc已分配內存塊的數據結構有兩個:一個是vmap_area_root紅黑樹,還有一個是雙向鏈表vmap_area_list。
(7)將新分配的vmalloc節點添加到紅黑樹中。
__vmalloc_area_node():
/*
* __vmalloc_area_node - allocate the pages for @area and map them.
* @area: vm_struct describing the reserved virtual range
* @gfp_mask: allocation flags for the pages (and, masked, for the
*            page-pointer array)
* @prot: page protection passed to map_vm_area()
* @node: NUMA node to allocate from, or NUMA_NO_NODE
*
* Returns area->addr on success. On failure returns NULL after releasing
* the area: remove_vm_area() + kfree() if the pointer array could not be
* allocated, otherwise vfree(area->addr).
*
* NOTE: the trailing "-----(n)" tags on some lines are annotation markers
* of the surrounding article, not part of the C code.
*/
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
/* Pages are allocated one at a time (order 0). */
const int order = 0;
struct page **pages;
unsigned int nr_pages, array_size, i;
/* Flags for the page-pointer array: reclaim bits only, zero-filled. */
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
/* Per-page allocations suppress the allocator's own warning; the fail path below warns instead. */
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;---------(1)
array_size = (nr_pages * sizeof(struct page *));---------(2)
area->nr_pages = nr_pages;-----------(3)
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {--------------(4)
/* Array larger than one page: allocate it via __vmalloc_node() and flag the area VM_VPAGES. */
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, area->caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;-----------(5)
/* No pages were allocated yet, so only the range and the descriptor are released. */
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
for (i = 0; i < area->nr_pages; i++) {----------(6)
struct page *page;
if (node == NUMA_NO_NODE)
page = alloc_page(alloc_mask);
else
page = alloc_pages_node(node, alloc_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
goto fail;
}
area->pages[i] = page;--------------(7)
/* Give other tasks a chance to run when the caller's flags include __GFP_WAIT. */
if (gfp_mask & __GFP_WAIT)
cond_resched();
}
/* A non-zero return from map_vm_area() is treated as failure. */
if (map_vm_area(area, prot, pages))-------------(8)
goto fail;
return area->addr;
fail:
warn_alloc_failed(gfp_mask, order,
"vmalloc: allocation failure, allocated %ld of %ld bytes\n",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
}
(1)計算當前分配內存的頁數。
(2)計算管理當前分配的頁面所需要內存大小。
(3)vm_struct部分成員初始化,nr_pages表示結構體所管理的內存大小的頁數。
(4)如果管理內存塊所需要的內存大於一個頁面就使用__vmalloc_node分配,否則可以使用kmalloc_node分配。
(5)將內存管理區的首地址賦給vm_struct的pages成員,pages數組成員存放的是管理各內存頁面的struct page的結構體的首地址。
(6)for循環使用alloc_pages分配頁面。
(7)如(5)所述,將分配到的頁面管理結構體struct page賦值給pages數組成員。
(8)建立所分配虛擬內存到物理內存的映射,由此可以看出vmalloc分配內存是在分配時建立的內存映射。
map_vm_area()->vmap_page_range_noflush():
/*
* vmap_page_range_noflush - install mappings for [start, end).
* @start, @end: virtual address range; start must be below end
* @prot: page protection passed down to vmap_pud_range()
* @pages: page array passed down to vmap_pud_range()
*
* Walks the kernel page-table top level (pgd) entries covering the range
* and calls vmap_pud_range() for each sub-range. Returns the first
* non-zero value returned by vmap_pud_range(), otherwise nr (presumably
* the number of pages mapped, maintained by the lower levels through the
* &nr argument).
*
* NOTE: the trailing "-----(n)" tag is an annotation marker of the
* surrounding article, not part of the C code.
*/
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
unsigned long addr = start;
int err = 0;
int nr = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);-----------(1)
do {
/* Sub-range for the current pgd entry, as computed by pgd_addr_end(). */
next = pgd_addr_end(addr, end);
err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
return nr;
}
(1)熟悉的節奏,和內存初始化建立虛擬內存到物理內存的映射原理一致,最後調用set_pte_at()將page的頁幀號以及頁面的屬性填寫到pte所在的地址裏面。其中page到頁幀號的轉換流程爲page->virtual address->pfn.
建立好虛擬地址到物理地址的映射後,到此vmalloc分配內存的流程結束