進程地址空間 get_unmapped_area()
在向數據結構插入新的內存區域之前,內核必須確認虛擬地址空間中有足夠的空閒空間,可用於給定長度的區域。該工作由get_unmapped_area()完成。
在分析get_unmapped_area()之前,先簡單介紹一下進程地址空間的佈局。
進程地址空間 經典佈局:
經典佈局的缺點:在x86_32,虛擬地址空間從0到0xc0000000,每個用戶進程有3GB可用。TASK_UNMAPPED_BASE一般起始於0x40000000(即1GB)。這意味着堆只有約1GB的空間可供使用,繼續增長則進入到mmap區域。這時mmap區域是自底向上擴展的。
針對這個問題,引入了新的虛擬地址空間:
與經典佈局不同的是:使用固定值限制棧的最大長度。由於棧是有界的,因此安置內存映射的區域可以在棧末端的下方立即開始。這時mmap區是自頂向下擴展的。由於堆仍然位於虛擬地址空間中較低的區域並向上增長,因此mmap區域和堆可以相對擴展,直至耗盡虛擬地址空間中剩餘的區域。
選擇佈局的工作由arch_pick_mmap_layout完成。其中arch_get_unmapped_area()完成從低地址向高地址創建新的映射,而arch_get_unmapped_area_topdown()完成從高地址向低地址創建新的映射。
include/linux/sched.h
...
#ifdef CONFIG_MMU
/* With CONFIG_MMU the layout selection routine is only declared here; it is defined elsewhere. */
extern void arch_pick_mmap_layout(struct mm_struct *mm);
...
#else
/* Without CONFIG_MMU the routine is an empty inline stub. */
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
mm/util.c
...
/*
 * HAVE_ARCH_PICK_MMAP_LAYOUT: defined when the architecture wants to
 * choose between different mmap region layouts itself. The generic
 * version below is only compiled when it is NOT defined.
 */
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Classic layout: the mmap area starts at TASK_UNMAPPED_BASE and new
 * mappings are placed by arch_get_unmapped_area().
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
mm->get_unmapped_area = arch_get_unmapped_area;
}
#endif
arch/x86/mm/mmap.c
...
/*
 * This function, called very early during the creation of a new
 * process VM image, sets up which VM layout function to use:
 *
 * - legacy layout:  mmap_base = mmap_legacy_base, allocator =
 *                   arch_get_unmapped_area
 * - otherwise:      mmap_base = mmap_base(random_factor), allocator =
 *                   arch_get_unmapped_area_topdown
 *
 * mm->mmap_legacy_base is filled in for both layouts.
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
/*
 * When PF_RANDOMIZE is set on the current task, a random factor from
 * arch_mmap_rnd() is fed into the base calculations below instead of
 * 0, so the mapping base is not a fixed position.
 * NOTE(review): presumably the value differs for every new process;
 * arch_mmap_rnd() is not shown here - confirm.
 */
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
mm->mmap_legacy_base = mmap_legacy_base(random_factor);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
現在我們看看get_unmapped_area()中的一些細節。
/*
 * get_unmapped_area - pick an address for a new mapping of @len bytes.
 * @file:  file being mapped, or NULL
 * @addr:  address hint passed through to the search routine
 * @len:   length of the requested region
 * @pgoff: offset argument passed through to the search routine
 * @flags: mapping flags passed through to the search routine
 *
 * Returns the chosen address, or an error value: whatever
 * arch_mmap_check(), the search routine or security_mmap_addr() report,
 * -ENOMEM when the region cannot fit below TASK_SIZE, and -EINVAL when
 * offset_in_page() of the chosen address is non-zero.
 */
unsigned long get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len,unsigned long pgoff, unsigned long flags)
{
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
unsigned long error = arch_mmap_check(addr, len, flags);
if (error)
return error;
/* Careful about overflows.. */
if (len > TASK_SIZE)
return -ENOMEM;
/* Default search routine: the one installed in the mm of the current task. */
get_area = current->mm->get_unmapped_area;
/* A file whose f_op supplies its own get_unmapped_area overrides the default. */
if (file && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
/*
 * If no override applied, this calls current->mm->get_unmapped_area,
 * i.e. arch_get_unmapped_area or arch_get_unmapped_area_topdown,
 * whichever arch_pick_mmap_layout() installed.
 */
addr = get_area(file, addr, len, pgoff, flags);
/* Propagate an error value returned by the search routine. */
if (IS_ERR_VALUE(addr))
return addr;
/* Written as a subtraction so that addr + len cannot wrap around. */
if (addr > TASK_SIZE - len)
return -ENOMEM;
if (offset_in_page(addr))
return -EINVAL;
addr = arch_rebalance_pgtables(addr, len);
/* The security hook has the final say on the chosen address. */
error = security_mmap_addr(addr);
return error ? error : addr;
}
EXPORT_SYMBOL(get_unmapped_area);
以arch_get_unmapped_area爲例。當addr非零,表示指定了一個特定的優先選用地址,內核會檢查該區域是否與現存區域重疊,由find_vma()完成查找功能。當addr爲零或是指定的優先地址不滿足分配條件時,內核必須遍歷進程中可用的區域,設法找到一個大小適當的空閒區域,由vm_unmapped_area()做實際的工作。
/*
 * arch_get_unmapped_area - bottom-up search for a free address range.
 * @filp:  not used by this implementation
 * @addr:  preferred address, 0 for "no preference"
 * @len:   length of the requested region
 * @pgoff: not used by this implementation
 * @flags: only MAP_FIXED is examined here
 *
 * Returns @addr unchanged for MAP_FIXED, the page-aligned hint when the
 * hinted range is usable, otherwise the result of vm_unmapped_area()
 * searching [mm->mmap_base, TASK_SIZE). Returns -ENOMEM when @len
 * exceeds TASK_SIZE - mmap_min_addr.
 */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
/* MAP_FIXED: the mapping is to be created at exactly this address. */
if (flags & MAP_FIXED)
return addr;
if (addr) {
addr = PAGE_ALIGN(addr);
/*
 * NOTE(review): find_vma() is not shown here; it is expected to return
 * the first VMA satisfying addr < vma->vm_end, or NULL when addr lies
 * above every existing VMA - confirm against its definition.
 */
vma = find_vma(mm, addr);
/*
 * The hint is accepted only if all of the following hold:
 * 1: the range [addr, addr + len) ends at or below TASK_SIZE
 *    (written as a subtraction to avoid overflow);
 * 2: addr is not below mmap_min_addr, the lowest address at which
 *    a mapping may be placed;
 * 3: either no VMA was returned by find_vma(), or
 * 4: the requested range ends at or before the start of that VMA,
 *    i.e. it does not overlap it.
 */
if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
/*
 * No usable hint: describe the search. flags = 0 leaves
 * VM_UNMAPPED_AREA_TOPDOWN clear, so vm_unmapped_area() dispatches to
 * unmapped_area(). info.align_offset is not set here; with
 * align_mask = 0 the alignment adjustment in unmapped_area() is masked
 * to zero, so its value does not influence the result.
 */
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
/*
 * Search for an unmapped address range.
 *
 * We are looking for a range that:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size.
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
 *
 * Dispatches on info->flags: VM_UNMAPPED_AREA_TOPDOWN selects
 * unmapped_area_topdown(), anything else selects unmapped_area().
 */
static inline unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
/*
 * arch_get_unmapped_area() creates mappings from low to high addresses
 * and passes flags = 0, so on that path unmapped_area() is the one called.
 */
if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
return unmapped_area_topdown(info);
else
return unmapped_area(info);
}
在分析unmapped_area()之前,我認爲有必要搞清楚vm_area_struct結構體中rb_subtree_gap的含義。在http://patchwork.ozlabs.org/patch/197340/ 這樣解釋:
Define vma->rb_subtree_gap as the largest gap between any vma in the subtree rooted at that vma, and their predecessor. Or, for a recursive definition, vma->rb_subtree_gap is the max of:
- vma->vm_start - vma->vm_prev->vm_end
- rb_subtree_gap fields of the vmas pointed by vma->rb.rb_left and
vma->rb.rb_right
rb_subtree_gap是當前結點與其前驅結點之間空隙 和 當前結點其左右子樹中的結點間的最大空隙的最大值。
unmapped_area():先檢查進程虛擬地址空間中可用於映射空間的邊界,不滿足要求時返回錯誤碼-ENOMEM到上層。當滿足時,執行以下操作:從低地址向高地址查找,找到地址最低的、能滿足這次分配請求的空閒區域,便於兩個相鄰的vma區合併。
步驟如下:
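1. 從vma紅黑樹的根開始遍歷;若樹爲空,或根結點的rb_subtree_gap小於所需長度,則直接檢查最高的vma之後的空隙(check_highest)。
2. 若當前結點的vm_start不低於low_limit,且其左子樹的rb_subtree_gap能滿足分配請求,則進入左子樹,重複本步驟。
3. 否則左子樹中不可能再有合適的空隙,轉而檢查當前結點本身。
4. 檢測這個結點,判斷這個結點與其前驅結點之間的空隙是否滿足分配請求。若空隙起點已超過high_limit,返回-ENOMEM;滿足則跳出循環。
5. 不滿足分配請求時,若有右孩子,判斷其右孩子的rb_subtree_gap是否滿足當前請求。
6. 滿足則進入右子樹並返回到2。不滿足,則沿父結點向上回退,直到從某個結點的左子樹回到該結點,然後對該結點返回到4;若已回退到根仍未找到,則檢查最高的vma之後的空隙。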
/*
 * unmapped_area - bottom-up search for a gap described by @info.
 * @info: search request; length, low_limit, high_limit, align_mask and
 *        align_offset are read here.
 *
 * Walks the VMA rbtree of current->mm, always preferring the left
 * (lower address) side, and uses rb_subtree_gap to skip subtrees that
 * cannot contain a large enough gap. Returns the start address of the
 * gap after clipping to low_limit and applying the alignment, or
 * -ENOMEM when no gap fits.
 */
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
* We implement the search by looking for an rbtree node that
* immediately follows a suitable gap. That is,
* - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
* - gap_end = vma->vm_start >= info->low_limit + length;
* - gap_end - gap_start >= length
*/
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
/* The sum wrapped around: the request can never be satisfied. */
if (length < info->length)
return -ENOMEM;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
return -ENOMEM;
/* From here on high_limit is the highest acceptable gap start ... */
high_limit = info->high_limit - length;
if (info->low_limit > high_limit)
return -ENOMEM;
/* ... and low_limit is the lowest acceptable gap end. */
low_limit = info->low_limit + length;
/* Check if rbtree root looks promising */
if (RB_EMPTY_ROOT(&mm->mm_rb))
goto check_highest;
vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
if (vma->rb_subtree_gap < length)
goto check_highest;
while (true) {
/* Visit left subtree if it looks promising */
/* Lower addresses are explored first. */
gap_end = vma->vm_start;
if (gap_end >= low_limit && vma->vm_rb.rb_left) {
struct vm_area_struct *left =
rb_entry(vma->vm_rb.rb_left,struct vm_area_struct, vm_rb);
/*
 * The left subtree reports a gap of at least `length`, so descend
 * into it; always trying the left side first makes the search run
 * from low to high addresses.
 * NOTE(review): the original annotation says this also eases merging
 * of adjacent VMAs - not verifiable from this function.
 */
if (left->rb_subtree_gap >= length) {
vma = left;
continue;
}
}
/*
 * The left subtree is not a candidate, so the gap just before this
 * node is the next one to examine. A node without a predecessor has
 * its gap starting at address 0.
 */
gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
return -ENOMEM;
if (gap_end >= low_limit && gap_end - gap_start >= length)
goto found;
/* Visit right subtree if it looks promising */
/*
 * The gap between this node and its predecessor did not satisfy the
 * request; look at the rb_subtree_gap of the right child.
 */
if (vma->vm_rb.rb_right) {
struct vm_area_struct *right =
rb_entry(vma->vm_rb.rb_right,
struct vm_area_struct, vm_rb);
/*
 * Does the subtree rooted at the right child hold a large enough gap?
 * case 1: yes - restart the loop there, which again begins with
 *         that node's left subtree;
 * case 2: no - nothing below the current node is left to try, so
 *         climb back towards the root.
 */
if (right->rb_subtree_gap >= length) {//case 1
vma = right;
continue;
}
}
/* Go back up the rbtree to find next candidate node */
while (true) {//case 2
struct rb_node *prev = &vma->vm_rb;
/* Reached the root without a candidate: only the topmost gap remains. */
if (!rb_parent(prev))
goto check_highest;
vma = rb_entry(rb_parent(prev),
struct vm_area_struct, vm_rb);
/*
 * Coming up from the left child means the parent's own gap has not
 * been examined yet, so check it next. Coming up from the right
 * child, keep climbing.
 */
if (prev == vma->vm_rb.rb_left) {
gap_start = vma->vm_prev->vm_end;
gap_end = vma->vm_start;
goto check_current;
}
}
}
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
if (gap_start < info->low_limit)
gap_start = info->low_limit;
/* Adjust gap address to the desired alignment */
gap_start += (info->align_offset - gap_start) & info->align_mask;
/* Sanity checks: the result must lie inside both the limits and the gap. */
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
return gap_start;
}
參考源碼:linux-4.4