进程空间的VMA结构体中由两个成员,红黑树以及链表,表示其组织方式由两种:
红黑树:
链表类型:
有了上面的图例,下面简单讲一下malloc的分配原理:
首先malloc是通过系统调用brk来完成内存分配的:
SYSCALL_DEFINE1(brk, unsigned long, brk)------------(1)
{
unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
bool populate;
down_write(&mm->mmap_sem);
min_brk = mm->end_data;
if (brk < min_brk)---------------(2)
goto out;
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
mm->end_data, mm->start_data))-------------(3)
goto out;
newbrk = PAGE_ALIGN(brk);--------(4)
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
goto set_brk;
/* Always allow shrinking brk. */
if (brk <= mm->brk) {--------------(5)
if (!do_munmap(mm, newbrk, oldbrk-newbrk))
goto set_brk;
goto out;
}
/* Check against existing mmap mappings. */
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))-------------(6)
goto out;
/* Ok, looks good - let it rip. */
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)---------------(7)
goto out;
set_brk:
mm->brk = brk;
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
up_write(&mm->mmap_sem);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);---------(8)
return brk;
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}
(1)参数brk表示heap新的边界线
(2)如果新的边界线低于data段,则说明分配出现问题,直接return原来的oldbrk。
(3)RLIMIT相关参数是否满足要求
(4)新旧brk都按照页面对齐,且不足一页部分采用进一法对齐
(5)新边界小于老边界,表示要释放内存,调用do_munmap()函数完成内存释放
(6)查看是否由已经分配好的vma包含当前需要分配的内存边界,如果由则直接return。
(7)调用主要功能函数do_brk来完成分配。
(8)如果flag带有VM_FLAG标志时需要立即为这块进程地址空间分配内存调用mm_populate。
do_brk:
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
unsigned long flags;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
len = PAGE_ALIGN(len);
if (!len)
return addr;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);-------(1)
if (error & ~PAGE_MASK)
return error;
error = mlock_future_check(mm, mm->def_flags, len);
if (error)
return error;
/*
* mm->mmap_sem is required to protect against another thread
* changing the mappings in case we sleep.
*/
verify_mm_writelocked(mm);
/*
* Clear old maps. this also does some error checking for us
*/
munmap_back:
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {------(2)
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL);-----(3)
if (vma)
goto out;
/*
* create a vma struct for an anonymous mapping
*/
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);----------(4)
if (!vma) {
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
vma_link(mm, vma, prev, rb_link, rb_parent);------------(5)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
return addr;
}
(1)检查是否由足够的空间来分配len大小的内存。
(2)通过find_vma_links函数来查找要插入VMA的合适位置,同时check分配的内存块是否由和当前的vma由重叠或者包含关系,如果有调用do_munmap来直接释放掉要分配的内存块。
(3)当前面已经找到了合适的插入点,则check此时的内存块和相邻的VMA是否可以合并。
(4)如果不能合并vma,则申请一个slab对象来存放vma结构体。
(5)初始化vma结构体,并将新的vma添加到红黑树上面已经相应的链表中
到这里malloc就结束了,从以上可看出malloc在完成调用后并没有立马分配物理内存,而是在需要在进程需要访问此虚拟内存块时才会产生缺页中断,才会分配物理内存,并建立虚拟内存到物理内存的映射。以上情况除了当分配flag带有VM_MLOCK时需要立即调用mm_populate来分配物理内存:
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
struct mm_struct *mm = current->mm;
unsigned long end, nstart, nend;
struct vm_area_struct *vma = NULL;
int locked = 0;
long ret = 0;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
for (nstart = start; nstart < end; nstart = nend) {--------------(1)
/*
* We want to fault in pages for [nstart; end) address range.
* Find first corresponding VMA.
*/
if (!locked) {
locked = 1;
down_read(&mm->mmap_sem);
vma = find_vma(mm, nstart);-----------(2)
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
if (!vma || vma->vm_start >= end)
break;
/*
* Set [nstart; nend) to intersection of desired address
* range with the first VMA. Also, skip undesirable VMA types.
*/
nend = min(end, vma->vm_end);
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
continue;
if (nstart < vma->vm_start)
nstart = vma->vm_start;
/*
* Now fault in a range of pages. __mlock_vma_pages_range()
* double checks the vma flags, so that it won't mlock pages
* if the vma was already munlocked.
*/
ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);----------(3)
if (ret < 0) {
if (ignore_errors) {
ret = 0;
continue; /* continue at next VMA */
}
ret = __mlock_posix_error_return(ret);
break;
}
nend = nstart + ret * PAGE_SIZE;
ret = 0;
}
if (locked)
up_read(&mm->mmap_sem);
return ret; /* 0 or negative error code */
}
(1)循环主要是怕内存块的映射会涉及到多个VMA。
(2)调用find_vma()找到虚拟内存块对应的合适的VMA,find_vma()函数执行原理如下:
address A和B调用find_vma()函数都会找到VMA(n+1)
(3)调用__mlock_vma_pages_range()为vma分配物理内存。
__mlock_vma_pages_range():
long __mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long nr_pages = (end - start) / PAGE_SIZE;--------(1)
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);-------------(2)
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
gup_flags = FOLL_TOUCH | FOLL_MLOCK;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
*/
if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
/*
* We want mlock to succeed for regions that have any permissions
* other than PROT_NONE.
*/
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;
/*
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
return __get_user_pages(current, mm, start, nr_pages, gup_flags,
NULL, NULL, nonblocking);-----------------(3)
}
(1)分配物理内存的页数
(2)错误检查内存块必须包含在vma内存块内。
(3)分配内存实际起作用函数__get_user_pages()。
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
long i = 0;
unsigned int page_mask;
struct vm_area_struct *vma = NULL;
if (!nr_pages)
return 0;
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);--------------------(1)
if (!vma && in_gate_area(mm, start)) {
int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
page_mask = 0;
goto next_page;
}
if (!vma || check_vma_flags(vma, gup_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags);
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &page_mask);-------(2)
if (!page) {
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);----------(3)
switch (ret) {
case 0:
goto retry;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
return i ? i : ret;
case -EBUSY:
return i;
case -ENOENT:
goto next_page;
}
BUG();
}
if (IS_ERR(page))
return i ? i : PTR_ERR(page);
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
return i;
}
(1)再次查找vma,如果vm_start大于起始地址,则扩容vma。
(2)follow_page_mask()函数查看VMA中的虚拟页面是否已经分配了物理内存。返回已经normal mapping的页面的struct page
(3)如果第二步没有返回page,则调用faultin_page()函数进而调用mm_handle_fault来触发一个缺页中断,进而来分配内存
到这里malloc函数就讲完了。