函數原型
/*
 * mmap() prototype as quoted by this article: takes addr, length, prot,
 * flags, fd and offset, and returns a void pointer.
 */
void *mmap(void *addr, size_t length, int prot, int flags,
int fd, off_t offset);
虛擬內存管理
從內核的角度來看,虛擬空間的管理是以進程爲基礎的:每個進程都有自己的用戶態虛存空間,而內核空間則由所有進程共享。一個進程的虛擬空間由兩個數據結構來描述:mm_struct和vm_area_struct。
mm_struct包括進程中虛擬地址空間的所有信息
/*
 * Abridged excerpt of struct mm_struct, which holds the information about a
 * process's virtual address space. Each "..." comment marks members that the
 * excerpt leaves out.
 */
struct mm_struct {
	struct {
		struct vm_area_struct *mmap;	/* list of vm_area_structs */
		pgd_t *pgd;			/* points to the process's page directory */
		/* ... */
		int map_count;			/* number of vm_area_structs */
		/* ... */
		unsigned long total_vm;		/* number of mapped pages */
		/* ... */
		/* start/end of the code segment and of the data segment */
		unsigned long start_code, end_code, start_data, end_data;
		/* start and end of the heap; the stack only records its start */
		unsigned long start_brk, brk, start_stack;
		/* start/end of the argument area and of the environment area */
		unsigned long arg_start, arg_end, env_start, env_end;
		/* ... */
	};
};
vm_area_struct描述了虛擬地址空間的一個區間,一個進程的虛擬空間可能有多個虛擬區間
下圖是某個進程的虛擬內存簡化佈局以及相應的幾個數據結構之間的關係:
mmap的映射原理
1.檢查參數,並根據傳入的映射類型設置vma的flags
2.進程查找其虛擬地址空間,找到一塊空閒的滿足要求的虛擬地址空間
3.根據找到的虛擬地址空間初始化vma
4.設置vma->vm_file
5.根據文件系統類型,調用其file_operations中的mmap方法,將vma->vm_ops
設爲對應的vm_operations_struct
6.將vma
插入mm
的鏈表中
源碼分析
do_mmap()是整個mmap()
的具體操作函數
/*
 * do_mmap() - validate an mmap() request and hand it to mmap_region().
 *
 * Checks the arguments, obtains the target address from get_unmapped_area(),
 * builds vm_flags from prot and flags, and finally calls mmap_region().
 *
 * On return *populate is len when the mapping succeeded and is either
 * VM_LOCKED or was requested with MAP_POPULATE but without MAP_NONBLOCK;
 * otherwise it is 0.
 *
 * Returns the value produced by mmap_region(), or a negative errno value
 * when one of the checks below fails.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, vm_flags_t vm_flags,
			unsigned long pgoff, unsigned long *populate,
			struct list_head *uf)
{
	struct mm_struct *mm = current->mm;	/* memory descriptor of the current process */
	int pkey = 0;

	*populate = 0;

	/*
	 * The arguments are validated step by step; any invalid argument
	 * makes the function return an errno value.
	 */
	if (!len)
		return -EINVAL;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && path_noexec(&file->f_path)))
			prot |= PROT_EXEC;

	/* force arch specific MAP_FIXED handling in get_unmapped_area */
	if (flags & MAP_FIXED_NOREPLACE)
		flags |= MAP_FIXED;

	/*
	 * Without MAP_FIXED the address is only a hint and may be changed:
	 * a hint below mmap_min_addr is replaced by the page-aligned
	 * mmap_min_addr.
	 */
	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows.. */
	/* Round the length up to a whole number of pages. */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Too many mappings? */
	/* Reject the request once the process exceeds its vma count limit. */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	/*
	 * get_unmapped_area() yields the start address of an unmapped range
	 * in the current process's user address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	/*
	 * Validate addr: a value that is not page aligned is passed back to
	 * the caller unchanged (presumably an error code from
	 * get_unmapped_area() - confirm).
	 */
	if (offset_in_page(addr))
		return addr;

	/*
	 * MAP_FIXED_NOREPLACE requires that the requested range does not
	 * overlap an existing vma; if the lookup finds an overlapping vma,
	 * -EEXIST is returned.
	 */
	if (flags & MAP_FIXED_NOREPLACE) {
		struct vm_area_struct *vma = find_vma(mm, addr);

		if (vma && vma->vm_start < addr + len)
			return -EEXIST;
	}

	if (prot == PROT_EXEC) {
		pkey = execute_only_pkey(mm);
		if (pkey < 0)
			pkey = 0;
	}

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	/*
	 * MAP_LOCKED asks for the range to be locked in memory, similar to
	 * mlock(); check that locking is permitted.
	 */
	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	if (mlock_future_check(mm, vm_flags, len))
		return -EAGAIN;

	if (file) {	/* file is non-NULL: mapping a file into the address space */
		struct inode *inode = file_inode(file);	/* inode of the file */
		unsigned long flags_mask;

		if (!file_mmap_ok(file, inode, pgoff, len))
			return -EOVERFLOW;

		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

		/*
		 * ... (code omitted from this excerpt)
		 * Depending on the mapping type given in flags, the access
		 * rights the file was opened with are taken into account.
		 * For a shared writable mapping, the file must have been
		 * opened for writing and not in append mode, and must not
		 * carry a mandatory lock.
		 * For every kind of mapping, the file must have been opened
		 * for reading.
		 * ...
		 */
	} else {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* We honor MAP_NORESERVE if allowed to overcommit */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}
do_mmap()根據用戶傳入的參數做了一系列的檢查,然後根據參數初始化vm_area_struct的標誌vm_flags,最後調用mmap_region()。mmap_region()通過vma->vm_file = get_file(file)建立文件與vma的映射:
/*
 * mmap_region() - create (or extend) the vma covering [addr, addr + len).
 *
 * Steps visible below: check the address-space limit, unmap whatever already
 * occupies the range, charge memory for accountable mappings, try to merge
 * the range into an existing vma, and otherwise allocate and initialise a new
 * vma (file-backed, shared without a file, or anonymous) and link it into mm.
 *
 * Returns addr on success (which a file's mmap method may have changed), or
 * a negative errno value on failure.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm; // memory descriptor of the current process
struct vm_area_struct *vma, *prev;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
nr_pages = count_vma_pages_range(mm, addr, addr + len);
if (!may_expand_vm(mm, vm_flags,
(len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
/* Check whether [addr, addr+len) already contains mappings; any overlapping mapping has to be munmap'ed first */
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
&rb_parent)) {
if (do_munmap(mm, addr, len, uf))
return -ENOMEM;
}
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
vm_flags |= VM_ACCOUNT;
}
/* Check whether the range [addr, addr+len) can be merged into an existing vma */
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
if (vma) /* merge succeeded: use the merged vma and jump to out */
goto out;
/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*/
/* Allocate a vma through the memory descriptor */
vma = vm_area_alloc(mm);
if (!vma) {
error = -ENOMEM;
goto unacct_error;
}
/* Initialise the vma */
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma->vm_pgoff = pgoff;
if (file) { /* a file mapping was requested */
if (vm_flags & VM_DENYWRITE) { /* the mapped file must not be written: deny_write_access(file) shuts out regular file writes */
error = deny_write_access(file);
if (error)
goto free_vma;
}
if (vm_flags & VM_SHARED) { /* the mapping is visible to other processes: mark the file's mapping as writable */
error = mapping_map_writable(file->f_mapping);
if (error)
goto allow_write_and_free_vma;
}
/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/
vma->vm_file = get_file(file); /* take a reference on the file and store it in the vma */
error = call_mmap(file, vma); /* invoke the mmap method supplied by the file's filesystem (discussed later in the article) */
if (error)
goto unmap_and_free_vma;
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/
WARN_ON_ONCE(addr != vma->vm_start);
addr = vma->vm_start;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
/* VM_SHARED is set but no file was given, so shmem_zero_setup() is called;
according to the article, the file actually mapped is /dev/zero
*/
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
} else {
/* Neither a file nor VM_SHARED: set the vma up as an anonymous mapping */
vma_set_anonymous(vma);
}
/* Add the newly allocated vma to mm's vma list */
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
}
file = vma->vm_file;
out:
perf_event_mmap(vma);
/* Update the accounting of the process's address space mm */
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
if (file)
uprobe_mmap(vma);
/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/
vma->vm_flags |= VM_SOFTDIRTY;
vma_set_page_prot(vma);
return addr;
/* Error unwinding: each label undoes one more step and falls through to the next label. */
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
if (vm_flags & VM_DENYWRITE)
allow_write_access(file);
free_vma:
vm_area_free(vma);
unacct_error:
if (charged)
vm_unacct_memory(charged);
return error;
}
mmap_region()
調用了call_mmap(file, vma)
: call_mmap
根據文件系統的類型選擇適配的mmap()
函數,我們選擇目前常用的ext4
:
ext4_file_mmap()是ext4對應的mmap,功能非常簡單:更新file的訪問時間(file_accessed(file)),並將對應的operation賦給vma->vm_ops:
三個操作函數的意義:
.fault: 處理Page Fault
.map_pages: 映射文件至Page Cache
.page_mkwrite: 修改文件的狀態爲可寫
通過分析mmap
的源碼我們發現在調用mmap()
的時候僅僅申請一個vm_area_struct
來建立文件與虛擬內存的映射,並沒有建立虛擬內存與物理內存的映射。Linux並不在調用mmap()
時就爲進程分配物理內存空間,直到進程首次真正訪問該地址空間、發現數據不在物理內存中時,觸發Page Fault
即缺頁中斷,Linux纔會將缺失的Page換入內存空間. 後面的文章我們會介紹Linux的缺頁(Page fault)處理和請求Page的機制