匿名頁面:
do_anonymous_page():
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
………………
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),---------------(1)
vma->vm_page_prot));
goto setpte;
………………
page = alloc_zeroed_user_highpage_movable(vma, address);---------------(2)
lru_cache_add_active_or_unevictable(page, vma);------------------(3)
setpte:
set_pte_at(mm, address, page_table, entry);----------------------(4)
}
(1)如果缺頁是由讀操作引起的(flags中沒有FAULT_FLAG_WRITE),且mm允許使用零頁面,則不分配新頁面,而是直接使用系統的零頁面(全填充爲零的頁面)生成只讀的PTE entry,零頁面在系統初始化時候已經初始化好了
(2)如果不是只讀,則正常調用alloc_zeroed_user_highpage_movable(),最後調用alloc_page()分配一個頁面
(3)將匿名頁面添加到LRU鏈表中。
(4)調用set_pte_at()設置到硬件頁表中。
具體流程圖如下:
文件映射缺頁中斷:
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))--------------------(1)
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
if (!(vma->vm_flags & VM_SHARED))----------------(2)
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);-------------(3)
}
(1)如果需要獲取的頁面不具備可寫屬性則執行do_read_fault().
(2)如果需要獲取的頁面具有可寫屬性,但爲私有頁面,則執行do_cow_fault().
(3)其他情況,共享頁面則執行do_shared_fault().
do_read_fault():
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);-----------(1)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
do_set_pte(vma, address, fault_page, pte, false, false);---------------(3)
}
(1)調用__do_fault()進而調用vm_ops->fault()函數來完成頁面的申請,vm_ops->fault函數主要由各個模塊自己實現,例如ext4文件系統的:
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
追蹤代碼最後發現申請頁面最後仍然使用的是alloc_pages()函數來實現。
(2)獲取當前頁表項pte
(3)將新生成的PTE entry設置到硬件頁表項中
do_cow_fault():
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
new_page = alloc_page_vma(gfp, vma, address);------------(1)
…………
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);------(2)
…………
if (fault_page)
copy_user_highpage(new_page, fault_page, address, vma);----------(3)
…………
pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------(4)
…………
do_set_pte(vma, address, new_page, pte, true, true);----------(5)
lru_cache_add_active_or_unevictable(new_page, vma);------------(6)
…………
}
(1)申請一個新的頁面。
(2)使用__do_fault通過vma->vm_ops->fault()將文件內容讀取到fault_page頁面。
(3)如果fault_page存在,則將fault_page的內容複製到new_page中。
(4)重新獲取異常地址對應的頁表項。
(5)將new_page對應的PTE entry設置到硬件頁表裏面.
(6)將new_page頁面添加到對應的LRU鏈表。
do_shared_fault():
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);------------(1)
if (vma->vm_ops->page_mkwrite) {
…………
tmp = do_page_mkwrite(vma, fault_page, address);------------(2)
…………
}
pte = pte_offset_map_lock(mm, pmd, address, &ptl);--------------(3)
…………
do_set_pte(vma, address, fault_page, pte, true, false);----------(4)
if (set_page_dirty(fault_page))---------(5)
dirtied = 1;
…………
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {------------(6)
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
…………
}
(1)讀取文件到fault_page中
(2)使頁面變爲可寫頁面(與do_read_fault()函數不同之處)
(3)獲取fault_page對應的pte
(4)將新生成的PTE entry設置到硬件頁表中
(5)將page標記爲dirty(與do_read_fault()函數不同之處)
(6)通過balance_dirty_pages_ratelimited()來平衡並回寫一部分髒頁。
寫時複製:
do_wp_page():
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte, unsigned int flags)
__releases(ptl)
{
…………
old_page = vm_normal_page(vma, address, orig_pte);--------------(1)
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(mm, vma, address, page_table, ptl,
orig_pte, pmd);-----------(2)
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page, gfp);---------(3)
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {----------(4)
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
page_cache_release(old_page);
return 0;
}
page_cache_release(old_page);
}
if (reuse_swap_page(old_page)) {-------------(5)
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);----------------(6)
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(mm, vma, address, page_table, pmd,
ptl, orig_pte, old_page);--------------(7)
}
/*
* Ok, we need to copy. Oh, well..
*/
page_cache_get(old_page);-----------(7)
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page, gfp);-----------------(8)
}
(1)獲取一個normal_mapping的頁面
(2)處理special mapping的情況,如果vma是可寫且共享,則調用wp_pfn_shared(),繼續使用這個頁面,不做寫時複製操作
(3)否則調用wp_page_copy()重新分配一個頁面進行寫時複製。
(4)處理Anon page且不是KSM的情況,主要是加鎖以及增加page引用計數。
(5)通過reuse_swap_page()函數判斷page的count值是否爲1,判斷頁面是否爲只有一個進程映射的匿名頁面,如果是則繼續使用此頁面,不做寫時複製操作。
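(6)reuse_swap_page()返回真時,調用wp_page_reuse()繼續使用該匿名頁面,不做寫時複製
(7)代碼中有兩處標記爲(7):第一處,對於不屬於(4)的頁面(例如page cache頁面),如果VMA可寫且共享,則調用wp_page_shared()繼續使用此頁面,不做寫時複製;第二處,在進行寫時複製之前,通過page_cache_get()增加page->count計數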
(8)此時需要寫時複製,調用wp_page_copy()完成操作。
wp_pfn_shared():
static int wp_pfn_shared(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
pmd_t *pmd)
{
…………
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
…………
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);---------(1)
…………
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
NULL, 0, 0);------------(3)
}
static inline int wp_page_reuse(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
struct page *page, int page_mkwrite,
int dirty_shared)
__releases(ptl)
{
…………
flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(4)
entry = pte_mkyoung(orig_pte);-------------(4)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);-------------(5)
if (dirty_shared) {-------(6)
…………
dirtied = set_page_dirty(page);-----------(7)
…………
if ((dirtied || page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);---------(8)
}
}
…………
}
(1)通知之前的只讀頁面變成了可寫屬性
(2)獲取頁面對應的PTE entry
(3)調用wp_page_reuse進一步設置頁面相關屬性
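(4)刷新頁面對應的cache,並通過pte_mkyoung()設置pte的訪問位(代碼中這兩行均標記爲(4))
(5)通過pte_mkdirty()設置pte的dirty位,並通過maybe_mkwrite()在VMA允許寫的情況下設置pte的可寫屬性
(6)判斷是否爲dirty_shared的情況
(7)如果是dirty_shared頁面,則調用set_page_dirty()設置頁面的dirty位
(8)通過balance_dirty_pages_ratelimited()平衡並回寫一部分髒頁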
以上只是根據字面意思來理解,不是很懂,大體上和缺頁中斷發生非寫時複製操作類似。
開始寫時複製的核心函數wp_page_copy():
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
pte_t orig_pte, struct page *old_page, gfp_t gfp)
{
…………
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage(gfp, vma, address);--------------(1)
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(gfp, vma, address);-----------(2)
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);----------------(3)
}
…………
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);-----------(4)
if (likely(pte_same(*page_table, orig_pte))) {
…………
flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(5)
entry = mk_pte(new_page, vma->vm_page_prot);---------------(6)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);---------------(7)
…………
page_add_new_anon_rmap(new_page, vma, address);------------(8)
lru_cache_add_active_or_unevictable(new_page, vma);---------------(9)
}
set_pte_at_notify(mm, address, page_table, entry);-------------(10)
update_mmu_cache(vma, address, page_table);-------------------(11)
}
(1)判斷old page是否爲零頁面,如果是則alloc_zeroed_user_highpage()分配一個全是零的頁面
(2)不是0頁面則分配一個新頁面new_page
(3)將舊頁面的內容複製到新頁面。
(4)重新獲取pte,並判斷pte是否被修改過
(5)刷新page對應的cache
(6)利用新頁面和VMA屬性重新生成一個PTE entry
(7)設置PTE entry的DIRTY和WRITABLE位
(8)把new_page添加到RMAP反向映射
(9)將new_page添加到對應的LRU鏈表
(10)將新生成的PTE entry設置到硬件頁表
(11)更新MMU cache。
總結一下,以上步驟基本上和file映射裏面發生寫時複製的操作大同小異。
具體流程圖如下: