前面簡單的分析了內核處理用戶空間缺頁異常的流程,進入到了handle_mm_fault()函數,該函數爲觸發缺頁異常的地址address分配各級的頁目錄,也就是說現在已經擁有了一個和address配對的pte了,但是這個pte如何去映射物理頁框,內核又得根據pte的狀態進行分類和判斷,而這個過程又會牽扯出一些其他的概念……這也是初讀linux內核源碼的最大障礙吧,在一些複雜的處理中,一個點往往可以延伸出一個面,容易讓人迷失方向……因此後面打算分幾次將這個函數分析完,自己也沒有完全理解透,所以不到位的地方歡迎大家指出,一起交流~
/*
 * handle_pte_fault - pick the handler for a fault on one page-table entry.
 *
 * Only the "page not present" half of the function is reproduced in this
 * excerpt; the rest of the body is elided ("...") below.
 *
 * @mm:      address space the fault occurred in
 * @vma:     memory region that contains @address
 * @address: faulting virtual address
 * @pte:     page-table entry for @address; a snapshot is taken into 'entry'
 * @pmd:     page middle directory entry covering @address
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * For a not-present pte the return value is whatever the selected
 * do_*_fault()/do_*_page() helper returns.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
entry = *pte;
if (!pte_present(entry)) {// the page is not resident in main memory
if (pte_none(entry)) {// the pte is empty: the process has never touched this page
/* If both vm_ops and its fault callback are set, this is a file-backed mapping */
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
/* Otherwise hand out an anonymous page */
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
/* The pte is not empty and is marked as a nonlinear file mapping entry */
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
/* Not resident, but the pte still carries information: the page was swapped out and must be swapped back in */
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
...
...
}
首先要確定的一點就是pte對應的頁是否駐留在主存中,因爲pte有可能之前映射了頁,但是該頁被換出了。上面的代碼給出了pte對應的頁沒有駐留在主存中的情況。如果pte對應的頁沒有駐留在主存中,且沒有映射任何頁,即pte_present()返回0,pte_none()返回非0(真),則要判斷要分配一個匿名頁還是一個映射頁。在Linux虛擬內存中,如果頁對應的vma映射的是文件,則稱爲映射頁,如果不是映射的文件,則稱爲匿名頁。兩者最大的區別體現在頁和vma的組織上,因爲在頁框回收處理時要通過頁來逆向搜索映射了該頁的vma。對於匿名頁的逆映射,vma都是通過vma結構體中的anon_vma_node(鏈表節點)和anon_vma(鏈表頭)組織起來,再把該鏈表頭的信息保存在頁描述符中;而映射頁和vma的組織是通過vma中的優先樹節點和頁描述符中的mapping->i_mmap優先樹樹根進行組織的,具體可以參看ULK3。
來看基於文件的映射的處理:
/*
 * do_linear_fault - fault handler for an empty pte in a linearly mapped file vma.
 *
 * Derives the file page offset that corresponds to @address from the vma
 * geometry (vm_start / vm_pgoff), drops the pte mapping passed in by the
 * caller and lets __do_fault() do the real work.
 *
 * Returns whatever __do_fault() returns.
 */
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	/* Page-aligned faulting address. */
	unsigned long page_addr = address & PAGE_MASK;
	pgoff_t pgoff;

	/* Page index inside the vma ... */
	pgoff = (page_addr - vma->vm_start) >> PAGE_SHIFT;
	/* ... shifted by the offset at which the vma starts in the file. */
	pgoff += vma->vm_pgoff;

	/* Release the temporary kernel mapping of the pte, if one was set up. */
	pte_unmap(page_table);

	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
關鍵函數__do_fault():
/*
 * __do_fault - obtain a page from vma->vm_ops->fault() and map it at @address.
 *
 * @mm:       address space being faulted in
 * @vma:      region containing @address; its vm_ops->fault callback is invoked
 * @address:  faulting virtual address
 * @pmd:      page middle directory entry used to look the pte up again
 * @pgoff:    file page offset, passed to the fault callback in vmf.pgoff
 * @flags:    FAULT_FLAG_* bits; FAULT_FLAG_WRITE selects the write paths below
 * @orig_pte: pte value seen by the caller; the new pte is installed only if
 *            the entry still equals this value once the pte lock is held
 *
 * For a write fault on a private (non-VM_SHARED) mapping a fresh page is
 * allocated and the faulted page is copied into it; for a write fault on a
 * shared mapping the optional ->page_mkwrite() callback is called first.
 *
 * Returns the bits reported by ->fault() (or by ->page_mkwrite() on its
 * error path), VM_FAULT_OOM when preparing/allocating/charging the private
 * copy fails, or VM_FAULT_HWPOISON when the faulted page is hardware-poisoned.
 */
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
pte_t entry;
int anon = 0;
int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);// call the vma's fault callback; it is expected to bring the file data in and hand the page back in vmf.page
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
return VM_FAULT_HWPOISON;
}
/*
* For consistency in subsequent calls, make the faulted page always
* locked.
*/
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
VM_BUG_ON(!PageLocked(vmf.page));
/*
* Should we do an early C-O-W break?
*/
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {// write access
if (!(vma->vm_flags & VM_SHARED)) {// private mapping: make a private copy of the page
anon = 1;// from here on 'page' is treated as an anonymous page
if (unlikely(anon_vma_prepare(vma))) {// make sure the vma has an anon_vma
ret = VM_FAULT_OOM;
goto out;
}
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,// allocate the page that will hold the copy
vma, address);
if (!page) {
ret = VM_FAULT_OOM;
goto out;
}
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
page_cache_release(page);
goto out;
}
charged = 1;
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if (vma->vm_flags & VM_LOCKED)
clear_page_mlock(vmf.page);
/* Copy the contents of the faulted page into the newly allocated page */
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
/*
* If the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable
*/
if (vma->vm_ops->page_mkwrite) {
int tmp;
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
page_mkwrite = 1;
}
}
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if FAULT_FLAG_WRITE is set, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {// no race: the pte still holds the value the caller saw
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);// build a pte value that points at the page
/* On a write fault mark the pte dirty and, via maybe_mkwrite(), possibly writable */
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* A privately copied page is accounted as anonymous and added to the anon reverse mapping */
if (anon) {
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);// set up the reverse mapping for the new anonymous page
} else {
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);// add the file page to the reverse mapping
if (flags & FAULT_FLAG_WRITE) {
dirty_page = page;
get_page(dirty_page);
}
}
set_pte_at(mm, address, page_table, entry);// install the new pte value in the page table
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
anon = 1; /* no anon but release faulted_page */
}
pte_unmap_unlock(page_table, ptl);
out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
if (set_page_dirty(dirty_page))
page_mkwrite = 1;
unlock_page(dirty_page);
put_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
if (anon)
page_cache_release(vmf.page);
}
return ret;
unwritable_page:
page_cache_release(page);
return ret;
}
首先要做的就是調用vma->vm_ops中定義好的fault()函數,將所需的數據從文件讀入到映射頁中。需要注意的是,vma並不是在fault()中才被插入到文件mapping->i_mmap優先樹中的,而是早在mmap建立映射時就已經插入了。
文件一般以共享的方式進行映射,接下來就要判斷觸發異常的操作是否包含寫操作,如果是寫操作並且該vma不是以共享的方式映射該頁,則要進行寫時複製,也就是創建一個新的頁來供該vma讀寫,此時會申請一個匿名頁,並將數據拷貝到該匿名頁中。
接下來就要計算出page對應的pte值是多少,並將page_table指向的pte以該值進行填充,這樣就完成了頁表項到物理頁的映射
再來看分配匿名頁的處理
/*
 * do_anonymous_page - handle a fault on an empty pte in a vma without a
 * usable ->fault callback (see the dispatch in handle_pte_fault()).
 *
 * @mm:         address space being faulted in
 * @vma:        region containing @address
 * @address:    faulting virtual address
 * @page_table: pte mapping from the caller; unmapped immediately and looked
 *              up again under the pte lock later
 * @pmd:        page middle directory entry used for that second lookup
 * @flags:      FAULT_FLAG_* bits; FAULT_FLAG_WRITE decides between the shared
 *              zero page and a freshly allocated page
 *
 * Returns 0 when a pte was installed or when the pte was found to be already
 * populated, VM_FAULT_SIGBUS if the stack guard page check fails, and
 * VM_FAULT_OOM if preparing, allocating or charging the new page fails.
 */
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
struct page *page;
spinlock_t *ptl;
pte_t entry;
pte_unmap(page_table);
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGBUS;
/* Use the zero-page for reads */
/* For a read, point the entry at the existing zero-filled page: this is the
first access to the page, so its contents do not matter, and the allocation
of a real page is postponed further */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* For a write, a new page has to be allocated */
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))// make sure the vma has an anon_vma
goto oom;
/* Allocate a zero-filled page */
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
__SetPageUptodate(page);
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
goto oom_free_page;
/* Build the pte value for the new page */
entry = mk_pte(page, vma->vm_page_prot);
/* If the vma permits writes (VM_WRITE), make the pte writable and dirty */
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);// set up the reverse mapping between the vma and the anonymous page
setpte:
set_pte_at(mm, address, page_table, entry);// install the pte value in the page table
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);// update the MMU cache
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
mem_cgroup_uncharge_page(page);
page_cache_release(page);
goto unlock;
oom_free_page:
page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
匿名頁分配的工作和__do_fault()中分配匿名頁差不多,只不過前面多了一個讀寫的判斷,如果是讀的話,不會分配匿名頁,而是讓pte指向一個被0填充的頁,這樣就進一步推遲了頁的分配。也許你會覺得奇怪,既然要讀數據怎麼可以分配一個事先準備好的全0的頁,其實仔細想想就會明白,缺頁異常處理進行到這裏,一定是第一次訪問相應的內存時纔會觸發,匿名頁對應的一般都是堆,棧這些區域,對這些區域的訪問一定先是寫而不是讀,所以對於這種操作本身就不正常,分配一個被0填充的頁使用戶進程讀出來的都是0也許會更安全一些。
如果不是這兩種情況的話,也就是說pte_none()返回的是0,那就說明pte之前映射過頁,只是該頁已被換出
如果該頁之前是用來進行非線性文件映射的話,其處理的主體函數就是上面介紹過的__do_fault()
/*
 * do_nonlinear_fault - fault handler for a not-present pte that is marked as
 * a nonlinear file mapping entry.
 *
 * Tags the fault with FAULT_FLAG_NONLINEAR, decodes the file page offset
 * stored in @orig_pte and passes the request on to __do_fault().
 *
 * Returns 0 when pte_unmap_same() reports a mismatch (presumably the pte
 * changed since the caller sampled it - confirm against pte_unmap_same()),
 * VM_FAULT_SIGBUS when the vma is not flagged VM_NONLINEAR, and otherwise
 * the result of __do_fault().
 */
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	int ret;

	flags |= FAULT_FLAG_NONLINEAR;

	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
		return 0;

	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
		/*
		 * A nonlinear pte inside a vma that is not nonlinear:
		 * page table corrupted, show the pte and kill the process.
		 */
		print_bad_pte(vma, address, orig_pte, NULL);
		ret = VM_FAULT_SIGBUS;
	} else {
		/* The file offset of the page is encoded in the pte itself. */
		ret = __do_fault(mm, vma, address, pmd,
				 pte_to_pgoff(orig_pte), flags, orig_pte);
	}

	return ret;
}
pte_to_pgoff()這個函數是和pgoff_to_pte()相對的一組操作。在非線性文件映射的頁被換出時,其映射文件的偏移會以PAGE_SIZE爲單位進行編碼,存儲到其pte中,所以當要重新換入該頁時,要進行相應的解碼計算出pgoff,再由__do_fault()進行處理!
對於頁沒有駐留在主存的情況中的最後一種處理方式,do_swap_page(),留在下次再做分析!