用戶空間缺頁異常pte_handle_fault()分析--(下)--寫時複製

在pte_handle_fault()中，如果觸發異常的頁存在於主存中，那麼該異常往往是由寫了一個只讀頁觸發的，此時需要進行COW(寫時複製操作)。如當一個父進程通過fork()創建了一個子進程時，子進程將會共享父進程的頁框。之後，無論是父進程還是子進程要對相應的內存進行寫操作，都要進行COW，也就是爲自己重新分配一個頁框，並把之前的數據複製到頁框中去，再寫。

static inline int handle_pte_fault(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, pmd_t *pmd, unsigned int flags)
{
	pte_t entry;
	spinlock_t *ptl;

	entry = *pte;

	...
	...
	...
	/********頁在主存中的情況***********/
	
	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	if (unlikely(!pte_same(*pte, entry)))
		goto unlock;
	if (flags & FAULT_FLAG_WRITE) {//異常由寫訪問觸發
		if (!pte_write(entry))//而對應的頁是不可寫的
			return do_wp_page(mm, vma, address, //此時必須進行寫時複製的操作
					pte, pmd, ptl, entry);
		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
		update_mmu_cache(vma, address, entry);
	} else {
		/*
		 * This is needed only for protection faults but the arch code
		 * is not yet telling us if this is a protection fault or not.
		 * This still avoids useless tlb flushes for .text page faults
		 * with threads.
		 */
		if (flags & FAULT_FLAG_WRITE)
			flush_tlb_page(vma, address);
	}
unlock:
	pte_unmap_unlock(pte, ptl);
	return 0;
}

可以看到，hand_pte_fault()函數處理頁存在於主存中的情況的關鍵操作都集中在do_wp_page()函數上。該函數是用來處理COW的，不過在COW之前先要做一些檢查，比如說，如果對應的頁只有一個進程使用，那麼便可以直接修改頁的權限爲可讀可寫，而不進行COW。總之，不到不得以的情況下是不會進行COW的。

static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte)
{
	struct page *old_page, *new_page;
	pte_t entry;
	int reuse = 0, ret = 0;
	int page_mkwrite = 0;
	struct page *dirty_page = NULL;

	old_page = vm_normal_page(vma, address, orig_pte);//獲取共享頁
	if (!old_page) {//獲取共享頁失敗
		/*
		 * VM_MIXEDMAP !pfn_valid() case
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable as we can't do any dirty
		 * accounting on raw pfn maps.
		 */
		 /*如果vma的映射本來就是共享且可寫的，則跳轉至reuse直接使用orig_pte對應的頁*/
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			goto reuse;
		/*否則跳轉至gotten分配一個頁*/
		goto gotten;
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	 /*下面首先判斷匿名頁的情況，如果old_page是匿名頁，並且只有一個進程使用它(reuse爲1)，則
	    則直接使用該頁*/
	if (PageAnon(old_page) && !PageKsm(old_page)) {
		/*這裏先判斷是否有其他進程競爭，修改了頁表*/
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}
			page_cache_release(old_page);
		}
		/*確定沒有其他進程競爭，則進行reuse判斷，通過reuse_swap_page()函數判斷
		 old_page的_mapcount字段是否爲0，是的話則表明只有一個進程使用該匿名頁*/
		reuse = reuse_swap_page(old_page);
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {//如果vma的映射本來就是共享且可寫的
		/*
		 * Only catch write-faults on shared writable pages,
		 * read-only shared pages can get COWed by
		 * get_user_pages(.write=1, .force=1).
		 */
		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
			struct vm_fault vmf;
			int tmp;

			vmf.virtual_address = (void __user *)(address &
								PAGE_MASK);
			vmf.pgoff = old_page->index;
			vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
			vmf.page = old_page;

			/*
			 * Notify the address space that the page is about to
			 * become writable so that it can prohibit this or wait
			 * for the page to get into an appropriate state.
			 *
			 * We do this without the lock held, so that it can
			 * sleep if it needs to.
			 */
			page_cache_get(old_page);//增加old_page的引用計數作爲保護
			pte_unmap_unlock(page_table, ptl);

			/*這裏通知即將修改頁的權限*/
			tmp = vma->vm_ops->page_mkwrite(vma, &vmf);

			/*如果無法修改的話，則跳轉到unwritable_page*/
			if (unlikely(tmp &
					(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
				ret = tmp;
				goto unwritable_page;
			}
			if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
				lock_page(old_page);
				if (!old_page->mapping) {
					ret = 0; /* retry the fault */
					unlock_page(old_page);
					goto unwritable_page;
				}
			} else
				VM_BUG_ON(!PageLocked(old_page));

			/*
			 * Since we dropped the lock we need to revalidate
			 * the PTE as someone else may have changed it.  If
			 * they did, we just return, as we can count on the
			 * MMU to tell us if they didn't also make it writable.
			 */
			 /*走到這裏表示已經成功修改了頁的權限了，這裏同樣重新獲取頁表，判斷是否和之前一致*/
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				page_cache_release(old_page);
				goto unlock;
			}

			page_mkwrite = 1;
		}
		dirty_page = old_page;
		get_page(dirty_page);
		reuse = 1;
	}

	if (reuse) {//reuse處理，也就是說不進行COW，可以直接在old_page上進行寫操作
reuse:
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = pte_mkyoung(orig_pte);//標記_PAGE_ACCESSED位
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);//將頁的權限修改爲可讀可寫，並且標記爲髒頁
		if (ptep_set_access_flags(vma, address, page_table, entry,1))
			update_mmu_cache(vma, address, entry);
		ret |= VM_FAULT_WRITE;
		goto unlock;
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	 /***************終於走到了不得已的一步了，下面只好進行COW了********************/
	page_cache_get(old_page);
gotten:
	pte_unmap_unlock(page_table, ptl);

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	if (is_zero_pfn(pte_pfn(orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma, address);//分配一個零頁面
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);//分配一個非零頁面
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);//將old_page中的數據拷貝到new_page
	}
	__SetPageUptodate(new_page);

	/*
	 * Don't let another task, with possibly unlocked vma,
	 * keep the mlocked page.
	 */
	if ((vma->vm_flags & VM_LOCKED) && old_page) {
		lock_page(old_page);	/* for LRU manipulation */
		clear_page_mlock(old_page);
		unlock_page(old_page);
	}

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto oom_free_new;

	/*
	 * Re-check the pte - we dropped the lock
	 */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, orig_pte))) {
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter(mm, file_rss);
				inc_mm_counter(mm, anon_rss);
			}
		} else
			inc_mm_counter(mm, anon_rss);
		flush_cache_page(vma, address, pte_pfn(orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);//獲取new_page的pte
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);//修改new_page的權限
		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry. This will avoid a race condition
		 * seen in the presence of one thread doing SMC and another
		 * thread doing COW.
		 */
		ptep_clear_flush(vma, address, page_table);
		page_add_new_anon_rmap(new_page, vma, address);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		set_pte_at_notify(mm, address, page_table, entry);
		update_mmu_cache(vma, address, entry);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptp_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page);
		}

		/* Free the old page.. */
		new_page = old_page;
		ret |= VM_FAULT_WRITE;
	} else
		mem_cgroup_uncharge_page(new_page);

	if (new_page)
		page_cache_release(new_page);
	if (old_page)
		page_cache_release(old_page);
unlock:
	pte_unmap_unlock(page_table, ptl);
	if (dirty_page) {
		/*
		 * Yes, Virginia, this is actually required to prevent a race
		 * with clear_page_dirty_for_io() from clearing the page dirty
		 * bit after it clear all dirty ptes, but before a racing
		 * do_wp_page installs a dirty pte.
		 *
		 * do_no_page is protected similarly.
		 */
		if (!page_mkwrite) {
			wait_on_page_locked(dirty_page);
			set_page_dirty_balance(dirty_page, page_mkwrite);
		}
		put_page(dirty_page);
		if (page_mkwrite) {
			struct address_space *mapping = dirty_page->mapping;

			set_page_dirty(dirty_page);
			unlock_page(dirty_page);
			page_cache_release(dirty_page);
			if (mapping)	{
				/*
				 * Some device drivers do not set page.mapping
				 * but still dirty their pages
				 */
				balance_dirty_pages_ratelimited(mapping);
			}
		}

		/* file_update_time outside page_lock */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
	}
	return ret;
oom_free_new:
	page_cache_release(new_page);
oom:
	if (old_page) {
		if (page_mkwrite) {
			unlock_page(old_page);
			page_cache_release(old_page);
		}
		page_cache_release(old_page);
	}
	return VM_FAULT_OOM;

unwritable_page:
	page_cache_release(old_page);
	return ret;
}

用戶空間缺頁異常pte_handle_fault()分析--(下)--寫時複製

Linux Slub分配器(四)--分配對象

第一個Linux驅動-流水燈

Linux Slab分配器(六)--創建slab和銷燬slab

Linux Slub分配器(五)--釋放對象

Linux Slub分配器(二)--初始化

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結