Linux Process Source Code: vm_area_struct and Its Relationship to Virtual Memory

Preface

In the earlier article on virtual memory, I mentioned the structure of Linux's virtual memory areas, but how exactly are they represented and implemented in Linux?

Here I use the Linux 2.6 source code to give a brief analysis.

Main Text

task_struct

In Linux, the process control block (PCB) is the task_struct structure. Taking Linux 2.6 as an example, its source code is as follows:

struct task_struct {
    //Current run state of the process.
    //volatile forces the value to be read from memory rather than from a possibly stale copy cached in a register.
    //state basically falls into three groups (-1 unrunnable, 0 runnable, >0 stopped), but the >0 case is further divided into several distinct states:
    //#define TASK_RUNNING		0
    //#define TASK_INTERRUPTIBLE	1
    //#define TASK_UNINTERRUPTIBLE	2
    //#define TASK_STOPPED		4
    //#define TASK_TRACED		8
    //#define EXIT_ZOMBIE		16
    //#define EXIT_DEAD		32
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
    //This structure holds the frequently accessed fields of the process descriptor that must be reached quickly; the kernel relies on it to obtain the current process's descriptor. As the source shows, thread_info itself contains a pointer back to the task_struct.
	struct thread_info *thread_info;
    
	atomic_t usage;
    //Per-process flags describing the process's condition (not its run state), e.g. PF_SUPERPRIV means the process has superuser privileges.
	unsigned long flags;	/* per process flags, defined below */
    //Flags used for ptrace (system-call and process tracing)
	unsigned long ptrace;
	//Kernel lock depth (whether the big kernel lock is held, and how deeply)
	int lock_depth;		/* Lock depth */
	//Dynamic and static priority of the process
	int prio, static_prio;
	struct list_head run_list;
    //Priority array of the run queue this process is queued on
	prio_array_t *array;
	//Average sleep time of the process, used by the scheduler
	unsigned long sleep_avg;
    //timestamp: when the process was last inserted into a run queue, or the time of its most recent context switch
    //last_ran: time of the context switch that last replaced this process
	unsigned long long timestamp, last_ran;
    //Condition code recording how the process was woken up, i.e. from which state it was awakened
	int activated;
	//Scheduling policy of the process
	unsigned long policy;
	cpumask_t cpus_allowed;
    //time_slice: remaining time slice of the process
    //first_time_slice: 1 if this is the first time slice after creation, i.e. it was split off from the parent's time slice
	unsigned int time_slice, first_time_slice;

#ifdef CONFIG_SCHEDSTATS
	struct sched_info sched_info;
#endif

	struct list_head tasks;
	/*
	 * ptrace_list/ptrace_children forms the list of my children
	 * that were stolen by a ptracer.
	 */
	struct list_head ptrace_children;
	struct list_head ptrace_list;
	//mm: memory descriptor holding the virtual memory information of the process address space
    //active_mm: the address space a kernel thread borrows while it runs
	struct mm_struct *mm, *active_mm;

/* task state */
	struct linux_binfmt *binfmt;
    //Exit state of the process (EXIT_ZOMBIE or EXIT_DEAD once it has terminated)
	long exit_state;
    //exit_code: the exit code of the process
    //exit_signal: signal sent to the parent when the process exits; -1 for lightweight processes (threads)
	int exit_code, exit_signal;
	int pdeath_signal;  /*  The signal sent when the parent dies  */
	/* ??? */
	unsigned long personality;
    //Records whether the process has performed an execve() system call
	unsigned did_exec:1;
    //Process ID
	pid_t pid;
    //PID of the leader of the thread group this process belongs to
	pid_t tgid;
	/* 
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively.  (p->father can be replaced with 
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
    //Pointer to the current parent of this process
	struct task_struct *parent;	/* parent process */
	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
    //Head of the list whose elements are the children created by this process
	struct list_head children;	/* list of my children */
	//Linkage connecting sibling processes
	struct list_head sibling;	/* linkage in my parent's children list */
	//Leader of the thread group this process belongs to
	struct task_struct *group_leader;	/* threadgroup leader */
	//Each process has four PIDs (one per PID type); hanging them at different slots of the PID hash table makes the lookup from a PID to its task fast
	/* PID/PID hash table linkage. */
	struct pid pids[PIDTYPE_MAX];
    //Completion used by vfork(): the parent waits on it for the child
	struct completion *vfork_done;		/* for vfork() */
	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
	//Real-time priority of the process
	unsigned long rt_priority;
    //The following fields hold interval-timer and time-accounting information
	unsigned long it_real_value, it_real_incr;
	cputime_t it_virt_value, it_virt_incr;
	cputime_t it_prof_value, it_prof_incr;
	struct timer_list real_timer;
	cputime_t utime, stime;
	unsigned long nvcsw, nivcsw; /* context switch counts */
	struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
/* process credentials */
	uid_t uid,euid,suid,fsuid;
	gid_t gid,egid,sgid,fsgid;
	struct group_info *group_info;
	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
	unsigned keep_capabilities:1;
	struct user_struct *user;
#ifdef CONFIG_KEYS
	struct key *session_keyring;	/* keyring inherited over fork */
	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
	struct key *thread_keyring;	/* keyring private to this thread */
#endif
	int oomkilladj; /* OOM kill score adjustment (bit shift). */
	char comm[TASK_COMM_LEN];
/* file system info */
	int link_count, total_link_count;
/* ipc stuff */
	struct sysv_sem sysvsem;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
    //Filesystem context of the process (its root directory and current working directory)
	struct fs_struct *fs;
/* open file information */
    //Files opened by the process
	struct files_struct *files;
/* namespace */
	struct namespace *namespace;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;

	sigset_t blocked, real_blocked;
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;
	
	void *security;
	struct audit_context *audit_context;

/* Thread group tracking */
   	u32 parent_exec_id;
   	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
	spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
	spinlock_t proc_lock;
/* context-switch lock */
	spinlock_t switch_lock;

/* journalling filesystem info */
	void *journal_info;

/* VM state */
	struct reclaim_state *reclaim_state;

	struct dentry *proc_dentry;
	struct backing_dev_info *backing_dev_info;

	struct io_context *io_context;

	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use.  */
/*
 * current io wait handle: wait queue entry to use for io waits
 * If this thread is processing aio, this points at the waitqueue
 * inside the currently handled kiocb. It may be NULL (i.e. default
 * to a stack based synchronous wait) if its doing sync IO.
 */
	wait_queue_t *io_wait;
/* i/o counters(bytes read/written, #syscalls */
	u64 rchar, wchar, syscr, syscw;
#if defined(CONFIG_BSD_PROCESS_ACCT)
	u64 acct_rss_mem1;	/* accumulated rss usage */
	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
#endif
#ifdef CONFIG_NUMA
  	struct mempolicy *mempolicy;
	short il_next;
#endif
};

mm_struct

As you can see, task_struct has a great many fields covering process state, memory, scheduling, filesystems, time accounting, and more; the comments above only annotate part of them. Focusing on the key piece here, the memory descriptor mm_struct, let us look further at its source code:

struct mm_struct {
    //Head of the linked list of linear region (VMA) objects
	struct vm_area_struct * mmap;		/* list of VMAs */
    //Root of the red-black tree of linear region objects
	struct rb_root mm_rb;
    //The most recently referenced linear region object
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
    //Method for searching the process address space for a free linear address range
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
    //Method invoked when a linear region is released
	void (*unmap_area) (struct vm_area_struct *area);
    //Linear address of the first allocated anonymous region or file memory mapping (base of the mmap area)
	unsigned long mmap_base;		/* base of mmap area */
    //Address from which the kernel starts searching for free linear address ranges in the process address space
	unsigned long free_area_cache;		/* first hole */
    //Pointer to the page global directory (the top-level page table)
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	//Spinlock protecting the page tables and the rss/anon_rss counters
	spinlock_t page_table_lock;		/* Protects page tables, mm->rss, mm->anon_rss */

	struct list_head mmlist;		/* List of maybe swapped mm's.  These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */

    //Start and end addresses of the code, data, heap, stack, argument and environment areas
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;
	unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;

	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */

	unsigned dumpable:1;
	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Token based thrashing protection. */
	unsigned long swap_token_time;
	char recent_pagein;

	/* coredumping support */
	int core_waiters;
	struct completion *core_startup_done, core_done;

	/* aio bits */
	rwlock_t		ioctx_list_lock;
	struct kioctx		*ioctx_list;

	struct kioctx		default_kioctx;

	unsigned long hiwater_rss;	/* High-water RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */
};

The fields to highlight here are mmap, which points to the head of the linked list of linear region objects (VMAs), and pgd, which points to the process's page global directory, i.e. its top-level page table.

Within the address space, mmap is the linked list of the address space's memory regions (each represented by a vm_area_struct), while mm_rb holds the same regions in a red-black tree. The list is the convenient form for walking every region in address order; the red-black tree is the convenient form for looking a region up. The kernel keeps both in parallel: mmap points to the head of the list and mm_rb points to the root of the tree, so that even when a process has a large number of VMAs, finding the one that covers a given address stays efficient. All mm_struct structures are linked together on a doubly linked list through their mmlist field; the first element of that list is the init_mm memory descriptor, which represents the address space of the init process.
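
To make this organization concrete, here is a minimal, self-contained userspace sketch. The types vma and mm and the helper find_vma_sketch below are simplified stand-ins invented for illustration, not the kernel's real structures; the lookup only mirrors the contract of the kernel's find_vma (return the first region whose vm_end lies above the given address), and it walks the list linearly, whereas the kernel performs the same search over the mm_rb red-black tree.

/*
 * Illustrative userspace sketch only: these structs are heavily
 * simplified stand-ins for vm_area_struct / mm_struct, keeping just
 * the fields discussed above.
 */
#include <stdio.h>

struct vma {                        /* stand-in for vm_area_struct */
	unsigned long vm_start;     /* first address of the region */
	unsigned long vm_end;       /* first byte after the region */
	struct vma *vm_next;        /* next region, sorted by address */
};

struct mm {                         /* stand-in for mm_struct */
	struct vma *mmap;           /* head of the VMA list */
};

/*
 * Mirrors the contract of the kernel's find_vma(): return the first
 * VMA whose vm_end is above addr (it may or may not contain addr).
 * The real kernel searches mm_rb, the red-black tree, for O(log n).
 */
static struct vma *find_vma_sketch(struct mm *mm, unsigned long addr)
{
	struct vma *vma;

	for (vma = mm->mmap; vma; vma = vma->vm_next)
		if (addr < vma->vm_end)
			return vma;
	return NULL;
}

int main(void)
{
	/* three regions roughly resembling code, data and stack */
	struct vma stack = { 0x7fff0000UL, 0x80000000UL, NULL };
	struct vma data  = { 0x00600000UL, 0x00602000UL, &stack };
	struct vma text  = { 0x00400000UL, 0x00401000UL, &data };
	struct mm mm     = { &text };

	struct vma *hit = find_vma_sketch(&mm, 0x00601234UL);
	if (hit)
		printf("0x601234 falls in [%#lx, %#lx)\n",
		       hit->vm_start, hit->vm_end);
	return 0;
}

Compiled with a plain gcc invocation, this prints the region the sample address falls in; the point is simply that the singly linked vm_next chain alone is enough to resolve an address, which is why the kernel layers the red-black tree on top purely for lookup speed.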

vm_area_struct

For the vm_area_struct that mmap points to, let us dig further into the source:

struct vm_area_struct {
    //Points back to the mm_struct (address space) this VMA belongs to
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	//Start address of this virtual memory region
	unsigned long vm_start;		/* Our start address within vm_mm. */
	//End of this virtual memory region (the first byte after it)
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

    //Next member of the per-task VMA list
	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	//Flag bits of this VMA
	unsigned long vm_flags;		/* Flags, listed below. */
	//Node that links this VMA into the red-black tree
	struct rb_node vm_rb;
	...
};

At this point we can see that a process's virtual memory is a space assembled from individual vm_area_struct structures chained together into a linked list.
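
This can also be observed from userspace. The short demo below is only a sketch, assuming a Linux system with /proc mounted; dump_maps is a local helper invented here for printing /proc/self/maps. It creates two anonymous mappings with mmap(), and each one then shows up as its own line, i.e. its own vm_area_struct, in /proc/self/maps. The two mappings are given different protections on purpose so the kernel cannot merge them into a single VMA.

/*
 * Userspace demonstration (Linux): every mmap() call that cannot be
 * merged with a neighbouring region appears as its own VMA in
 * /proc/self/maps.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* local helper: copy /proc/self/maps to stdout */
static void dump_maps(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return;
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	/* different protections, so these two cannot be merged */
	void *a = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *b = mmap(NULL, 4 * page, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (a == MAP_FAILED || b == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}
	printf("mapping a (rw-): %p, mapping b (r--): %p\n", a, b);
	dump_maps();	/* a and b each appear as a separate line/VMA */
	return 0;
}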

Therefore, combining my own understanding with what I know of computer organization, I attempted to sketch the structure of virtual memory in Linux, shown below:
[Figure: sketch of a process's virtual address space organized as a chain of vm_area_struct regions]
In reality, though, vm_area_structs do not correspond one-to-one to the segments of memory (a single segment may be made up of several VMAs). What a vm_area_struct describes is a contiguous range of virtual memory with uniform access attributes, and the size of that range is always an integer multiple of the physical page size.
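
That splitting behaviour is easy to provoke. In the sketch below (again assuming a Linux system with /proc mounted; dump_range is an illustrative helper that filters /proc/self/maps to one address range), a three-page anonymous mapping starts out as a single read-write VMA, and calling mprotect() on just the middle page forces the kernel to split it into three page-aligned VMAs, because one vm_area_struct can only describe pages with identical access attributes.

/*
 * Userspace demonstration (Linux): changing the protection of part of
 * a mapping splits one VMA into several, since a single
 * vm_area_struct only covers pages with the same access attributes.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

/* local helper: print the /proc/self/maps lines covering [start, start+len) */
static void dump_range(void *start, size_t len)
{
	unsigned long lo = (unsigned long)start, hi = lo + len;
	char line[256];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f)) {
		unsigned long s, e;

		if (sscanf(line, "%lx-%lx", &s, &e) == 2 && s < hi && e > lo)
			fputs(line, stdout);
	}
	fclose(f);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	puts("before mprotect (one rw- VMA):");
	dump_range(p, 3 * page);

	/* make only the middle page read-only */
	if (mprotect((char *)p + page, page, PROT_READ) != 0) {
		perror("mprotect");
		return EXIT_FAILURE;
	}

	puts("after mprotect (split into rw- / r-- / rw- VMAs):");
	dump_range(p, 3 * page);
	return 0;
}

On a typical Linux system this prints one maps line before the mprotect() call and three afterwards, with the middle entry marked r-- and every boundary falling on a page boundary, matching the description above.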

Closing Remarks

The diagram and analysis above are based on my own understanding, so there is a whiff of showing off in front of experts; please point out anything I have gotten wrong or left out.

At the same time, the more I have learned recently, the more I realize how much I still do not know. Whether in computer systems or in the Linux source code, there are far too many topics I have never touched; I need to keep studying, summarizing, and thinking.
