前言
在先前關於虛擬內存的文章中,我提到了linux虛擬內存區域的結構,但它具體在linux中是如何表示與實現的呢?
我利用了linux2.6的源碼進行了淺顯的分析。
正文
task_struct
在linux中,進程控制塊即PCB的結構爲task_struct,我們以linux2.6爲例,其源碼如下:
struct task_struct {
	/*
	 * Current run state of the task.  volatile forces every read to go
	 * to memory rather than a register-cached copy, since the scheduler
	 * and other CPUs may change it asynchronously.  Besides -1/0, the
	 * ">0 stopped" values are further distinguished:
	 *   #define TASK_RUNNING          0
	 *   #define TASK_INTERRUPTIBLE    1
	 *   #define TASK_UNINTERRUPTIBLE  2
	 *   #define TASK_STOPPED          4
	 *   #define TASK_TRACED           8
	 *   #define EXIT_ZOMBIE           16
	 *   #define EXIT_DEAD             32
	 */
	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	/*
	 * Holds the frequently and quickly accessed fields of the process
	 * descriptor; the kernel relies on it to reach the current task's
	 * descriptor (thread_info itself contains a pointer back to this
	 * task_struct).
	 */
	struct thread_info *thread_info;
	atomic_t usage;
	/*
	 * Per-process flags describing the task's situation (not its run
	 * state), e.g. PF_SUPERPRIV means the task used super-user
	 * privileges.
	 */
	unsigned long flags;	/* per process flags, defined below */
	/* System-call tracing (ptrace) flags. */
	unsigned long ptrace;
	/* Big-kernel-lock depth (records whether the lock is held). */
	int lock_depth;		/* Lock depth */
	/* Dynamic and static scheduling priorities. */
	int prio, static_prio;
	struct list_head run_list;
	/* Runqueue priority array this task is queued on. */
	prio_array_t *array;
	/* Average sleep time of the task. */
	unsigned long sleep_avg;
	/*
	 * timestamp: time the task was last inserted into a runqueue, or
	 *            of its most recent process switch.
	 * last_ran:  time of the context switch that last replaced this task.
	 */
	unsigned long long timestamp, last_ran;
	/* Condition code used when the task was woken, i.e. which state it
	 * was woken from. */
	int activated;
	/* Scheduling policy of the task. */
	unsigned long policy;
	cpumask_t cpus_allowed;
	/*
	 * time_slice:       remaining time slice of the task.
	 * first_time_slice: 1 if the current slice is the first one, split
	 *                   off from the parent at creation time.
	 */
	unsigned int time_slice, first_time_slice;
#ifdef CONFIG_SCHEDSTATS
	struct sched_info sched_info;
#endif
	struct list_head tasks;
	/*
	 * ptrace_list/ptrace_children forms the list of my children
	 * that were stolen by a ptracer.
	 */
	struct list_head ptrace_children;
	struct list_head ptrace_list;
	/*
	 * mm:        memory descriptor holding the virtual-memory
	 *            information of the process address space.
	 * active_mm: address space borrowed by a kernel thread.
	 * (Both are pointers: the transcription "*mm,active_mm" dropped the
	 * '*' on active_mm, which would not compile.)
	 */
	struct mm_struct *mm, *active_mm;
/* task state */
	struct linux_binfmt *binfmt;
	/* Exit state of the task; >0 means zombie/dead. */
	long exit_state;
	/*
	 * exit_code:   exit code of the task.
	 * exit_signal: signal sent to the parent on exit; -1 for a
	 *              lightweight process (thread).
	 */
	int exit_code, exit_signal;
	int pdeath_signal;	/* The signal sent when the parent dies */
	/* ??? */
	unsigned long personality;
	/* Records whether the task has performed an execve() system call. */
	unsigned did_exec:1;
	/* Process ID. */
	pid_t pid;
	/* PID of the leader of the thread group this task belongs to. */
	pid_t tgid;
	/*
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively. (p->father can be replaced with
	 * p->parent->pid)
	 */
	struct task_struct *real_parent;	/* real parent process (when being debugged) */
	/* Current parent of this task. */
	struct task_struct *parent;	/* parent process */
	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
	/* Head of the list whose elements are the children this task created. */
	struct list_head children;	/* list of my children */
	/* Linkage connecting sibling tasks together. */
	struct list_head sibling;	/* linkage in my parent's children list */
	/* Leader of this task's thread group. */
	struct task_struct *group_leader;	/* threadgroup leader */
	/*
	 * Each task has several PIDs hashed into different slots of the PID
	 * hash table, making PID -> task lookup fast.
	 */
	/* PID/PID hash table linkage. */
	struct pid pids[PIDTYPE_MAX];
	/* Completion used by vfork() to wait for the child. */
	struct completion *vfork_done;	/* for vfork() */
	int __user *set_child_tid;	/* CLONE_CHILD_SETTID */
	int __user *clear_child_tid;	/* CLONE_CHILD_CLEARTID */
	/* Real-time priority of the task. */
	unsigned long rt_priority;
	/* Interval-timer and timing information follows. */
	unsigned long it_real_value, it_real_incr;
	cputime_t it_virt_value, it_virt_incr;
	cputime_t it_prof_value, it_prof_incr;
	struct timer_list real_timer;
	cputime_t utime, stime;
	unsigned long nvcsw, nivcsw;	/* context switch counts */
	struct timespec start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
/* process credentials */
	uid_t uid, euid, suid, fsuid;
	gid_t gid, egid, sgid, fsgid;
	struct group_info *group_info;
	kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
	unsigned keep_capabilities:1;
	struct user_struct *user;
#ifdef CONFIG_KEYS
	struct key *session_keyring;	/* keyring inherited over fork */
	struct key *process_keyring;	/* keyring private to this process (CLONE_THREAD) */
	struct key *thread_keyring;	/* keyring private to this thread */
#endif
	int oomkilladj;		/* OOM kill score adjustment (bit shift). */
	char comm[TASK_COMM_LEN];
/* file system info */
	int link_count, total_link_count;
/* ipc stuff */
	struct sysv_sem sysvsem;
/* CPU-specific state of this task */
	struct thread_struct thread;
/* filesystem information */
	/* Filesystem information, incl. where the executable image lives. */
	struct fs_struct *fs;
/* open file information */
	/* Files opened by the task. */
	struct files_struct *files;
/* namespace */
	struct namespace *namespace;
/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;
	sigset_t blocked, real_blocked;
	struct sigpending pending;
	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;
	void *security;
	struct audit_context *audit_context;
/* Thread group tracking */
	u32 parent_exec_id;
	u32 self_exec_id;
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
	spinlock_t alloc_lock;
/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
	spinlock_t proc_lock;
/* context-switch lock */
	spinlock_t switch_lock;
/* journalling filesystem info */
	void *journal_info;
/* VM state */
	struct reclaim_state *reclaim_state;
	struct dentry *proc_dentry;
	struct backing_dev_info *backing_dev_info;
	struct io_context *io_context;
	unsigned long ptrace_message;
	siginfo_t *last_siginfo;	/* For ptrace use. */
	/*
	 * current io wait handle: wait queue entry to use for io waits
	 * If this thread is processing aio, this points at the waitqueue
	 * inside the currently handled kiocb. It may be NULL (i.e. default
	 * to a stack based synchronous wait) if its doing sync IO.
	 */
	wait_queue_t *io_wait;
/* i/o counters(bytes read/written, #syscalls */
	u64 rchar, wchar, syscr, syscw;
#if defined(CONFIG_BSD_PROCESS_ACCT)
	u64 acct_rss_mem1;	/* accumulated rss usage */
	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
	clock_t acct_stimexpd;	/* clock_t-converted stime since last update */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *mempolicy;
	short il_next;
#endif
};
mm_struct
可以看到,task_struct具有很多字段,其包含了進程狀態、內存、調度、文件系統、時間分配等各種信息,上面我也只是給出了部分的註釋,抓住重點的虛擬內存描述符mm_struct,我們進一步查看其源碼:
struct mm_struct {
/* Head of the linked list of linear-area (VMA) objects. */
struct vm_area_struct * mmap; /* list of VMAs */
/* Red-black tree organizing the same linear-area objects. */
struct rb_root mm_rb;
/* Most recently referenced linear-area object (lookup cache). */
struct vm_area_struct * mmap_cache; /* last find_vma result */
/* Method for searching the process address space for a valid interval of linear addresses. */
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
/* Method invoked when a linear area is unmapped/released. */
void (*unmap_area) (struct vm_area_struct *area);
/* Identifies the linear address of the first allocated anonymous linear area or file memory mapping. */
unsigned long mmap_base; /* base of mmap area */
/* Address from which the kernel starts searching for a free interval of linear addresses in this address space. */
unsigned long free_area_cache; /* first hole */
/* Pointer to the page global directory (page tables). */
pgd_t * pgd;
atomic_t mm_users; /* How many users with user space? */
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
int map_count; /* number of VMAs */
struct rw_semaphore mmap_sem;
/* Spinlock protecting the linear areas and the page tables. */
spinlock_t page_table_lock; /* Protects page tables, mm->rss, mm->anon_rss */
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/
/* Start and end addresses of the various segments (code, data, heap, stack, args, environment). */
unsigned long start_code, end_code, start_data, end_data;
unsigned long start_brk, brk, start_stack;
unsigned long arg_start, arg_end, env_start, env_end;
unsigned long rss, anon_rss, total_vm, locked_vm, shared_vm;
unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
unsigned dumpable:1;
cpumask_t cpu_vm_mask;
/* Architecture-specific MM context */
mm_context_t context;
/* Token based thrashing protection. */
unsigned long swap_token_time;
char recent_pagein;
/* coredumping support */
int core_waiters;
struct completion *core_startup_done, core_done;
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
struct kioctx default_kioctx;
unsigned long hiwater_rss; /* High-water RSS usage */
unsigned long hiwater_vm; /* High-water virtual memory usage */
};
其中這裏要講到的字段爲mmap,其指向線性區對象的鏈表頭,而對於pgd,其指向該進程的頁表。
在地址空間中,mmap指向由內存區域(用vm_area_struct結構來表示)組成的鏈表,mm_rb則以紅黑樹的形式組織同一批VMA:兩種結構是同時維護的,鏈表便於順序遍歷全部虛擬區,紅黑樹便於按地址快速查找某個虛擬區,因此在虛擬區數量很多時查找效率更高。所有的mm_struct結構體通過自身的mmlist域鏈接在一個雙向鏈表上,該鏈表的首元素是init_mm內存描述符,代表init進程的地址空間。
vm_area_struct
對於mmap指向的vm_area_struct,我們繼續深入源碼:
struct vm_area_struct {
	/* The address space (mm_struct) this VMA belongs to. */
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	/* First address of this virtual memory area. */
	unsigned long vm_start;	/* Our start address within vm_mm. */
	/* End of the area: first byte after the last valid address. */
	unsigned long vm_end;	/* The first byte after our end address
					   within vm_mm. */
	/* Next member of the per-task VMA list. */
	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;
	pgprot_t vm_page_prot;	/* Access permissions of this VMA. */
	/* VMA flag bits. */
	unsigned long vm_flags;	/* Flags, listed below. */
	/* Node inserting this VMA into the red-black tree. */
	struct rb_node vm_rb;
	/* ... remaining fields omitted from this excerpt ... */
};	/* note: the excerpt was missing the terminating ';' */
至此可以看出,進程的虛擬地址空間是由一個個vm_area_struct結構體所描述的區域,經鏈表(及紅黑樹)組織而成的空間。
因此,結合個人的理解,及對計算機原理的認識,我嘗試模擬作出了linux中虛擬內存的結構圖,如下:
但實際上,並非每個vm_area_struct對應指向內存的每個段(每個segment可能由多個VMA組成),對於vm_area_struct,其描述的是一段連續的、具有相同訪問屬性的虛存空間,該虛存空間的大小爲物理內存頁面的整數倍。
雜談
以上的示意圖及分析是結合自身的理解所談,有所班門弄斧的味道,如有不足之處還望指出;
同時,近期學得越多,發現自己不會的也越多。無論是在計算機系統上,還是在linux源碼上,有太多的知識點我未曾涉獵,還是需要多學習、多總結、多思考。