閱讀本文需要linux文件系統基礎知識。以下引用的kernel源代碼,都是基於linux kernel源代碼版本:3.4。
以下分析基於下面的假設:
根設備使用SD卡設備(塊設備),根文件系統使用ext4文件系統。
linux kernel在初始化的最後階段,會加載“根文件系統”,按照前面的假設,也就是加載一個ext4文件系統作爲根文件系統,這個文件系統位於SD卡上。
在加載這個根文件系統前,kernel會先加載一個虛擬的根文件系統,名叫rootfs文件系統,這個rootfs文件系統纔是kernel真正意義上mount的第一個文件系統。它是基於內存的文件系統(並沒有對應一個非易失性存儲設備,而像ext, fat這些文件系統都會對應一個外部存儲設備)。
這個rootfs 文件系統會mount到自己的根目錄 “/” 上,mount完畢後會將自己的根目錄設爲進程的根目錄。現在假設這個rootfs文件系統已經mount好了(暫不分析rootfs文件系統的mount過程)。
kernel在安裝根文件系統前會先安裝根設備節點:/dev/root。ext4根文件系統必須存在於一個根設備上,比如硬盤,SD/MMC卡。根設備節點用來描述指定的根設備。
kernel在kernel_init函數中會調用prepare_namespace函數(init/do_mounts.c),prepare_namespace函數先從啓動參數中獲得根設備名稱,保存在root_device_name中,這裏名稱就是/dev/mmcblk0p1。
調用mount_root掛載根文件系統。在mount_root函數中就會創建設備節點了。
void __init prepare_namespace(void)
{
int is_floppy;
if (root_delay) {
printk(KERN_INFO "Waiting %dsec before mounting root device...\n",
root_delay);
ssleep(root_delay);
}
/*
* wait for the known devices to complete their probing
*
* Note: this is a potential source of long boot delays.
* For example, it is not atypical to wait 5 seconds here
* for the touchpad of a laptop to initialize.
*/
wait_for_device_probe();
md_run_setup();
if (saved_root_name[0]) {
root_device_name = saved_root_name;
if (!strncmp(root_device_name, "mtd", 3) ||
!strncmp(root_device_name, "ubi", 3)) {
mount_block_root(root_device_name, root_mountflags);
goto out;
}
printk(KERN_INFO "-----ROOT_DEV_NAME----: %s\n", root_device_name);
ROOT_DEV = name_to_dev_t(root_device_name); //拿到設備名稱後調用name_to_dev_t轉換爲設備號;
if (strncmp(root_device_name, "/dev/", 5) == 0)
root_device_name += 5; //跳過 ‘/dev/’;
}
if (initrd_load())
goto out;
/* wait for any asynchronous scanning to complete */
if ((ROOT_DEV == 0) && root_wait) {
printk(KERN_INFO "Waiting for root device %s...\n",
saved_root_name);
while (driver_probe_done() != 0 ||
(ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
msleep(100);
async_synchronize_full();
}
is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;
if (is_floppy && rd_doload && rd_load_disk(0))
ROOT_DEV = Root_RAM0;
mount_root(); //安裝根文件系統;
out:
devtmpfs_mount("dev");
sys_mount(".", "/", NULL, MS_MOVE, NULL);
sys_chroot((const char __user __force *)".");
}
mount_root(): (init/do_mounts.c)
void __init mount_root(void)
{
#ifdef CONFIG_ROOT_NFS
if (ROOT_DEV == Root_NFS) {
if (mount_nfs_root())
return;
printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n");
ROOT_DEV = Root_FD0;
}
#endif
#ifdef CONFIG_BLK_DEV_FD
if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
/* rd_doload is 2 for a dual initrd/ramload setup */
if (rd_doload==2) {
if (rd_load_disk(1)) {
ROOT_DEV = Root_RAM1;
root_device_name = NULL;
}
} else
change_floppy("root floppy");
}
#endif
#ifdef CONFIG_BLOCK
create_dev("/dev/root", ROOT_DEV); //創建根設備節點;
mount_block_root("/dev/root", root_mountflags);
#endif
}
這個函數在create_dev函數中創建根設備節點。create_dev函數實際調用了mknod系統調用對應的內核函數sys_mknod。
參數爲: name = "/dev/root", mode = S_IFBLK|0600,dev = xxx;
static inline int create_dev(char *name, dev_t dev)
{
sys_unlink(name);
return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
}
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
return sys_mknodat(AT_FDCWD, filename, mode, dev);
}
最後實際是調用sys_mknodat函數(fs/namei.c),SYSCALL_DEFINE4是函數定義的宏,展開後就變成sys_mknodat函數。
sys_mknodat函數調用user_path_create得到要創建節點的path,節點父目錄的信息保存在path中,然後調用vfs_mknod創建節點。
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
unsigned, dev)
{
struct dentry *dentry;
struct path path;
int error;
if (S_ISDIR(mode))
return -EPERM;
dentry = user_path_create(dfd, filename, &path, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
if (!IS_POSIXACL(path.dentry->d_inode))
mode &= ~current_umask();
error = may_mknod(mode);
if (error)
goto out_dput;
error = mnt_want_write(path.mnt);
if (error)
goto out_dput;
error = security_path_mknod(&path, dentry, mode, dev);
if (error)
goto out_drop_write;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
break;
case S_IFCHR: case S_IFBLK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,
new_decode_dev(dev));
break;
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
break;
}
out_drop_write:
mnt_drop_write(path.mnt);
out_dput:
dput(dentry);
mutex_unlock(&path.dentry->d_inode->i_mutex);
path_put(&path);
return error;
}
user_path_create: (fs/namei.c)
簡單的調用了kern_path_create函數後返回。此時參數分別爲dfd = AT_FDCWD,tmp = "/dev/root",path爲輸出參數,is_dir = 0;
struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
{
char *tmp = getname(pathname);
struct dentry *res;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
res = kern_path_create(dfd, tmp, path, is_dir);
putname(tmp);
return res;
}
kern_path_create函數先調用do_path_lookup搜索路徑,搜索方式是搜索最後一個目錄項的父目錄。然後再調用lookup_hash搜索目錄最後一項,也就是root項。
struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct nameidata nd;
int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
if (error)
return ERR_PTR(error);
/*
* Yucky last component or no last component at all?
* (foo/., foo/.., /////)
*/
if (nd.last_type != LAST_NORM)
goto out;
nd.flags &= ~LOOKUP_PARENT;
nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
nd.intent.open.flags = O_EXCL;
/*
* Do the final lookup.
*/
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
if (IS_ERR(dentry))
goto fail;
if (dentry->d_inode)
goto eexist;
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
dput(dentry);
dentry = ERR_PTR(-ENOENT);
goto fail;
}
*path = nd.path;
return dentry;
eexist:
dput(dentry);
dentry = ERR_PTR(-EEXIST);
fail:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out:
path_put(&nd.path);
return dentry;
}
do_path_lookup函數如下(fs/namei.c)。
調用path_lookupat進行搜索。加上LOOKUP_RCU參數。vfs文件系統路徑搜索主要有兩種mode,rcu-walk和 ref-walk。 這兩種方式在源代碼目錄 Documentation/filesystems/path-lookup.txt 的文檔中有詳細說明。
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
retval = path_lookupat(dfd, name, flags, nd);
if (unlikely(retval == -ESTALE))
retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
if (likely(!retval)) {
if (unlikely(!audit_dummy_context())) {
if (nd->path.dentry && nd->inode)
audit_inode(name, nd->path.dentry);
}
}
return retval;
}
path_lookupat函數先初始化路徑,可以看作將設置路徑的根(搜索的起點)。VFS中目錄項是用dentry和inode描述的,這裏可以看作設置根路徑對應的dentry。
static int path_lookupat(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
struct file *base = NULL;
struct path path;
int err;
/*
* Path walking is largely split up into 2 different synchronisation
* schemes, rcu-walk and ref-walk (explained in
* Documentation/filesystems/path-lookup.txt). These share much of the
* path walk code, but some things particularly setup, cleanup, and
* following mounts are sufficiently divergent that functions are
* duplicated. Typically there is a function foo(), and its RCU
* analogue, foo_rcu().
*
* -ECHILD is the error number of choice (just to avoid clashes) that
* is returned if some aspect of an rcu-walk fails. Such an error must
* be handled by restarting a traditional ref-walk (which will always
* be able to complete).
*/
err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); //找到搜索的起點,保存在nd中;
if (unlikely(err))
return err;
current->total_link_count = 0;
err = link_path_walk(name, nd); //從起點開始路徑的搜索,其中nd用來返回搜索結果;
if (!err && !(flags & LOOKUP_PARENT)) {
err = lookup_last(nd, &path);
while (err > 0) {
void *cookie;
struct path link = path;
nd->flags |= LOOKUP_PARENT;
err = follow_link(&link, nd, &cookie);
if (!err)
err = lookup_last(nd, &path);
put_link(nd, &link, cookie);
}
}
if (!err)
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
if (!nd->inode->i_op->lookup) {
path_put(&nd->path);
err = -ENOTDIR;
}
}
if (base)
fput(base);
if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
path_put(&nd->root);
nd->root.mnt = NULL;
}
return err;
}
主要調用set_root_rcu設置根路徑,將根路徑保存在nd->root。
static int path_init(int dfd, const char *name, unsigned int flags,
struct nameidata *nd, struct file **fp)
{
int retval = 0;
int fput_needed;
struct file *file;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED;
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
struct inode *inode = nd->root.dentry->d_inode;
if (*name) {
if (!inode->i_op->lookup)
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
return retval;
}
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
br_read_lock(vfsmount_lock);
rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} else {
path_get(&nd->path);
}
return 0;
}
nd->root.mnt = NULL;
if (*name=='/') {
if (flags & LOOKUP_RCU) {
br_read_lock(vfsmount_lock);
rcu_read_lock();
set_root_rcu(nd); //設置根路徑
} else {
set_root(nd);
path_get(&nd->root);
}
nd->path = nd->root; //搜索的初始路徑設爲根路徑
} else if (dfd == AT_FDCWD) {
if (flags & LOOKUP_RCU) {
struct fs_struct *fs = current->fs;
unsigned seq;
br_read_lock(vfsmount_lock);
rcu_read_lock();
do {
seq = read_seqcount_begin(&fs->seq);
nd->path = fs->pwd;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
}
} else {
struct dentry *dentry;
file = fget_raw_light(dfd, &fput_needed);
retval = -EBADF;
if (!file)
goto out_fail;
dentry = file->f_path.dentry;
if (*name) {
retval = -ENOTDIR;
if (!S_ISDIR(dentry->d_inode->i_mode))
goto fput_fail;
retval = inode_permission(dentry->d_inode, MAY_EXEC);
if (retval)
goto fput_fail;
}
nd->path = file->f_path;
if (flags & LOOKUP_RCU) {
if (fput_needed)
*fp = file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
br_read_lock(vfsmount_lock);
rcu_read_lock();
} else {
path_get(&file->f_path);
fput_light(file, fput_needed);
}
}
nd->inode = nd->path.dentry->d_inode; //設置根目錄的inode
return 0;
fput_fail:
fput_light(file, fput_needed);
out_fail:
return retval;
}
用當前進程的根路徑設置nd->root。
static __always_inline void set_root_rcu(struct nameidata *nd)
{
if (!nd->root.mnt) {
struct fs_struct *fs = current->fs;
unsigned seq;
do {
seq = read_seqcount_begin(&fs->seq);
nd->root = fs->root; //設置nd的root爲當前進程fs的root;
nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
} while (read_seqcount_retry(&fs->seq, seq));
}
}
path_init返回後,調用link_path_walk正式開始搜索,搜索的結果會保存在struct nameidata結構體nd中。
struct nameidata {
struct path path;
struct qstr last; //保存最後一個目錄項
struct path root;
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
unsigned seq;
int last_type;
unsigned depth;
char *saved_names[MAX_NESTED_LINKS + 1];
/* Intent data */
union {
struct open_intent open;
} intent;
};
如果路徑名以‘/’開頭,就把它跳過去,因爲在這種情況下nd->path已經指向本進程的根目錄了。如果路徑名中僅僅包含‘/’字符的話,那麼其搜索目標就是根目錄,所以任務完成。
調用hash_name,hash_name函數計算第一個目錄項的hash值並返回目錄項長度,對於本例中的/dev/root,第一個目錄項是dev,長度爲3。用name和len構造struct qstr結構體。若發現是最後一個目錄項,就將struct qstr賦給nd->last, 然後直接返回,若不是最後一項,就調用walk_component搜索這個目錄項。
static int link_path_walk(const char *name, struct nameidata *nd)
{
struct path next;
int err;
while (*name=='/')
name++;
if (!*name)
return 0;
/* At this point we know we have a real path component. */
for(;;) {
struct qstr this;
long len;
int type;
err = may_lookup(nd);
if (err)
break;
len = hash_name(name, &this.hash);
this.name = name;
this.len = len;
type = LAST_NORM;
if (name[0] == '.') switch (len) {
case 2:
if (name[1] == '.') {
type = LAST_DOTDOT;
nd->flags |= LOOKUP_JUMPED;
}
break;
case 1:
type = LAST_DOT;
}
if (likely(type == LAST_NORM)) {
struct dentry *parent = nd->path.dentry;
nd->flags &= ~LOOKUP_JUMPED;
if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
err = parent->d_op->d_hash(parent, nd->inode,
&this);
if (err < 0)
break;
}
}
if (!name[len]) //若是最後一項,就直接返回;
goto last_component;
/*
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
do {
len++;
} while (unlikely(name[len] == '/'));
if (!name[len])
goto last_component;
name += len; //運行到這裏,表示當前節點爲中間節點;
err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); //搜索目錄項;
if (err < 0)
return err;
if (err) {
err = nested_symlink(&next, nd);
if (err)
return err;
}
if (can_lookup(nd->inode))
continue;
err = -ENOTDIR;
break;
/* here ends the main loop */
last_component:
nd->last = this;
nd->last_type = type;
return 0;
}
terminate_walk(nd);
return err;
}
權限檢查。
static inline int may_lookup(struct nameidata *nd)
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
if (unlazy_walk(nd, NULL))
return -ECHILD;
}
return inode_permission(nd->inode, MAY_EXEC);
}
hash_name返回第一個目錄項的長度。
static inline unsigned long hash_name(const char *name, unsigned int *hashp)
{
unsigned long hash = init_name_hash(); // #define init_name_hash() 0;
unsigned long len = 0, c;
c = (unsigned char)*name;
do {
len++;
hash = partial_name_hash(c, hash); // 計算並返回哈希值;
c = (unsigned char)name[len];
} while (c && c != '/');
*hashp = end_name_hash(hash); // return hash;
return len;
}
計算hash值;
static inline unsigned long
partial_name_hash(unsigned long c, unsigned long prevhash)
{
return (prevhash + (c << 4) + (c >> 4)) * 11;
}
walk_component: (fs/namei.c)
調用do_lookup函數搜索目錄項的inode。如果找不到inode節點,則終止搜索。如果這個inode節點被mount上另一個文件系統,更新搜索路徑。
static inline int walk_component(struct nameidata *nd, struct path *path,
struct qstr *name, int type, int follow)
{
struct inode *inode;
int err;
/*
* "." and ".." are special - ".." especially so because it has
* to be able to know about the current root directory and
* parent relationships.
*/
if (unlikely(type != LAST_NORM))
return handle_dots(nd, type);
err = do_lookup(nd, name, path, &inode);
if (unlikely(err)) {
terminate_walk(nd);
return err;
}
if (!inode) {
path_to_nameidata(path, nd);
terminate_walk(nd);
return -ENOENT;
}
if (should_follow_link(inode, follow)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) {
terminate_walk(nd);
return -ECHILD;
}
}
BUG_ON(inode != path->dentry->d_inode);
return 1;
}
path_to_nameidata(path, nd); //將path的內容轉化到nd中;
nd->inode = inode;
return 0;
}
do_lookup: (fs/namei.c) 做實際的路徑搜索工作。
do_lookup先獲取要搜索目錄項的父目錄parent,根據LOOPUP_RCU標誌,調用__d_lookup_rcu或者__d_lookup。假設沒有LOOKUP_RCU標誌,則進入__d_lookup函數,若找到則__d_lookup返回找到的dentry,用找到的dentry給path->dentry賦值,將dentry對應的dentry->d_inode賦給輸出參數inode,inode有可能爲NULL。
static int do_lookup(struct nameidata *nd, struct qstr *name,
struct path *path, struct inode **inode)
{
struct vfsmount *mnt = nd->path.mnt;
struct dentry *dentry, *parent = nd->path.dentry;
int need_reval = 1;
int status = 1;
int err;
/*
* Rename seqlock is not required here because in the off chance
* of a false negative due to a concurrent rename, we're going to
* do the non-racy lookup, below.
*/
if (nd->flags & LOOKUP_RCU) {
unsigned seq;
*inode = nd->inode;
dentry = __d_lookup_rcu(parent, name, &seq, inode);
if (!dentry)
goto unlazy;
/* Memory barrier in read_seqcount_begin of child is enough */
if (__read_seqcount_retry(&parent->d_seq, nd->seq))
return -ECHILD;
nd->seq = seq;
if (unlikely(d_need_lookup(dentry)))
goto unlazy;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
if (status != -ECHILD)
need_reval = 0;
goto unlazy;
}
}
path->mnt = mnt;
path->dentry = dentry;
if (unlikely(!__follow_mount_rcu(nd, path, inode)))
goto unlazy;
if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
goto unlazy;
return 0;
unlazy:
if (unlazy_walk(nd, dentry))
return -ECHILD;
} else {
dentry = __d_lookup(parent, name);
}
if (unlikely(!dentry))
goto need_lookup;
if (unlikely(d_need_lookup(dentry))) {
dput(dentry);
goto need_lookup;
}
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
status = d_revalidate(dentry, nd);
if (unlikely(status <= 0)) {
if (status < 0) {
dput(dentry);
return status;
}
if (!d_invalidate(dentry)) {
dput(dentry);
goto need_lookup;
}
}
done:
path->mnt = mnt;
path->dentry = dentry;
err = follow_managed(path, nd->flags);
if (unlikely(err < 0)) {
path_put_conditional(path, nd);
return err;
}
if (err)
nd->flags |= LOOKUP_JUMPED;
*inode = path->dentry->d_inode;
return 0;
need_lookup:
BUG_ON(nd->inode != parent->d_inode);
mutex_lock(&parent->d_inode->i_mutex);
dentry = __lookup_hash(name, parent, nd);
mutex_unlock(&parent->d_inode->i_mutex);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
goto done;
}
__d_lookup: (fs/dcache.c)
在內存中尋找該節點已經建立的dentry結構,內核中有個hash表dentry_hashtable,是一個struct hlist_bl_head結構的指針數組,每一個struct hlist_bl_head結構包含有一個隊列頭指針struct hlist_bl_node*。
一旦在內存中建立起一個目錄節點的dentry結構,就根據其節點名的hash值和父目錄的dentry值,調用d_hash獲得dentry_hashtable中的一個隊列頭指針,然後將dentry成員變量dentry->d_hash掛入dentry_hashtable表中的這個隊列,需要尋找時會根據hash值查找dentry_hashtable表。
將dentry->d_hash掛入dentry_hashtable的過程在d_rehash函數中完成,後面分析。
這裏__d_lookup先調用d_hash從dentry_hashtable表中獲得這個隊列。遍歷這個隊列,看看隊列中有沒有目錄項的hash值和要搜索的目錄項的hash值相同的,如果有,再比較parent是否是同一個parent,最後調用dentry_cmp進一步比較是否是同一個目錄項,如果是,表示找到了,就增加引用計數並將其返回。如果沒有找到,返回NULL。
struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
const unsigned char *str = name->name;
struct hlist_bl_head *b = d_hash(parent, hash);
struct hlist_bl_node *node;
struct dentry *found = NULL;
struct dentry *dentry;
/*
* Note: There is significant duplication with __d_lookup_rcu which is
* required to prevent single threaded performance regressions
* especially on architectures where smp_rmb (in seqcounts) are costly.
* Keep the two functions in sync.
*/
/*
* The hash list is protected using RCU.
*
* Take d_lock when comparing a candidate dentry, to avoid races
* with d_move().
*
* It is possible that concurrent renames can mess up our list
* walk here and result in missing our dentry, resulting in the
* false-negative result. d_lookup() protects against concurrent
* renames using rename_lock seqlock.
*
* See Documentation/filesystems/path-lookup.txt for more details.
*/
rcu_read_lock();
hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { //遍歷散列
const char *tname;
int tlen;
if (dentry->d_name.hash != hash)
continue;
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
if (d_unhashed(dentry))
goto next;
/*
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
*/
tlen = dentry->d_name.len;
tname = dentry->d_name.name;
if (parent->d_flags & DCACHE_OP_COMPARE) {
if (parent->d_op->d_compare(parent, parent->d_inode,
dentry, dentry->d_inode,
tlen, tname, name))
goto next;
} else {
if (dentry_cmp(tname, tlen, str, len))
goto next;
}
dentry->d_count++;
found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
spin_unlock(&dentry->d_lock);
}
rcu_read_unlock();
return found;
}
d_hash爲散列函數,它將parent與hash值相結合,映射到dentry_hashtable表中,返回相應的散列鏈。
static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
hash = hash + (hash >> D_HASHBITS);
return dentry_hashtable + (hash & D_HASHMASK);
}
__d_lookup然後遍歷剛剛得到的隊列上的每個元素。hlist_bl_for_each_entry_rcu實現遍歷。
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \
for (pos = hlist_bl_first_rcu(head); \
pos && \
({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
pos = rcu_dereference_raw(pos->next))
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
return (struct hlist_bl_node *)
((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK);
}
struct hlist_bl_head {
struct hlist_bl_node *first;
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
container_of返回包含d_hash的struct dentry結構體;container_of是linux內核常用的宏,能根據結構體內部成員的指針返回包含這個成員的結構體的指針。
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)
執行到這裏,就層層返回到do_lookup,用找到的dentry給path賦值,接着調用follow_managed函數 (fs/namei.c)。
follow_managed處理一些有特殊用途的目錄項。比如標記爲mount點的目錄項,如果目錄項是一個mount點,那就切換到另一個文件系統,將path->dentry重新設爲新文件系統的根。
最後,do_lookup函數將找到的dentry->d_inode賦值給輸出參數inode,返回。
static int follow_managed(struct path *path, unsigned flags)
{
struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
unsigned managed;
bool need_mntput = false;
int ret = 0;
/* Given that we're not holding a lock here, we retain the value in a
* local variable for each dentry as we look at it so that we don't see
* the components of that value change under us */
while (managed = ACCESS_ONCE(path->dentry->d_flags),
managed &= DCACHE_MANAGED_DENTRY,
unlikely(managed != 0)) {
/* Allow the filesystem to manage the transit without i_mutex
* being held. */
if (managed & DCACHE_MANAGE_TRANSIT) {
BUG_ON(!path->dentry->d_op);
BUG_ON(!path->dentry->d_op->d_manage);
ret = path->dentry->d_op->d_manage(path->dentry, false);
if (ret < 0)
break;
}
/* Transit to a mounted filesystem. */
if (managed & DCACHE_MOUNTED) {
struct vfsmount *mounted = lookup_mnt(path);
if (mounted) {
dput(path->dentry);
if (need_mntput)
mntput(path->mnt);
path->mnt = mounted;
path->dentry = dget(mounted->mnt_root);
need_mntput = true;
continue;
}
/* Something is mounted on this dentry in another
* namespace and/or whatever was mounted there in this
* namespace got unmounted before we managed to get the
* vfsmount_lock */
}
/* Handle an automount point */
if (managed & DCACHE_NEED_AUTOMOUNT) {
ret = follow_automount(path, flags, &need_mntput);
if (ret < 0)
break;
continue;
}
/* We didn't change the current path point */
break;
}
if (need_mntput && path->mnt == mnt)
mntput(path->mnt);
if (ret == -EISDIR)
ret = 0;
return ret < 0 ? ret : need_mntput;
}
返回到walk_component函數,檢查inode是不是NULL,如果是則終止搜索。
調用path_to_nameidata函數,path_to_nameidata函數將path的內容轉化到nd裏面去。將找到的inode節點賦給nd->inode。
static inline void path_to_nameidata(const struct path *path,
struct nameidata *nd)
{
if (!(nd->flags & LOOKUP_RCU)) {
dput(nd->path.dentry);
if (nd->path.mnt != path->mnt)
mntput(nd->path.mnt);
}
nd->path.mnt = path->mnt;
nd->path.dentry = path->dentry;
}
返回到link_path_walk函數,繼續循環搜索下一個目錄項。若發現是最後一個目錄項,就將struct qstr this賦給nd->last, 然後直接返回到path_lookupat函數。
如果link_path_walk返回0,調用complete_walk函數完成搜索 (fs/namei.c)。
static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
if (nd->flags & LOOKUP_RCU) {
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
spin_lock(&dentry->d_lock);
if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
spin_unlock(&dentry->d_lock);
rcu_read_unlock();
br_read_unlock(vfsmount_lock);
return -ECHILD;
}
BUG_ON(nd->inode != dentry->d_inode);
spin_unlock(&dentry->d_lock);
mntget(nd->path.mnt);
rcu_read_unlock();
br_read_unlock(vfsmount_lock);
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
return 0;
if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
return 0;
/* Note: we do not d_invalidate() */
status = d_revalidate(dentry, nd);
if (status > 0)
return 0;
if (!status)
status = -ESTALE;
path_put(&nd->path);
return status;
}
層層返回到kern_path_create,調用lookup_hash函數 (fs/namei.c)進行最後一個目錄項的搜索。
static struct dentry *lookup_hash(struct nameidata *nd)
{
return __lookup_hash(&nd->last, nd->path.dentry, nd);
}
調用__lookup_hash函數進行搜索,__lookup_hash調用lookup_dcache搜索dcache,若找不到則lookup_real調用對應文件系統的lookup函數去查找。
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *base, struct nameidata *nd)
{
bool need_lookup;
struct dentry *dentry;
dentry = lookup_dcache(name, base, nd, &need_lookup);
if (!need_lookup)
return dentry;
return lookup_real(base->d_inode, dentry, nd);
}
調用d_lookup搜索。
static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
struct nameidata *nd, bool *need_lookup)
{
struct dentry *dentry;
int error;
*need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
if (d_need_lookup(dentry)) {
*need_lookup = true;
} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, nd);
if (unlikely(error <= 0)) {
if (error < 0) {
dput(dentry);
return ERR_PTR(error);
} else if (!d_invalidate(dentry)) {
dput(dentry);
dentry = NULL;
}
}
}
}
if (!dentry) {
dentry = d_alloc(dir, name);
if (unlikely(!dentry))
return ERR_PTR(-ENOMEM);
*need_lookup = true;
}
return dentry;
}
d_lookup進一步調用__d_lookup。__d_lookup上面已經分析過了,就是在內存的dentry_hashtable表中查找dentry結構。這裏/dev下的root還沒有創建,所以肯定找不到,返回NULL。層層返回到lookup_dcache,如果d_lookup返回的dentry爲空,則調用d_alloc分配一個dentry對象。
struct dentry *d_lookup(struct dentry *parent, struct qstr *name)
{
struct dentry *dentry;
unsigned seq;
do {
seq = read_seqbegin(&rename_lock);
dentry = __d_lookup(parent, name);
if (dentry)
break;
} while (read_seqretry(&rename_lock, seq));
return dentry;
}
d_alloc: (fs/dcache.c)
調用__d_alloc分配一個dcache對象並初始化。設置dentry的父目錄,並將新的dentry鏈入其父目錄的d_subdirs鏈表中。
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
struct dentry *dentry = __d_alloc(parent->d_sb, name);
if (!dentry)
return NULL;
spin_lock(&parent->d_lock);
/*
* don't need child lock because it is not subject
* to concurrency here
*/
__dget_dlock(parent);
dentry->d_parent = parent;
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
spin_unlock(&parent->d_lock);
return dentry;
}
__d_alloc: (fs/dcache.c)
分配並初始化一個dcache對象。
struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry;
char *dname;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
return NULL;
if (name->len > DNAME_INLINE_LEN-1) {
dname = kmalloc(name->len + 1, GFP_KERNEL);
if (!dname) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
} else {
dname = dentry->d_iname;
}
dentry->d_name.name = dname;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
memcpy(dname, name->name, name->len);
dname[name->len] = 0;
dentry->d_count = 1;
dentry->d_flags = 0;
spin_lock_init(&dentry->d_lock);
seqcount_init(&dentry->d_seq);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
INIT_LIST_HEAD(&dentry->d_alias);
INIT_LIST_HEAD(&dentry->d_u.d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
this_cpu_inc(nr_dentry);
printk("__d_alloc: dentry name: %s\n", dentry->d_name.name);
return dentry;
}
返回到lookup_dcache中,將need_lookup置爲真,向上返回到__lookup_hash,__lookup_hash最後調用lookup_real通過文件系統自己的lookup函數從設備讀目錄信息。
lookup_real: (fs/namei.c)
在緩存中無法找到指定的目錄項,那就創建該目錄項。通過底層文件系統的lookup函數從設備讀,lookup在讀目錄項的過程中,也把該目錄項對應的inode結構從磁盤讀出來,並賦值給dentry的d_inode字段。
static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct dentry *old;
/* Don't create child dentry for a dead directory. */
if (unlikely(IS_DEADDIR(dir))) {
dput(dentry);
return ERR_PTR(-ENOENT);
}
old = dir->i_op->lookup(dir, dentry, nd);
if (unlikely(old)) {
dput(dentry);
dentry = old;
}
return dentry;
}
simple_lookup: (fs/libfs.c)
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
static const struct dentry_operations simple_dentry_operations = {
.d_delete = simple_delete_dentry,
};
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
d_set_d_op(dentry, &simple_dentry_operations);
d_add(dentry, NULL);
return NULL;
}
d_set_d_op: (fs/dcache.c)
用simple_dentry_operations初始化dentry->d_op。
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
WARN_ON_ONCE(dentry->d_op);
WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
DCACHE_OP_COMPARE |
DCACHE_OP_REVALIDATE |
DCACHE_OP_DELETE ));
dentry->d_op = op;
if (!op)
return;
if (op->d_hash)
dentry->d_flags |= DCACHE_OP_HASH;
if (op->d_compare)
dentry->d_flags |= DCACHE_OP_COMPARE;
if (op->d_revalidate)
dentry->d_flags |= DCACHE_OP_REVALIDATE;
if (op->d_delete)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
}
d_add: (include/linux/cache.h)
傳入的inode參數爲NULL。d_instantiate函數將dentry->d_inode置爲NULL。
d_rehash將dentry加入hash表。
static inline void d_add(struct dentry *entry, struct inode *inode)
{
d_instantiate(entry, inode);
d_rehash(entry);
}
d_rehash: (fs/dcache.c)
void d_rehash(struct dentry * entry)
{
spin_lock(&entry->d_lock);
_d_rehash(entry);
spin_unlock(&entry->d_lock);
}
_d_rehash: (fs/dcache.c)
static void _d_rehash(struct dentry * entry)
{
__d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
}
__d_rehash: (fs/dcache.c)
通過hlist_bl_add_head_rcu將dentry鏈入dentry_hashtable表。
static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
{
BUG_ON(!d_unhashed(entry));
hlist_bl_lock(b);
entry->d_flags |= DCACHE_RCUACCESS;
hlist_bl_add_head_rcu(&entry->d_hash, b);
hlist_bl_unlock(b);
}
hlist_bl_add_head_rcu: (include/linux/rculist_bl.h)
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
struct hlist_bl_head *h)
{
struct hlist_bl_node *first;
/* don't need hlist_bl_first_rcu because we're under lock */
first = hlist_bl_first(h);
n->next = first;
if (first)
first->pprev = &n->next;
n->pprev = &h->first;
/* need _rcu because we can have concurrent lock free readers */
hlist_bl_set_first_rcu(h, n);
}
lookup_real返回後,向上一直返回到kern_path_create函數,接着kern_path_create函數檢查返回的dentry->d_inode是否不空,如果不空表示inode節點已經存在了,不需要創建,本例中/dev下的root節點是不存在的(需要創建),因此將nd.path賦值給path後,直接返回dentry。
返回到sys_mknodat函數,執行mnt_want_write,這個函數告訴文件系統將要執行寫操作了,如果不可寫,將會返回錯誤。
如果可寫,調用vfs_mknod函數創建inode節點。
vfs_mknod : (fs/namei.c)
這個函數其實就是調用了要創建節點的父目錄的inode節點的方法i_op->mknod。
int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
int error = may_create(dir, dentry);
if (error)
return error;
if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
!ns_capable(inode_userns(dir), CAP_MKNOD))
return -EPERM;
if (!dir->i_op->mknod)
return -EPERM;
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
error = security_inode_mknod(dir, dentry, mode, dev);
if (error)
return error;
error = dir->i_op->mknod(dir, dentry, mode, dev);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
inode節點的方法是創建inode節點時初始化的,這裏也就是/dev節點創建時初始化的,dev節點是在rootfs虛擬文件系統下創建的(前面說過rootfs虛擬文件系統是linux kernel真正意義上掛載的第一個根文件系統),它的i_op被賦值爲ramfs_dir_inode_operations。
(fs/ramfs/inode.c)
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
所以i_op->mknod方法就是調用的ramfs_mknod函數。
ramfs_mknod: (fs/ramfs/inode.c)
ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
if (inode) {
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
}
return error;
}
ramfs_mknod函數調用了ramfs_get_inode函數創建一個新的inode。
ramfs_get_inode: (fs/ramfs/inode.c)
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG:
inode->i_op = &ramfs_file_inode_operations;
inode->i_fop = &ramfs_file_operations;
break;
case S_IFDIR:
inode->i_op = &ramfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
}
return inode;
}
調用new_inode分配一個新inode。
new_inode: (fs/inode.c)
new_inode進一步調用new_inode_pseudo函數新建inode。如果創建成功,則將其加入到該文件系統超級塊的inode鏈表。
struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
spin_lock_prefetch(&inode_sb_list_lock);
inode = new_inode_pseudo(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
}
new_inode_pseudo: (fs/inode.c)
pseudo是“虛擬”的意思,因爲當前是在虛擬根文件系統————rootfs文件系統下創建節點,前面提過rootfs文件系統是基於內存的特殊文件系統,它的inode是隻存在於內存中的。而其它像ext, fat等文件系統的inode節點都存儲在硬盤等外部設備中。
new_inode_pseudo調用alloc_inode創建inode。
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
INIT_LIST_HEAD(&inode->i_sb_list);
}
return inode;
}
alloc_inode: (fs/inode.c)
如果對應文件系統超級塊的s_op->alloc_inode存在,則調用它,否則調用kmem_cache_alloc函數分配inode。
static struct inode *alloc_inode(struct super_block *sb)
{
struct inode *inode;
if (sb->s_op->alloc_inode)
inode = sb->s_op->alloc_inode(sb);
else
inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
if (unlikely(inode_init_always(sb, inode))) {
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, inode);
return NULL;
}
return inode;
}
在掛載rootfs虛擬根文件系統的時候,對應超級塊的s_op已經被設置爲ramfs_ops了。可以看到,alloc_inode沒有設置,所以調用kmem_cache_alloc函數來分配inode。
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = generic_show_options,
};
inode分配好後就將它返回new_inode_pseudo,再返回到new_inode,接着new_inode函數調用inode_sb_list_add將新的inode加入到文件系統超級塊的inode鏈表中。
返回到ramfs_get_inode。
從new_inode返回後,進行一些初始化工作(包括inode的節點號),然後進入init_special_inode函數。
init_special_inode: (fs/inode.c)
前面create_dev的時候傳進來的參數mode是塊設備,所以這裏將inode節點操作函數設爲def_blk_fops,設備號設爲rdev。
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
if (S_ISCHR(mode)) {
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &def_fifo_fops;
else if (S_ISSOCK(mode))
inode->i_fop = &bad_sock_fops;
else
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
}
至此,inode節點就創建好了,向上層層返回到ramfs_get_inode,ramfs_get_inode就將創建好的inode節點直接返回到ramfs_mknod。
ramfs_mknod進一步調用d_instantiate。
d_instantiate: (fs/dcache.c)
d_instantiate函數封裝了__d_instantiate。
void d_instantiate(struct dentry *entry, struct inode * inode)
{
BUG_ON(!list_empty(&entry->d_alias));
if (inode)
spin_lock(&inode->i_lock);
__d_instantiate(entry, inode);
if (inode)
spin_unlock(&inode->i_lock);
security_d_instantiate(entry, inode);
}
__d_instantiate: (fs/dcache.c)
__d_instantiate將inode與dentry關聯起來。
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
spin_lock(&dentry->d_lock);
if (inode) {
if (unlikely(IS_AUTOMOUNT(inode)))
dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
list_add(&dentry->d_alias, &inode->i_dentry);
}
dentry->d_inode = inode;
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
fsnotify_d_instantiate(dentry, inode);
}
層層返回,至此,/dev/root設備就創建完畢了。