Linux open系統調用流程(3)

1. 閒言少敘,繼續分析__link_path_walk函數:

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
/**
 * Three situations are handled here:
 * (1) walking an ordinary pathname component by component
 * (2) stopping at the parent directory (LOOKUP_PARENT)
 * (3) following symbolic links (first fetch the path stored in the link,
 *     then resolve that path)
 **/
static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
{
	struct path next;
	struct inode *inode;
	int err;
	/* lookup flags, snapshotted from nd before the walk starts */
	unsigned int lookup_flags = nd->flags;
	/* skip every leading slash, so "//" is treated like a single root */
	while (*name=='/')
		name++;
	/* nothing left after the slashes: the path consisted only of the root */
	if (!*name)
		goto return_reval;
	/* inode of the directory we start from; updated as each component is resolved */
	inode = nd->dentry->d_inode;
	/* inside a nested symlink resolution (nd->depth != 0): always follow links */
	if (nd->depth)
		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

	/* At this point we know we have a real path component. */
	for(;;) {
		/* running hash of the current component */
		unsigned long hash;
		/* current component: hash value, length and name pointer */
		struct qstr this;
		unsigned int c;
		/* set LOOKUP_CONTINUE while components are being walked */
		nd->flags |= LOOKUP_CONTINUE;
		/* traversing a directory requires execute permission on it */
		err = exec_permission_lite(inode, nd);
		if (err == -EAGAIN)
			err = vfs_permission(nd, MAY_EXEC);
 		if (err)
			break;
		/* name points at the first character of the current component */
		this.name = name;
		/* first character of the component, e.g. 'p' for "proc" */
		c = *(const unsigned char *)name;
		/* start a fresh hash */
		hash = init_name_hash();
		do {
			name++;
		/* fold each character into the hash until a slash or the end of the string */	
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		/* length of this component */
		this.len = name - (const char *) this.name;
		/* finalize and store the hash */
		this.hash = end_name_hash(hash);

		/* remove trailing slashes? */
		/* string ended right after the component: this is the last one */
		if (!c)
			goto last_component;
		while (*++name == '/');
		/* only slashes followed the component: last component with trailing slash */
		if (!*name)
			goto last_with_slashes;

		/*
		 * "." and ".." are special - ".." especially so because it has
		 * to be able to know about the current root directory and
		 * parent relationships.
		 */
		/* component starts with a dot: check for "." and ".." */
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:	/* two chars but not "..": an ordinary name, handle normally */
				if (this.name[1] != '.')
					break;
				/* "..": step back to the parent directory */
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				continue;
		}
		/*
		 * See if the low-level filesystem might want
		 * to use its own hash..
		 * (use the filesystem's d_hash method when it provides one)
		 */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* This does the actual lookups.. */
		/* nd is the current position, this the component name; next receives the dentry and mount found */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;

		err = -ENOENT;
		/* inode of the component just looked up */
		inode = next.dentry->d_inode;
		if (!inode)
			goto out_dput;
		err = -ENOTDIR; 
		if (!inode->i_op)
			goto out_dput;
		/* a follow_link method means the component is a symbolic link */
		if (inode->i_op->follow_link) {
			/* resolve the link; on success nd points at the link target */
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			err = -ENOENT;
			inode = nd->dentry->d_inode;
			if (!inode)
				break;
			err = -ENOTDIR; 
			if (!inode->i_op)
				break;
		} else
			/* not a link: copy the dentry and mount from next into nd */
			path_to_nameidata(&next, nd);
		err = -ENOTDIR; 
		if (!inode->i_op->lookup)/* no lookup method: not a directory */
			break;
		continue;
		/* here ends the main loop */

last_with_slashes:
		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
		/* Clear LOOKUP_CONTINUE iff it was previously unset (we are at the last component) */
		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
		/* some callers do not want the last component resolved, e.g. creating /foo/bar where bar does not exist yet: stop at foo */
		if (lookup_flags & LOOKUP_PARENT)
			goto lookup_parent;
		if (this.name[0] == '.') switch (this.len) {
			default:
				break;
			case 2:	
				if (this.name[1] != '.')
					break;
				follow_dotdot(nd);
				inode = nd->dentry->d_inode;
				/* fallthrough */
			case 1:
				goto return_reval;
		}
		/* use the filesystem's own d_hash method when it defines one */
		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
			if (err < 0)
				break;
		}
		/* look up the last component */
		err = do_lookup(nd, &this, &next);
		if (err)
			break;
		/* inode of the last component (may be NULL) */
		inode = next.dentry->d_inode;
		if ((lookup_flags & LOOKUP_FOLLOW)/* follow a trailing symlink only when LOOKUP_FOLLOW is set */
		    && inode && inode->i_op && inode->i_op->follow_link) {
			err = do_follow_link(&next, nd);
			if (err)
				goto return_err;
			inode = nd->dentry->d_inode;
		} else
			/* store the mount and dentry found into nd */
			path_to_nameidata(&next, nd);
		err = -ENOENT;
		if (!inode)/* no inode: the file does not exist */
			break;
		if (lookup_flags & LOOKUP_DIRECTORY) {/* the result is required to be a directory */
			err = -ENOTDIR; 
			if (!inode->i_op || !inode->i_op->lookup)/* no lookup method: not a directory */
				break;
		}
		goto return_base;/* success: nd holds the dentry and mount of the last component */
lookup_parent:/* the caller wants the parent directory, e.g. to create a file in it */
		/* remember the name of the last component */
		nd->last = this;
		/* and its type; a normal name unless proven otherwise below */
		nd->last_type = LAST_NORM;
		/* does not start with a dot: ordinary name */
		if (this.name[0] != '.')
			goto return_base;
		/* exactly ".": the current directory */
		if (this.len == 1)
			nd->last_type = LAST_DOT;
		/* exactly "..": the parent directory */
		else if (this.len == 2 && this.name[1] == '.')
			nd->last_type = LAST_DOTDOT;
		else
			goto return_base;
return_reval:
		/*
		 * We bypassed the ordinary revalidation routines.
		 * We may need to check the cached dentry for staleness.
		 */
		if (nd->dentry && nd->dentry->d_sb &&
		    (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
			err = -ESTALE;
			/* Note: we do not d_invalidate() */
			if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
				break;
		}
return_base:
		return 0;
out_dput:
		dput_path(&next, nd);
		break;
	}
	path_release(nd);
return_err:
	return err;
}

這個函數主要做三件事:

(1)解析已經存在的文件路徑,即打開標誌

(2)解析不存在的文件路徑,即創建文件標誌,這樣,需要得到父目錄項對象和安裝點對象

(3)解析符號鏈接,第一次找到符號鏈接的文件路徑,第二次解析路徑名

第23-26行,只有/,跳至return_reval. 這裏多個根當作一個根處理,如//

第31-32行,設置符號鏈接標誌。

第39行,定義qstr結構,這個結構包括hash值,分量長度和分量名。

第43-46行,進行權限檢查,遍歷目錄,必須具有執行權限。

第55-60行,計算每個分量的hash值。

第68行,如果解析到最後一個分量,跳至last_component.

第72行,如果遇到類似/proc/的目錄,跳至last_with_slashes.

第80行,如果分量的第一個字符是.,但第二個字符不是.,則正常解析。

第88行,當第二個字符也是. ,說明是父目錄,調用follow_dotdot進行回溯。

我們分析一下這個函數:

/*
 * Handle a ".." component: move nd one level up, crossing from a mounted
 * filesystem's root to its mountpoint in the parent mount when needed, but
 * never above the process root (current->fs->root / rootmnt).
 * follow_mount() is always called on the result before returning.
 */
static __always_inline void follow_dotdot(struct nameidata *nd)
{
	/* the current process's fs_struct, which holds its root directory */
	struct fs_struct *fs = current->fs;

	while(1) {
		struct vfsmount *parent;
		/* dentry we are leaving; its reference is dropped once we have moved */
		struct dentry *old = nd->dentry;
                read_lock(&fs->lock);
		/* already at the process root: ".." must not go any higher */
		if (nd->dentry == fs->root &&
		    nd->mnt == fs->rootmnt) {
                        read_unlock(&fs->lock);
			break;
		}
                read_unlock(&fs->lock);
		spin_lock(&dcache_lock);
		/* not the root of this mount: simply step to the parent dentry */
		if (nd->dentry != nd->mnt->mnt_root) {
			nd->dentry = dget(nd->dentry->d_parent);
			spin_unlock(&dcache_lock);
			dput(old);
			break;
		}
		spin_unlock(&dcache_lock);
		spin_lock(&vfsmount_lock);
		parent = nd->mnt->mnt_parent;
		/* a mount that is its own parent has nothing above it */
		if (parent == nd->mnt) {
			spin_unlock(&vfsmount_lock);
			break;
		}
		/* move to the mountpoint in the parent mount, then loop to take ".." from there */
		mntget(parent);
		nd->dentry = dget(nd->mnt->mnt_mountpoint);
		spin_unlock(&vfsmount_lock);
		dput(old);
		mntput(nd->mnt);
		nd->mnt = parent;
	}
	/* if something is mounted on the directory we landed on, descend into it */
	follow_mount(&nd->mnt, &nd->dentry);
}

第11-16行,如果回溯的是進程的根目錄,則不允許,調用follow_mount函數。

第19-23行,如果目錄項對象不是根目錄,則通過nd->dentry=dget(nd->dentry->d_parent)返回上一級目錄項對象。

不管怎麼樣,最終會調用follow_mount函數。有時,人的好奇心是很強的,同樣,對於Linux內核源碼,也需要好奇心。哈哈,看一下follow_mount函數:

/*
 * While *dentry is a mountpoint, replace (*mnt, *dentry) with the mount
 * found by lookup_mnt() and that mount's root dentry. Stops as soon as the
 * dentry is not a mountpoint or lookup_mnt() finds nothing. The references
 * on the old dentry and mount are dropped at each step.
 */
static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
	struct vfsmount *child;

	for (;;) {
		if (!d_mountpoint(*dentry))
			return;
		/* the mount sitting on top of (*mnt, *dentry), if any */
		child = lookup_mnt(*mnt, *dentry);
		if (!child)
			return;
		dput(*dentry);
		mntput(*mnt);
		*mnt = child;
		*dentry = dget(child->mnt_root);
	}
}
這個函數首先判斷一下dentry目錄項是不是掛載點,如果是,調用lookup_mnt函數返回子掛載點。在第11行,將mounted賦值給*mnt,接着繼續尋找子掛載點。最終,找到一個沒有其它文件系統安裝在其之上的文件系統掛載點。這裏,需要解釋一下,如果/dev/sda1和/dev/sda2先後掛載在/usr目錄下,那麼/dev/sda1的相關目錄將會被隱藏,而/dev/sda2的父掛載點是/dev/sda1. 而上面的過程是通過父掛載點找子掛載點,直到找到一個沒有掛載其它文件系統的掛載點爲止。這個文件系統暫且稱爲底層文件系統。也不知道這麼叫對不對,或許應該叫頂層文件系統。總之,follow_dotdot回溯到了上一級目錄。

接着__link_path_walk解釋,

第97行,如果底層文件系統具有計算hash值的函數,則調用。

第106行,查找分量的目錄項對象函數do_lookup,這個函數一會分析。

第119行,判斷是否是符號鏈接,調用do_follow_link處理符號鏈接,稍後分析。

第142行,處理最後一個分量。

第167行,調用do_lookup函數,找到一個最後分量的目錄項對象和掛載點對象。

第172行,如果最後一個分量是符號鏈接,調用do_follow_link進一步處理。

第190行,當只是建立文件時,跳至lookup_parent.

第192-205行,最後一個分量名和分量類型,此時,nd保存了上一個分量的目錄項對象和掛載點對象。

如果正確解析,返回0.

下面,分析一下do_lookup函數:

/*
 * Find the dentry for one path component and report it through @path.
 * The dentry cache is tried first; on a miss, or when revalidation
 * rejects the cached entry, real_lookup() is used. On success path->mnt
 * and path->dentry are filled in and __follow_mount() is applied to them.
 *
 * @nd:   current lookup position (parent dentry and mount)
 * @name: the component to look up
 * @path: receives the resulting mount and dentry
 *
 * Returns 0 on success or the negative error encoded in the failed pointer.
 *
 *  It's more convoluted than I'd like it to be, but... it's still fairly
 *  small and for now I'd prefer to have fast path as straight as possible.
 *  It _is_ time-critical.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
		     struct path *path)
{
	struct vfsmount *mnt = nd->mnt;
	/* fast path: dentry cache */
	struct dentry *found = __d_lookup(nd->dentry, name);

	/* a cached entry whose filesystem has d_revalidate must be re-checked;
	 * the result may be NULL (entry rejected) or an ERR_PTR */
	if (found && found->d_op && found->d_op->d_revalidate)
		found = do_revalidate(found, nd);

	/* cache miss, or the cached entry was rejected: do the real lookup */
	if (!found)
		found = real_lookup(nd->dentry, name, nd);

	if (IS_ERR(found))
		return PTR_ERR(found);

	path->mnt = mnt;
	path->dentry = found;
	/* step into whatever is mounted on the dentry we found */
	__follow_mount(path);
	return 0;
}

這個函數的主要功能是查詢目錄項對象,並將掛載點和目錄項對象保存在nameidata結構。具體如下:

第10行,nd保存了上一個目錄項對象和掛載點對象。

第12行,首先在目錄項緩存dentry cache查找,如果緩存不存在,跳轉到need_lookup,調用real_lookup在內存分配一個dentry,並將dentry和索引節點關聯。

第17行,如果存在,需要驗證目錄項對象是否有效,跳至34行,如果有效,將mnt和dentry賦值給path. 在__link_path_walk會將path值賦給nd.

繼續跟蹤__d_lookup函數:

/*
 * Search the dentry cache for a child of @parent named @name.
 * Returns the matching dentry with d_count incremented, or NULL when no
 * hashed match is found.
 */
struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
{
	unsigned int len = name->len;/* length of the component name */
	unsigned int hash = name->hash;/* hash of the component name */
	const unsigned char *str = name->name;/* the component name itself */
	struct hlist_head *head = d_hash(parent,hash);/* hash chain to search */
	struct dentry *found = NULL;
	struct hlist_node *node;
	struct dentry *dentry;

	rcu_read_lock();
	/* walk the hash chain */
	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
		struct qstr *qstr;
		/* cheap filter first: the name hash must match */
		if (dentry->d_name.hash != hash)
			continue;
		/* and the entry must be a child of parent */
		if (dentry->d_parent != parent)
			continue;

		spin_lock(&dentry->d_lock);

		/*
		 * Recheck the dentry after taking the lock - d_move may have
		 * changed things.  Don't bother checking the hash because we're
		 * about to compare the whole name anyway.
		 */
		if (dentry->d_parent != parent)
			goto next;

		/*
		 * It is safe to compare names since d_move() cannot
		 * change the qstr (protected by d_lock).
		 */
		/* dentry->d_name holds the candidate's name and length */
		qstr = &dentry->d_name;
		if (parent->d_op && parent->d_op->d_compare) {/* filesystem-specific comparison, e.g. a case-insensitive one */
			if (parent->d_op->d_compare(parent, qstr, name))
				goto next;
		} else {
			if (qstr->len != len)
				goto next;
			if (memcmp(qstr->name, str, len))
				goto next;
		}
		
		/* name matches; only return the entry if it is still hashed */
		if (!d_unhashed(dentry)) {
			atomic_inc(&dentry->d_count);
			found = dentry;
		}
		spin_unlock(&dentry->d_lock);
		break;
next:
		spin_unlock(&dentry->d_lock);
 	}
 	rcu_read_unlock();

 	return found;
}
第4-7行,賦值len,hash和name,並取得head指針,爲下面比較做準備。

第14行,判斷hash值是否相同。

第20行,判斷父目錄項parent是否相同。

第39行,匹配分量名。

如果找到,返回目錄項對象。

從這個查找過程,可以看出,是用目錄名或是文件名計算hash值,然後返回對應的目錄項對象。這也是爲什麼目錄名或文件名不放在索引節點而放在目錄項對象的原因。

如果目錄項緩存沒有,繼續跟蹤real_lookup函數:

/*
 * This is called when everything else fails, and we actually have
 * to go to the low-level filesystem to find out what we should do..
 *
 * We get the directory semaphore, and after getting that we also
 * make sure that nobody added the entry to the dcache in the meantime..
 * SMP-safe
 *
 * Returns the dentry for @name under @parent, or an ERR_PTR on failure.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
{
	/* inode of the directory being searched */
	struct inode *dir = parent->d_inode;
	struct dentry *found;
	struct dentry *fresh;

	mutex_lock(&dir->i_mutex);
	/*
	 * First re-do the cached lookup just in case it was created
	 * while we waited for the directory semaphore..
	 *
	 * FIXME! This could use version numbering or similar to
	 * avoid unnecessary cache lookups.
	 *
	 * The "dcache_lock" is purely to protect the RCU list walker
	 * from concurrent renames at this point (we mustn't get false
	 * negatives from the RCU list walk here, unlike the optimistic
	 * fast walk).
	 *
	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
	 */
	found = d_lookup(parent, name);
	if (found) {
		/*
		 * Uhhuh! Nasty case: the cache was re-populated while
		 * we waited on the semaphore. Need to revalidate.
		 */
		mutex_unlock(&dir->i_mutex);
		if (found->d_op && found->d_op->d_revalidate) {
			found = do_revalidate(found, nd);
			if (!found)
				found = ERR_PTR(-ENOENT);
		}
		return found;
	}

	/* still not cached: allocate a new dentry as a child of parent */
	fresh = d_alloc(parent, name);
	if (!fresh) {
		mutex_unlock(&dir->i_mutex);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * Ask the filesystem to look the name up. A non-NULL return
	 * replaces our dentry (which is then dropped); NULL means the
	 * dentry we passed in is the result.
	 */
	found = dir->i_op->lookup(dir, fresh, nd);
	if (found)
		dput(fresh);
	else
		found = fresh;
	mutex_unlock(&dir->i_mutex);
	return found;
}

在第33行,重新搜索一下目錄項緩存,由於進程在查找過程中可能阻塞,在這期間,目錄項可能已經加入了dentry cache,所以需要重新查找一下。

第34行,如果沒有找到,調用d_alloc分配一個目錄項對象。

第35行,具體的文件系統索引節點查找函數,讀取磁盤索引節點信息,並將索引節點和目錄項對象關聯。在iget索引節點時,將索引節點加入了inode cache. 在關聯inode節點時,將目錄項對象加入了dentry cache.

在第53行,驗證目錄項對象是否有效,最終返回目錄項對象。

可以看到,此時返回的目錄項對象已經加入到了dentry cache,並關聯了索引節點。即dentry->d_inode=inode.

我們繼續跟蹤上面的兩個函數,首先跟蹤d_alloc函數:

/**分配一個目錄項對象,並初始化
 * d_alloc	-	allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
 
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
	struct dentry *dentry;
	char *dname;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); 
	if (!dentry)
		return NULL;

	if (name->len > DNAME_INLINE_LEN-1) {
		dname = kmalloc(name->len + 1, GFP_KERNEL);
		if (!dname) {
			kmem_cache_free(dentry_cache, dentry); 
			return NULL;
		}
	} else  {
		dname = dentry->d_iname;
	}	
	dentry->d_name.name = dname;

	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(dname, name->name, name->len);
	dname[name->len] = 0;

	atomic_set(&dentry->d_count, 1);
	dentry->d_flags = DCACHE_UNHASHED;
	spin_lock_init(&dentry->d_lock);
	dentry->d_inode = NULL;
	dentry->d_parent = NULL;
	dentry->d_sb = NULL;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	dentry->d_mounted = 0;
#ifdef CONFIG_PROFILING
	dentry->d_cookie = NULL;
#endif
	INIT_HLIST_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_LIST_HEAD(&dentry->d_subdirs);
	INIT_LIST_HEAD(&dentry->d_alias);

	if (parent) {
		/*設置父目錄項對象爲parent*/
		dentry->d_parent = dget(parent);
		/*目錄項對象對應的超級塊對象*/
		dentry->d_sb = parent->d_sb;
	} else {
		INIT_LIST_HEAD(&dentry->d_u.d_child);
	}

	spin_lock(&dcache_lock);
	if (parent)
		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
	dentry_stat.nr_dentry++;
	spin_unlock(&dcache_lock);

	return dentry;
}

第16行,爲目錄項對象分配內存。

第29-32行,設置名稱,長度和hash值。

第48-51行,初始化相關鏈表。

第53行,如果父目錄項對象存在,就設置父目錄項對象和超級塊對象。這樣,就建立了一個子目錄項對象。


接着跟蹤lookup函數,以ext3爲例,ext3_lookup:

/*
 * ext3 ->lookup: find the name held in @dentry inside directory @dir.
 *
 * When the name exists, its inode is obtained with iget() and attached to
 * the dentry through d_splice_alias(). When it does not exist,
 * d_splice_alias() is called with a NULL inode.
 *
 * Returns whatever d_splice_alias() returns, ERR_PTR(-ENAMETOOLONG) for
 * an over-long name, or ERR_PTR(-EACCES) when the directory entry carries
 * an invalid inode number or iget() yields no inode.
 */
static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
	struct inode *inode = NULL;
	struct ext3_dir_entry_2 *de;
	struct buffer_head *bh;
	unsigned long ino;

	if (dentry->d_name.len > EXT3_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	/* locate the on-disk directory entry; it carries the inode number */
	bh = ext3_find_entry(dentry, &de);
	if (!bh)
		return d_splice_alias(NULL, dentry);

	ino = le32_to_cpu(de->inode);
	brelse (bh);

	if (ext3_valid_inum(dir->i_sb, ino))
		/* get the in-memory inode for this number */
		inode = iget(dir->i_sb, ino);
	else
		ext3_error(dir->i_sb, "ext3_lookup",
			   "bad inode number: %lu", ino);

	if (!inode)
		return ERR_PTR(-EACCES);

	/* attach the inode to the dentry */
	return d_splice_alias(inode, dentry);
}

第11行,得到ext3_dir_entry_2對象,該對象包含了索引節點號。

第13-16行,判斷索引節點號是否合法。

第21行,創建內存索引節點,並填充相關信息,將索引節點加入inode cache.

第28行,將目錄項對象和索引節點關聯。

首先,跟蹤iget函數:

 

/*
 * Get the inode numbered @ino on superblock @sb.
 * iget_locked() supplies the inode; if it comes back flagged I_NEW, the
 * superblock's read_inode method is called on it and it is then unlocked
 * with unlock_new_inode(). Returns whatever iget_locked() returned, which
 * may be NULL.
 */
static inline struct inode *iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	/* allocation failed: pass the NULL straight back */
	if (!inode)
		return inode;

	/* already initialised (no I_NEW): nothing more to do */
	if (!(inode->i_state & I_NEW))
		return inode;

	/* new inode: let the filesystem fill it in, then release it */
	sb->s_op->read_inode(inode);
	unlock_new_inode(inode);
	return inode;
}
首先調用iget_locked分配內存索引節點。如果是新分配的,需要調用read_inode讀取磁盤上的索引節點並填充相關信息。

繼續跟蹤iget_locked函數:

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @ino:	inode number to get
 *
 * This is iget() without the read_inode() portion of get_new_inode_fast().
 *
 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
 * the inode cache and if present it is returned with an increased reference
 * count. This is for file systems where the inode number is sufficient for
 * unique identification of an inode.
 *
 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
 * The file system gets to fill it in before unlocking it via
 * unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	/* hash chain in inode_hashtable for (sb, ino) */
	struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
	/* try the inode cache first */
	struct inode *cached = ifind_fast(sb, bucket, ino);

	/*
	 * get_new_inode_fast() will do the right thing, re-trying the search
	 * in case it had to block at any point.
	 */
	return cached ? cached : get_new_inode_fast(sb, bucket, ino);
}

第28行,在inode cache查找,如果沒有,調用get_new_inode_fast分配一個索引節點並插入inode cache.

ifind_fast留給讀者自行分析吧!

分析一下,get_new_inode_fast函數:

/*
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 *
 * Returns the new inode with I_LOCK|I_NEW set, an already existing inode
 * for (sb, ino) if one is found after taking inode_lock, or NULL when
 * alloc_inode() fails.
 */
static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
{
	struct inode *fresh = alloc_inode(sb);
	struct inode *existing;

	/* allocation failed: return the NULL as is */
	if (!fresh)
		return fresh;

	spin_lock(&inode_lock);
	/* We released the lock, so.. */
	existing = find_inode_fast(sb, head, ino);
	if (existing) {
		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		__iget(existing);
		spin_unlock(&inode_lock);
		destroy_inode(fresh);
		wait_on_inode(existing);
		return existing;
	}

	/* nobody beat us to it: number the inode and publish it */
	fresh->i_ino = ino;
	inodes_stat.nr_inodes++;
	/* put it on the inode_in_use list */
	list_add(&fresh->i_list, &inode_in_use);
	/* and on the superblock's inode list */
	list_add(&fresh->i_sb_list, &sb->s_inodes);
	/* and into the hash chain of inode_hashtable */
	hlist_add_head(&fresh->i_hash, head);
	fresh->i_state = I_LOCK|I_NEW;
	spin_unlock(&inode_lock);

	/* Return the locked inode with I_NEW set, the
	 * caller is responsible for filling in the contents
	 */
	return fresh;
}

第9行,分配索引節點。

第17-28行,索引節點的初始化。包括:

(1)設置索引節點號

(2)加入inode_in_use鏈表

(3)加入inode_hashtable,即加入inode cache

(4)設置狀態爲I_NEW

返回索引節點。

接下來,繼續分析iget函數中的第二個函數read_inode.

/*
 * Fill the in-memory inode @inode from its on-disk ext3 inode.
 *
 * The on-disk inode is located with __ext3_get_inode_loc(), which leaves a
 * buffer head in iloc.bh. Every exit taken after that call succeeds must
 * drop that buffer with brelse(), including the error exits that jump to
 * bad_inode. On any failure the inode is marked bad with make_bad_inode().
 */
void ext3_read_inode(struct inode * inode)
{	/* location of the on-disk inode (buffer head and block group) */
	struct ext3_iloc iloc;
	struct ext3_inode *raw_inode;
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct buffer_head *bh;
	int block;

#ifdef CONFIG_EXT3_FS_POSIX_ACL
	ei->i_acl = EXT3_ACL_NOT_CACHED;
	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
#endif
	ei->i_block_alloc_info = NULL;

	/* no buffer is held yet on this failure path, so nothing to release */
	if (__ext3_get_inode_loc(inode, &iloc, 0))
		goto bad_inode;
	bh = iloc.bh;
	/* raw on-disk inode; its fields are copied into the in-memory inode below */
	raw_inode = ext3_raw_inode(&iloc);
	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
	/* unless mounted with NO_UID32, add the high 16 bits of uid/gid */
	if(!(test_opt (inode->i_sb, NO_UID32))) {
		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
	}
	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
	inode->i_size = le32_to_cpu(raw_inode->i_size);
	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;

	ei->i_state = 0;
	ei->i_dir_start_lookup = 0;
	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
	/* We now have enough fields to check if the inode was active or not.
	 * This is needed because nfsd might try to access dead inodes
	 * the test is that same one that e2fsck uses
	 * NeilBrown 1999oct15
	 */
	if (inode->i_nlink == 0) {
		if (inode->i_mode == 0 ||
		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
			/* this inode is deleted */
			brelse (bh);
			goto bad_inode;
		}
		/* The only unlinked inodes we let through here have
		 * valid i_mode and are being read by the orphan
		 * recovery code: that's fine, we're about to complete
		 * the process of deleting those. */
	}
	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
#ifdef EXT3_FRAGMENTS
	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
	ei->i_frag_no = raw_inode->i_frag;
	ei->i_frag_size = raw_inode->i_fsize;
#endif
	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
	if (!S_ISREG(inode->i_mode)) {
		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
	} else {
		/* regular files: add the upper 32 bits of the size */
		inode->i_size |=
			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
	}
	ei->i_disksize = inode->i_size;
	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
	ei->i_block_group = iloc.block_group;
	/*
	 * NOTE! The in-memory inode i_data array is in little-endian order
	 * even on big-endian machines: we do NOT byteswap the block numbers!
	 */
	for (block = 0; block < EXT3_N_BLOCKS; block++)
		ei->i_data[block] = raw_inode->i_block[block];
	INIT_LIST_HEAD(&ei->i_orphan);

	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
		/*
		 * When mke2fs creates big inodes it does not zero out
		 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
		 * so ignore those first few inodes.
		 */
		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
		if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
		    EXT3_INODE_SIZE(inode->i_sb)) {
			/*
			 * The extra size does not fit in the on-disk inode.
			 * Drop the buffer before bailing out, as the
			 * deleted-inode exit above does; bad_inode does not
			 * release it.
			 */
			brelse (bh);
			goto bad_inode;
		}
		if (ei->i_extra_isize == 0) {
			/* The extra space is currently unused. Use it. */
			ei->i_extra_isize = sizeof(struct ext3_inode) -
					    EXT3_GOOD_OLD_INODE_SIZE;
		} else {
			__le32 *magic = (void *)raw_inode +
					EXT3_GOOD_OLD_INODE_SIZE +
					ei->i_extra_isize;
			if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				 ei->i_state |= EXT3_STATE_XATTR;
		}
	} else
		ei->i_extra_isize = 0;

	/* pick the inode and file operation tables according to the file type */
	if (S_ISREG(inode->i_mode)) {
		/* regular file */
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
	} else if (S_ISDIR(inode->i_mode)) {
		/* directory */
		inode->i_op = &ext3_dir_inode_operations;
		inode->i_fop = &ext3_dir_operations;
	} else if (S_ISLNK(inode->i_mode)) {
		/* symbolic link, fast or ordinary */
		if (ext3_inode_is_fast_symlink(inode))
			inode->i_op = &ext3_fast_symlink_inode_operations;
		else {
			inode->i_op = &ext3_symlink_inode_operations;
			ext3_set_aops(inode);
		}
	} else {
		/* anything else: special inode, device number decoded from i_block[0] or i_block[1] */
		inode->i_op = &ext3_special_inode_operations;
		if (raw_inode->i_block[0])
			init_special_inode(inode, inode->i_mode,
			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
		else
			init_special_inode(inode, inode->i_mode,
			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
	}
	/* done with the on-disk copy */
	brelse (iloc.bh);
	ext3_set_inode_flags(inode);
	return;

bad_inode:
	make_bad_inode(inode);
	return;
}

簡單說一下功能:

第19行,讀取磁盤上原始索引節點,用來填充新分配的索引節點。

第20-32行,inode相關域設置。

第104行,如果是文件,則將文件相關操作的指針賦給inode->i_fop,這非常重要,因爲,最後將i_fop賦給了文件對象file->f_op. 表示了文件的相關操作。

第109-111行,目錄相關操作。

第112-118行,符號鏈接相關操作。

第119-128行,設備相關操作。具體就不分析了。

到此爲止,我們已經得到了一個inode節點,並且填充了相關域。

iget函數返回,ext3_lookup繼續往下走,調用d_splice_alias函數:

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
 * and return it, else simply d_add the inode to the dentry and return NULL.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
	struct dentry *new = NULL;

	if (inode && S_ISDIR(inode->i_mode)) {
		spin_lock(&dcache_lock);
		/* look for an existing alias dentry of this directory inode */
		new = __d_find_alias(inode, 1);
		if (new) {
			BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
			fsnotify_d_instantiate(new, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(new, inode);
			/* hash the given dentry, move the alias into its place,
			 * then drop the inode reference that was passed in */
			d_rehash(dentry);
			d_move(new, dentry);
			iput(inode);
		} else {
			/* d_instantiate takes dcache_lock, so we do it by hand */
			/* link the dentry into the inode's alias list, headed at i_dentry */
			list_add(&dentry->d_alias, &inode->i_dentry);
			/* attach the inode to the dentry */
			dentry->d_inode = inode;
			fsnotify_d_instantiate(dentry, inode);
			spin_unlock(&dcache_lock);
			security_d_instantiate(dentry, inode);
			/* hash the dentry so that cache lookups can find it */
			d_rehash(dentry);
		}
	} else
		/* non-directory or NULL inode: plain d_add() */
		d_add(dentry, inode);
	return new;
}

第37行,將目錄項對象和索引節點相關聯。

第42行,將目錄項對象加入到目錄項緩存。

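最後,返回new:只有當目錄的inode已經存在一個斷開(disconnected)的別名目錄項、並被d_move到dentry的位置時,纔返回該別名;否則返回NULL,此時real_lookup繼續使用原來傳入的dentry。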

如果,你現在仍然很清醒,那麼恭喜你,你已經基本瞭解了整個過程。

lookup函數返回,在__link_path_walk函數調用path_to_nameidata將path->mnt和path->dentry賦給nd->mnt和nd->dentry.表示找到的目錄項對象和掛載點對象。

接下來,處理符號鏈接,調用do_follow_link函數:

/*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
 *
 * Without that kind of total limit, nasty chains of consecutive
 * symlinks can cause almost arbitrarily long lookups. 
 *
 * Returns the result of __do_follow_link(). If a limit is hit (-ELOOP) or
 * the security hook refuses, the references on @path and @nd are dropped
 * and the error is returned.
 */
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
	int rc;

	/* nesting depth and total number of links followed are both bounded */
	if (current->link_count >= MAX_NESTED_LINKS ||
	    current->total_link_count >= 40) {
		rc = -ELOOP;
		goto drop_refs;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
	cond_resched();
	rc = security_inode_follow_link(path->dentry, nd);
	if (rc)
		goto drop_refs;

	/* account for this link while it is being resolved */
	current->link_count++;
	current->total_link_count++;
	nd->depth++;
	rc = __do_follow_link(path, nd);
	/* nesting counters are restored; total_link_count is not decremented */
	current->link_count--;
	nd->depth--;
	return rc;

drop_refs:
	dput_path(path, nd);
	path_release(nd);
	return rc;
}

這個函數首先檢查符號鏈接數:嵌套層數不能超過MAX_NESTED_LINKS,總的符號鏈接數不能超過40.

最終調用__do_follow_link進行處理。

/*
 * Follow one symbolic link: call the inode's follow_link method, then
 * resolve the link text it left in nd (if any) with __vfs_follow_link().
 * Returns 0 on success, the error from follow_link, or the error from
 * resolving the link text.
 */
static __always_inline int __do_follow_link(struct path *path, struct nameidata *nd)
{
	int error;
	void *cookie;
	struct dentry *dentry = path->dentry;

	touch_atime(path->mnt, dentry);/* update the access time of the link */
	/* clear the saved link text before asking the filesystem for it */
	nd_set_link(nd, NULL);
	/* the link lives on a different mount than nd: move nd there and take an extra dentry reference */
	if (path->mnt != nd->mnt) {
		path_to_nameidata(path, nd);
		dget(dentry);
	}
	mntget(path->mnt);
	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);/* filesystem fetches the path stored in the link */
	error = PTR_ERR(cookie);
	if (!IS_ERR(cookie)) {
		/* link text left in nd by follow_link, if any */
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			error = __vfs_follow_link(nd, s);/* resolve that path */
		/* give the filesystem a chance to release what follow_link set up */
		if (dentry->d_inode->i_op->put_link)
			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
	}
	dput(dentry);
	mntput(path->mnt);

	return error;
}

第15行,取出符號鏈接的路徑,放到nd->saved_names。可以看出,符號鏈接有自己的inode節點,並且inode節點保存的內容是真正的文件路徑。所以,符號鏈接可以跨文件系統。

第22行,調用__vfs_follow_link解析路徑名。

/*
 * Resolve the path stored in a symbolic link by calling link_path_walk()
 * on it. @link may be an ERR_PTR, in which case nd is released and the
 * encoded error is returned.
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int res = 0;
	char *name;
	if (IS_ERR(link))
		goto fail;
	/* an absolute link target: restart the walk from the root */
	if (*link == '/') {
		path_release(nd);
		if (!walk_init_root(link, nd))
			/* weird __emul_prefix() stuff did it */
			goto out;
	}
	res = link_path_walk(link, nd);
out:
	/* nested resolution, an error, or a last component that is not a normal name: nothing to copy */
	if (nd->depth || res || nd->last_type!=LAST_NORM)
		return res;
	/*
	 * If it is an iterative symlinks resolution in open_namei() we
	 * have to copy the last component. And all that crap because of
	 * bloody create() on broken symlinks. Furrfu...
	 */
	name = __getname();
	if (unlikely(!name)) {
		path_release(nd);
		return -ENOMEM;
	}
	/* keep a private copy of the last component name in nd->last */
	strcpy(name, nd->last.name);
	nd->last.name = name;
	return 0;
fail:
	path_release(nd);
	return PTR_ERR(link);
}

第15行,調用link_path_walk. 看到這個函數,鬆了一口氣,因爲前面已經分析過了。

當__link_path_walk返回時,link_path_walk也跟着返回,之後do_path_lookup也返回了,最終回到open_namei函數。

如果是打開文件,返回即可。

如果是創建文件,還需調用open_namei_create函數:

/*
 * Create the file described by @path inside the directory nd->dentry and
 * make nd point at the new file.
 *
 * The directory's i_mutex is released here; the matching lock is not
 * taken in this function (presumably the caller holds it - confirm).
 * Returns the vfs_create() error, or otherwise the result of may_open().
 */
static int open_namei_create(struct nameidata *nd, struct path *path,
				int flag, int mode)
{
	struct dentry *parent = nd->dentry;
	int rc;

	/* the umask is applied only when the directory is not IS_POSIXACL */
	if (!IS_POSIXACL(parent->d_inode))
		mode &= ~current->fs->umask;

	rc = vfs_create(parent->d_inode, path->dentry, mode, nd);
	mutex_unlock(&parent->d_inode->i_mutex);

	/* drop the directory and switch nd over to the new file's dentry */
	dput(nd->dentry);
	nd->dentry = path->dentry;

	/* Don't check for write permission, don't truncate */
	return rc ? rc : may_open(nd, 0, flag & ~O_TRUNC);
}
封裝了vfs_create函數:

/*
 * Create a regular file @dentry in directory @dir through the directory
 * inode's create method.
 *
 * Returns 0 on success; otherwise the error from may_create(), -EACCES
 * when the directory has no create method, or the error from the security
 * hook or from the filesystem's create method.
 */
int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	int rc;

	rc = may_create(dir, dentry, nd);
	if (rc)
		return rc;

	if (!(dir->i_op && dir->i_op->create))
		return -EACCES;	/* shouldn't it be ENOSYS? */

	/* keep only the S_IALLUGO bits and mark the file as regular */
	mode = (mode & S_IALLUGO) | S_IFREG;

	rc = security_inode_create(dir, dentry, mode);
	if (rc)
		return rc;

	DQUOT_INIT(dir);
	rc = dir->i_op->create(dir, dentry, mode, nd);
	if (rc)
		return rc;

	/* notify only after a successful create */
	fsnotify_create(dir, dentry);
	return 0;
}

調用inode的create方法創建索引節點。以ext3爲例,調用ext3_create函數:

/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 */
static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
		struct nameidata *nd)
{
	handle_t *handle;
	struct inode * inode;
	int err, retries = 0;

retry:
	/* start a journal handle with enough credits for the create */
	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	if (IS_DIRSYNC(dir))
		handle->h_sync = 1;

	/* allocate the new inode */
	inode = ext3_new_inode (handle, dir, mode);
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		/* regular-file operation tables, then add the name to the directory */
		inode->i_op = &ext3_file_inode_operations;
		inode->i_fop = &ext3_file_operations;
		ext3_set_aops(inode);
		err = ext3_add_nondir(handle, dentry, inode);
	}
	ext3_journal_stop(handle);
	/* on -ENOSPC, start over if ext3_should_retry_alloc() allows it */
	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

第26行,創建索引節點。

第29-33行,inode->i_op和inode->i_fop賦值。

之後,還會將索引節點標識爲髒,需要回寫到磁盤上,具體實現就不分析了。

當open_namei函數返回時,open系統調用也就分析完了。


總結:

(1)建立一個struct file結構體,將nameidata相關域填充到這個結構體,最重要的兩個域mnt, dentry. 從dentry可得到inode,從而將i_fop賦給文件對象。

(2)在路徑查找時,通過父目錄項建立子目錄項,然後將子目錄項關聯inode節點。

(3)打開文件和建立文件不同。打開文件,只需要找到目錄項對象,然後關聯索引節點即可,因爲索引節點存在。而建立文件時,由於文件不存在,首先找到目錄的目錄項對象,然後建立子目錄項對象和索引節點對象,最後索引節點對象需要同步到磁盤上。

(4)有兩個緩存,dentry cache和inode cache,分別用來緩存目錄項對象和索引節點對象。

(5)將文件對象和進程的files_struct相關聯。

(6)對於普通文件,不需要打開操作,而對於設備文件,需要打開操作,例如SCSI設備的sg_open函數。

(7)主要處理三種情形:打開文件,建立文件和符號鏈接

參考文獻: <深入理解Linux內核第3版>




 

 

 


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章