While studying how Linux boots from an initramfs, the archive's files have to be unpacked into the rootfs filesystem. I was puzzled about how those files actually get written into rootfs, so I decided to dig into how the rootfs root filesystem is read and written. The first root filesystem Linux mounts during initialization is essentially a ramfs filesystem, which exists only in RAM. Let's start with how this root filesystem is initialized.
1 Initialization
start_kernel
----------->vfs_caches_init
-------------->mnt_init
The root filesystem is initialized in mnt_init:
void __init mnt_init(void)
{
unsigned u;
int err;
init_rwsem(&namespace_sem);
/* ... */
fs_kobj = kobject_create_and_add("fs", NULL);
if (!fs_kobj)
printk(KERN_WARNING "%s: kobj create error\n", __func__);
init_rootfs(); // register the rootfs filesystem type
init_mount_tree(); // mount the root filesystem
}
init_rootfs registers the rootfs filesystem type:
static struct file_system_type rootfs_fs_type = {
.name = "rootfs",
.mount = rootfs_mount,
.kill_sb = kill_litter_super,
};
int __init init_rootfs(void)
{
int err;
err = bdi_init(&ramfs_backing_dev_info);
if (err)
return err;
err = register_filesystem(&rootfs_fs_type);
if (err){
bdi_destroy(&ramfs_backing_dev_info);
}
return err;
}
init_mount_tree mounts the rootfs filesystem: it sets up the mount point, creates the dentry and inode for the '/' root directory, and assigns this root path to the current process.
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mnt_namespace *ns;
struct path root;
struct file_system_type *type;
type = get_fs_type("rootfs");
if (!type)
panic("Can't find rootfs type");
mnt = vfs_kern_mount(type, 0, "rootfs", NULL); // mount the root filesystem
put_filesystem(type);
if (IS_ERR(mnt))
panic("Can't create rootfs");
ns = create_mnt_ns(mnt);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
root.mnt = mnt; // record the vfsmount
root.dentry = mnt->mnt_root; // record the dentry of the root directory
set_fs_pwd(current->fs, &root); // set the process's working directory to the rootfs root
set_fs_root(current->fs, &root); // set the process's root to the rootfs root
}
The main mounting work is done in vfs_kern_mount:
vfs_kern_mount
------------>mount_fs
mount_fs invokes the rootfs_mount function of the rootfs filesystem type registered above:
static struct dentry *rootfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super);
}
struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
int error;
struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); // allocate and initialize a super_block
if (IS_ERR(s))
return ERR_CAST(s);
error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); // finish initializing the superblock, and allocate the dentry and inode for the root
if (error) {
deactivate_locked_super(s);
return ERR_PTR(error);
}
s->s_flags |= MS_ACTIVE;
return dget(s->s_root);
}
The fill_super callback is ramfs_fill_super:
int ramfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct ramfs_fs_info *fsi;
struct inode *inode;
int err;
save_mount_options(sb, data);
fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
sb->s_fs_info = fsi;
if (!fsi)
return -ENOMEM;
err = ramfs_parse_options(data, &fsi->mount_opts);
if (err)
return err;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); // create the root inode
sb->s_root = d_make_root(inode); // create the root dentry
if (!sb->s_root)
return -ENOMEM;
return 0;
}
Let's look at how ramfs creates a new inode:
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG: // operations for regular files
inode->i_op = &ramfs_file_inode_operations;
inode->i_fop = &ramfs_file_operations;
break;
case S_IFDIR: // operations for directories
inode->i_op = &ramfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
}
return inode;
}
The inode initialization above is important: the operation tables installed here are used whenever the file is read or written, as analyzed in detail below. For a more thorough treatment of filesystem mounting, see this article:
https://blog.csdn.net/oqqYuJi12345678/article/details/101689334
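As an aside: since rootfs_mount (shown above) passes MS_NOUSER, rootfs itself cannot be mounted from user space, but a plain ramfs instance can, and it goes through the same ramfs_fill_super/ramfs_get_inode path analyzed above. A minimal sketch, assuming a /mnt/ramfs mount point already exists and the caller has root privileges:
#include <stdio.h>
#include <sys/mount.h>
int main(void)
{
    /* equivalent to: mount -t ramfs none /mnt/ramfs
     * (the mount point /mnt/ramfs is an assumption for illustration) */
    if (mount("none", "/mnt/ramfs", "ramfs", 0, NULL) < 0) {
        perror("mount");
        return 1;
    }
    return 0;
}
Files created under such a mount live purely in the page cache, exactly as described in the sections that follow.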
2 File open (create)
Let's analyze the whole read/write path using a concrete file. Suppose we read and write /test, a file located in the root directory of the root filesystem. The first open of /test sets the create flag, so if the file does not exist it is created on the spot. The root directory itself was already built when the root filesystem was mounted above. A detailed analysis of the open path can be found in the following article, so the parts it covers are not repeated here:
https://blog.csdn.net/oqqYuJi12345678/article/details/101849978
Path lookup starts at the root directory, whose dentry and inode, as shown above, already exist. So after do_sys_open has located the root dentry and inode, and given that O_CREAT is set, lookup_open is called to create the dentry and inode for the test entry.
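As a concrete trigger for this path, a minimal user-space call might look like this (the file name /test and the mode are just for illustration):
#include <fcntl.h>
#include <stdio.h>
int main(void)
{
    /* with O_CREAT, the first open of a missing /test drives
     * lookup_open -> vfs_create -> ramfs_create below, which builds
     * the file's dentry and inode on the fly */
    int fd = open("/test", O_CREAT | O_RDWR, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    return 0;
}
The kernel-side call chain it exercises is: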
do_sys_open
----------->do_filp_open
-------------->path_openat
-------------->do_last
----------------->lookup_open
static int lookup_open(struct nameidata *nd, struct path *path,
struct file *file,
const struct open_flags *op,
bool got_write, int *opened)
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
struct dentry *dentry;
int error;
bool need_lookup;
*opened &= ~FILE_CREATED;
// look up, or allocate, the dentry
dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
/* ... */
// if the dentry has no inode yet and the open specified O_CREAT
if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
umode_t mode = op->mode;
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current_umask();
/*
* This write is needed to ensure that a
* rw->ro transition does not occur between
* the time when the file is created and when
* a permanent write count is taken through
* the 'struct file' in finish_open().
*/
if (!got_write) {
error = -EROFS;
goto out_dput;
}
*opened |= FILE_CREATED;
error = security_path_mknod(&nd->path, dentry, mode, 0);
if (error)
goto out_dput;
// create the inode
error = vfs_create(dir->d_inode, dentry, mode,
nd->flags & LOOKUP_EXCL);
if (error)
goto out_dput;
}
out_no_open:
path->dentry = dentry;
path->mnt = nd->path.mnt;
return 1;
out_dput:
dput(dentry);
return error;
}
Now look at the implementation of vfs_create:
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool want_excl)
{
int error = may_create(dir, dentry);
if (error)
return error;
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
mode &= S_IALLUGO;
mode |= S_IFREG;
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
// call the create method in the parent inode's i_op table to create this node's inode
error = dir->i_op->create(dir, dentry, mode, want_excl);
if (!error)
fsnotify_create(dir, dentry);
return error;
}
So which function is the create method in the parent inode's i_op table? The parent here is the root directory, so it is the create method of the root directory's inode. From the initialization in section 1 we know that a rootfs inode representing a directory uses the i_op table ramfs_dir_inode_operations:
static const struct inode_operations ramfs_dir_inode_operations = {
.create = ramfs_create,
.lookup = simple_lookup,
.link = simple_link,
.unlink = simple_unlink,
.symlink = ramfs_symlink,
.mkdir = ramfs_mkdir,
.rmdir = simple_rmdir,
.mknod = ramfs_mknod,
.rename = simple_rename,
};
The create method is ramfs_create:
static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
if (inode) {
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
}
return error;
}
As you can see, ramfs_get_inode is ultimately called to build the inode for the test file. Note that while allocating and initializing the inode, ramfs_get_inode sets inode->i_mapping->a_ops = &ramfs_aops; this operation table is used below.
At this point both the inode and the dentry of the test file exist. Finally, finish_open binds the inode to the file descriptor.
do_last
-------->finish_open
int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *),
int *opened)
{
int error;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
file->f_path.dentry = dentry; // bind the file to the dentry
error = do_dentry_open(file, open, current_cred());
if (!error)
*opened |= FILE_OPENED;
return error;
}
static int do_dentry_open(struct file *f,
int (*open)(struct inode *, struct file *),
const struct cred *cred)
{
static const struct file_operations empty_fops = {};
struct inode *inode;
int error;
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
if (unlikely(f->f_flags & O_PATH))
f->f_mode = FMODE_PATH;
path_get(&f->f_path);
inode = f->f_inode = f->f_path.dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
error = __get_file_write_access(inode, f->f_path.mnt);
if (error)
goto cleanup_file;
if (!special_file(inode->i_mode))
file_take_write(f);
}
--------------------------------------------------------------(1)
f->f_mapping = inode->i_mapping;
file_sb_list_add(f, inode->i_sb);
if (unlikely(f->f_mode & FMODE_PATH)) {
f->f_op = &empty_fops;
return 0;
}
-----------------------------------------------------------------(2)
// copy the inode's i_fop into f->f_op; the read/write paths below use it
f->f_op = fops_get(inode->i_fop);
error = security_file_open(f, cred);
if (error)
goto cleanup_all;
error = break_lease(inode, f->f_flags);
if (error)
goto cleanup_all;
if (!open && f->f_op)
open = f->f_op->open;
if (open) {
error = open(inode, f); // not reached for ramfs, which supplies no open method
if (error)
goto cleanup_all;
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
return 0;
/* ... */
}
(1) The inode's i_mapping is assigned to f->f_mapping. i_mapping tracks the pages that hold the file's contents and is used by the read and write paths below.
(2) Since test is a regular file, the inode's i_fop table is ramfs_file_operations, also used by the read and write paths below.
Next, let's see how data is written to the test file.
3 File write
A write ends up in the kernel function vfs_write:
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
file_start_write(file);
if (file->f_op->write) // first check whether file->f_op->write exists
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
}
return ret;
}
The function above first checks whether file->f_op->write is set. From the open path we know that file->f_op is ramfs_file_operations:
const struct file_operations ramfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
.mmap = generic_file_mmap,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
So do_sync_write is called to perform the write:
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
There is nothing fancy in init_sync_kiocb; the key point is simply that it sets kiocb.ki_filp = filp. The user buffer information has been packed into the iovec, and then the aio_write method is called, which here is generic_file_aio_write:
ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
// during inode initialization, inode_init_always does:
//     struct address_space *const mapping = &inode->i_data;
//     mapping->host = inode;
//     inode->i_mapping = mapping;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = generic_write_sync(file, pos, ret);
if (err < 0 && ret > 0)
ret = err;
}
return ret;
}
which in turn calls __generic_file_aio_write:
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos;
ssize_t written;
ssize_t err;
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
if (err)
return err;
count = ocount;
pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
/* ... */
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
The O_DIRECT flag is normally not set, so the unimportant code is omitted above; the key call is to generic_file_buffered_write.
Looking at its arguments: iocb is still the same kiocb, and iov and nr_segs are passed through unchanged (nr_segs is 1). Provided the file size limits are not exceeded, pos = *ppos is the starting write position, count = iov->iov_len is the number of bytes to write, and written is 0.
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
ssize_t status;
struct iov_iter i;
// fill the iterator with the request: i->iov = iov; i->nr_segs = nr_segs;
// i->iov_offset = 0; i->count = count
iov_iter_init(&i, iov, nr_segs, count, written);
status = generic_perform_write(file, &i, pos);
if (likely(status >= 0)) {
written += status;
*ppos = pos + status;
}
return written ? written : status;
}
The core function is generic_perform_write:
static ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
--------------------------------------------------------(1)
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;
/*
* Copies from kernel address space cannot fail (NFSD is a big user).
*/
if (segment_eq(get_fs(), KERNEL_DS))
flags |= AOP_FLAG_UNINTERRUPTIBLE;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1)); // offset of the write position within its page
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
// iov_iter_count(i) is i->count, the bytes still to write; compare it with the
// room left in the current page: if the data fits, bytes is the whole remaining
// count; otherwise the write spans pages, and this round only fills what is left
// of the current page
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}
--------------------------------------------------------------(2)
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
pagefault_disable();
---------------------------------------------------------------(3)
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
flush_dcache_page(page);
mark_page_accessed(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
// advance the iterator by the bytes actually copied, shrinking the remaining count
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
// move the file position past the data just written
pos += copied;
// accumulate the total bytes written
written += copied;
balance_dirty_pages_ratelimited(mapping);
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
} while (iov_iter_count(i)); // loop until the remaining count drops to 0
return written ? written : status;
}
(1) The mapping->a_ops operation table is ramfs_aops, installed when the inode was initialized above:
const struct address_space_operations ramfs_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
.set_page_dirty = __set_page_dirty_no_writeback,
};
(2) write_begin is simple_write_begin; its len argument is the number of bytes to write in this round, which is at most one page:
int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct page *page;
pgoff_t index;
index = pos >> PAGE_CACHE_SHIFT; // which page of the file the write position falls in
// look the page up by index in the mapping; if it is not there, allocate a new
// page and hand it to the mapping — that page is where the data will be written
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
}
return 0;
}
As you can see, the index of the page to write is computed from the current file position. From this we can infer that the file's contents are managed page by page, all kept in the mapping structure. If the file is 8 KB it needs two pages; if the write position is at 6 KB, the index is 1 and the write starts in the second page. Let's look at grab_cache_page_write_begin in detail:
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags)
{
int status;
gfp_t gfp_mask;
struct page *page;
gfp_t gfp_notmask = 0;
gfp_mask = mapping_gfp_mask(mapping);
if (mapping_cap_account_dirty(mapping))
gfp_mask |= __GFP_WRITE;
if (flags & AOP_FLAG_NOFS)
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index); // does the mapping already hold the page with this index?
if (page) // if found, return it directly
goto found;
// not found: allocate a new page
page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
if (!page)
return NULL;
// add the page to the mapping
status = add_to_page_cache_lru(page, mapping, index,
GFP_KERNEL & ~gfp_notmask);
if (unlikely(status)) {
page_cache_release(page);
if (status == -EEXIST)
goto repeat;
return NULL;
}
found:
wait_for_stable_page(page);
return page;
}
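To make the page arithmetic concrete — including the claim above that a write landing at 6 KB starts in the page with index 1 — here is a small user-space sketch, assuming 4 KiB pages (the usual PAGE_CACHE_SIZE), that partitions a write the same way generic_perform_write and simple_write_begin do:
#include <stdio.h>
#define PAGE_SHIFT 12 /* assume 4 KiB pages */
#define PAGE_SIZE (1UL << PAGE_SHIFT)
/* split a write of 'count' bytes starting at file position 'pos'
 * into per-page chunks, mirroring the loop in generic_perform_write */
static void split_write(unsigned long pos, unsigned long count)
{
    while (count) {
        unsigned long offset = pos & (PAGE_SIZE - 1); /* offset inside the page */
        unsigned long index = pos >> PAGE_SHIFT;      /* page index in the mapping */
        unsigned long bytes = PAGE_SIZE - offset;     /* room left in this page */
        if (bytes > count)
            bytes = count;
        printf("index %lu, offset %lu, bytes %lu\n", index, offset, bytes);
        pos += bytes;
        count -= bytes;
    }
}
int main(void)
{
    /* writing 10000 bytes at offset 6144 (6 KB) prints:
     *   index 1, offset 2048, bytes 2048
     *   index 2, offset 0, bytes 4096
     *   index 3, offset 0, bytes 3856
     * so the write indeed begins in the second page (index 1) */
    split_write(6144, 10000);
    return 0;
}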
(3) iov_iter_copy_from_user_atomic performs the actual data copy:
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
char *kaddr;
size_t copied;
BUG_ON(!in_atomic());
kaddr = kmap_atomic(page); // get the kernel virtual address of the page
if (likely(i->nr_segs == 1)) {
int left;
char __user *buf = i->iov->iov_base + i->iov_offset; // source address of the user data
left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); // copy it into the page
copied = bytes - left;
} else {
copied = __iovec_copy_from_user_inatomic(kaddr + offset,
i->iov, i->iov_offset, bytes);
}
kunmap_atomic(kaddr);
return copied;
}
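Note the else branch above: when the iovec holds more than one segment, __iovec_copy_from_user_inatomic walks the segments one by one. From user space that case can be reached with writev(2), which hands the kernel nr_segs > 1; a sketch, again using the illustrative /test:
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
int main(void)
{
    int fd = open("/test", O_CREAT | O_WRONLY, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* two segments mean nr_segs == 2 inside the kernel, so the copy
     * takes the __iovec_copy_from_user_inatomic branch shown above */
    struct iovec iov[2] = {
        { .iov_base = "hello ", .iov_len = 6 },
        { .iov_base = "ramfs\n", .iov_len = 6 },
    };
    if (writev(fd, iov, 2) < 0)
        perror("writev");
    return 0;
}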
That completes the analysis of writing a file. Now let's look at reading, which follows a similar pattern.
4 File read
vfs_read
----------->do_sync_read
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}
The parameter buf is the destination address of the read, len is the number of bytes to read, and ppos is the starting position in the file. filp->f_op->aio_read is generic_file_aio_read:
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
/* ... */
count = retval;
// nr_segs is 1
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
loff_t offset = 0;
/*
* If we did a short DIO read we need to skip the section of the
* iov that we've already read data into.
*/
if (count) {
if (count > iov[seg].iov_len) {
count -= iov[seg].iov_len;
continue;
}
offset = count;
count = 0;
}
desc.written = 0;
desc.arg.buf = iov[seg].iov_base + offset; // destination buffer
desc.count = iov[seg].iov_len - offset; // number of bytes to read
if (desc.count == 0)
continue;
desc.error = 0;
do_generic_file_read(filp, ppos, &desc, file_read_actor);
retval += desc.written;
if (desc.error) {
retval = retval ?: desc.error;
break;
}
if (desc.count > 0)
break;
}
out:
return retval;
}
Go straight to do_generic_file_read:
static void do_generic_file_read(struct file *filp, loff_t *ppos,
read_descriptor_t *desc, read_actor_t actor)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
int error;
index = *ppos >> PAGE_CACHE_SHIFT; // first page in the mapping holding the read start position
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK; // offset within that page, i.e. which byte to start reading at
for (;;) {
struct page *page;
pgoff_t end_index;
loff_t isize;
unsigned long nr, ret;
cond_resched();
find_page:
page = find_get_page(mapping, index); // find the page to start reading from in the mapping
if (!page) {
page_cache_sync_readahead(mapping,
ra, filp,
index, last_index - index);
page = find_get_page(mapping, index);
if (unlikely(page == NULL))
goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
/* Did it get truncated before we got the lock? */
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
unlock_page(page);
}
page_ok:
/*
* i_size must be checked after we know the page is Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
// read inode->i_size, the length of the file's data
isize = i_size_read(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT; // index of the last page of the file's data
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE;
if (index == end_index) {
nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
if (nr <= offset) {
page_cache_release(page);
goto out;
}
}
// the adjustment above: on the last page nr is the actual remaining data length,
// otherwise it is a full page
nr = nr - offset; // bytes to read from this page
/* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
/*
* When a sequential read accesses a page several times,
* only mark it as accessed the first time.
*/
if (prev_index != index || offset != prev_offset)
mark_page_accessed(page);
prev_index = index;
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
* The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
page_cache_release(page);
if (ret == nr && desc->count)
continue;
goto out;
page_not_up_to_date:
/* Get exclusive access to the page ... */
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
continue;
}
/* Did somebody else fill it already? */
if (PageUptodate(page)) {
unlock_page(page);
goto page_ok;
}
readpage:
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
* PG_error will be set again if readpage fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
error = mapping->a_ops->readpage(filp, page);
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
goto find_page;
}
goto readpage_error;
}
if (!PageUptodate(page)) {
error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
if (page->mapping == NULL) {
/*
* invalidate_mapping_pages got it
*/
unlock_page(page);
page_cache_release(page);
goto find_page;
}
unlock_page(page);
shrink_readahead_size_eio(filp, ra);
error = -EIO;
goto readpage_error;
}
unlock_page(page);
}
goto page_ok;
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
goto out;
no_cached_page:
/*
* Ok, it wasn't cached, so we need to create a new
* page..
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
desc->error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
if (error == -EEXIST)
goto find_page;
desc->error = error;
goto out;
}
goto readpage;
}
out:
ra->prev_pos = prev_index;
ra->prev_pos <<= PAGE_CACHE_SHIFT;
ra->prev_pos |= prev_offset;
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
file_accessed(filp);
}
In all the code above, what actually reads the data is the actor callback; everything else merely computes index, offset, nr and so on — that is, the callback's arguments: ret = actor(desc, page, offset, nr).
The four parameters are the read descriptor, the page, the offset within the page, and the number of bytes to read:
int file_read_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
{
char *kaddr;
unsigned long left, count = desc->count;
if (size > count)
size = count;
/*
* Faults on the destination of a read are common, so do it before
* taking the kmap.
*/
if (!fault_in_pages_writeable(desc->arg.buf, size)) {
kaddr = kmap_atomic(page); // get the kernel virtual address of the page
left = __copy_to_user_inatomic(desc->arg.buf,
kaddr + offset, size); // copy from the file page into the destination buffer
kunmap_atomic(kaddr);
if (left == 0)
goto success;
}
/* Do it the slow way */
kaddr = kmap(page);
left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
kunmap(page);
if (left) {
size -= left;
desc->error = -EFAULT;
}
success:
desc->count = count - size;
desc->written += size;
desc->arg.buf += size; // advance the destination pointer
return size;
}
Inside this callback, size bytes at kaddr + offset are copied into the descriptor's buf; desc->count shrinks and buf advances accordingly. With that, the open, write and read paths have all been analyzed.
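Putting the three sections together, a minimal user-space round trip over the paths analyzed above could look like this (/test is illustrative; in an initramfs environment / is still rootfs):
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
int main(void)
{
    char buf[32] = {0};
    /* section 2: lookup_open -> vfs_create -> ramfs_create build the
     * dentry and inode on the first open with O_CREAT */
    int fd = open("/test", O_CREAT | O_RDWR, 0644);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* section 3: vfs_write -> do_sync_write -> generic_file_aio_write
     * copies the data into a page-cache page owned by the inode's mapping */
    if (write(fd, "hello rootfs", 12) != 12)
        perror("write");
    /* section 4: vfs_read -> do_sync_read -> generic_file_aio_read ->
     * do_generic_file_read -> file_read_actor copies the page back out */
    lseek(fd, 0, SEEK_SET);
    if (read(fd, buf, sizeof(buf) - 1) > 0)
        printf("read back: %s\n", buf);
    close(fd);
    return 0;
}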