linux內核管道pipe實現詳解
(筆者對文件系統暫時不是很瞭解,因此文件系統部分暫不做解釋;本文僅解釋關鍵流程。系統調用部分請參考前面已經發布的文章,這裏不做展開。)
1、管道系統調用(SyS_pipe)
1.1、SyS_pipe
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	/*
	 * The glibc prototype is "int pipe(int pipefd[2])", so the kernel
	 * has to produce two (fd, file) pairs: one for each end of the pipe.
	 */
	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		/* Hand the two descriptor numbers back to user space. */
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			/*
			 * The copy failed: drop both file references and
			 * give both reserved descriptor numbers back.
			 */
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			/*
			 * Install the descriptors: each fd is placed into the
			 * current task's files->fd array and thereby bound to
			 * its struct file.  The fd is just an integer; the
			 * struct file carries the file information and the
			 * operation table.  User space passes the fd on every
			 * later system call and the kernel looks up the file
			 * to find the real read/write functions.
			 */
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

/* pipe() is pipe2() with no flags. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
1.2、create_pipe_files
struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; // 文件操作函數指針(read, write等) /* * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; \#ifdef CONFIG_SECURITY void *f_security; \#endif /* needed for tty driver, and maybe others */ void *private_data; \#ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; struct list_head f_tfile_llink; \#endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; } __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; error = create_pipe_files(files, flags); // // 創建2個管道file文件結構體,主要綁定pipe_read、pipe_write即管道讀寫等相關函數 if (error) return error; error = get_unused_fd_flags(flags); // 獲取當前進程未使用的文件描述符作爲讀管道描述符 if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); // 獲取當前進程未使用的文件描述符作爲寫管道描述符 if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd[0] = fdr; // 返回給用戶的管道文件描述符 fd[1] = fdw; // 返回給用戶的管道文件描述符 return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: fput(files[0]); fput(files[1]); return error; }
/*
 * get_unused_fd_flags - reserve an unused descriptor in the current task.
 * @flags: flags forwarded to __alloc_fd()
 *
 * Searches from descriptor 0 up to the RLIMIT_NOFILE limit and returns
 * whatever __alloc_fd() returns.
 *
 * "current" is a macro that yields the running task, and the descriptor
 * table is reached through task_struct->files.
 * NOTE(review): the original article describes current as clearing the low
 * 13 bits of sp to find thread_info and then the task; that is specific to
 * the architecture/kernel version the author studied - confirm for yours.
 */
int get_unused_fd_flags(unsigned flags)
{
	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
int create_pipe_files(struct file **res, int flags) { int err; struct inode *inode = get_pipe_inode(); struct file *f; struct path path; static struct qstr name = { .name = "" }; if (!inode) return -ENFILE; err = -ENOMEM; path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); if (!path.dentry) goto err_inode; path.mnt = mntget(pipe_mnt); d_instantiate(path.dentry, inode); f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); // 創建file結構體(res[1]),用pipfifo_fops初始化 if (IS_ERR(f)) { err = PTR_ERR(f); goto err_dentry; } f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->private_data = inode->i_pipe; res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); // 創建file結構體(res[0]),用pipfifo_fops初始化 if (IS_ERR(res[0])) { err = PTR_ERR(res[0]); goto err_file; } path_get(&path); res[0]->private_data = inode->i_pipe; res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; err_file: put_filp(f); err_dentry: free_pipe_info(inode->i_pipe); path_put(&path); return err; err_inode: free_pipe_info(inode->i_pipe); iput(inode); return err; }
2、管道寫入(SyS_write)
/*
 * write(2) entry point.
 *
 * Returns what vfs_write() returns, or -EBADF when @fd does not resolve
 * to an open file.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	/*
	 * Look up the struct file (operation table, file information, ...)
	 * behind the descriptor.  An application does not care whether the
	 * descriptor is a pipe or a device: everything is handled as a file
	 * and the VFS dispatches to the concrete implementation.
	 */
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);	/* current file offset */

		ret = vfs_write(f.file, buf, count, &pos);
		/* The call did not fail: store the offset back. */
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput_pos(f);
	}

	return ret;
}
/*
 * vfs_write - validate a write request and pass it to the file's
 * write implementation.
 * @file:  file to write to
 * @buf:   user-space source buffer
 * @count: number of bytes requested
 * @pos:   in/out file offset
 *
 * Returns the value of __vfs_write() on the checked path, -EBADF or
 * -EINVAL when the mode bits forbid writing, -EFAULT when the user buffer
 * fails access_ok(), or the negative result of rw_verify_area().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* The file must carry the FMODE_WRITE mode bit. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	/*
	 * ... and the FMODE_CAN_WRITE bit.
	 * NOTE(review): presumably set when the file's operation table
	 * provides a write method - confirm where the bit is assigned.
	 */
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	/* VERIFY_READ: for a write the kernel reads from the user buffer. */
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret >= 0) {
		/* rw_verify_area() returns the byte count to use from here on. */
		count = ret;
		file_start_write(file);
		ret = __vfs_write(file, buf, count, pos);	/* do the write */
		if (ret > 0) {
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}
/*
 * __vfs_write - dispatch a write to the file's operation table.
 *
 * Uses ->write when the file provides it, otherwise wraps ->write_iter
 * through new_sync_write(); returns -EINVAL when neither exists.
 *
 * For a pipe the article follows the ->write_iter branch, which ends in
 * pipe_write() via new_sync_write().
 * NOTE(review): this presumes pipefifo_fops provides no ->write method;
 * its definition is not part of this listing - confirm.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
		    loff_t *pos)
{
	if (file->f_op->write)
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		return new_sync_write(file, p, count, pos);	/* pipe path */
	else
		return -EINVAL;
}
/*
 * new_sync_write - run a plain (buf, len) write through ->write_iter.
 *
 * Wraps the user buffer in a single-segment iovec, builds a synchronous
 * kiocb positioned at *ppos and an iov_iter over the iovec, then calls the
 * file's ->write_iter.  *ppos is updated only when bytes were written.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	/* For a pipe, ->write_iter is pipe_write(). */
	ret = filp->f_op->write_iter(&kiocb, &iter);
	/* A synchronous kiocb must not come back as "queued". */
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0)
		*ppos = kiocb.ki_pos;
	return ret;
}
/*
 * struct iov_iter - cursor over the data of an I/O request.
 * The annotations describe how the fields are used on the pipe write path.
 */
struct iov_iter {
	int type;
	size_t iov_offset;
	size_t count;	/* number of bytes still to be written to the pipe */
	union {
		/*
		 * iov->iov_base points at the data to write to the pipe,
		 * iov->iov_len holds its length.
		 */
		const struct iovec *iov;
		const struct kvec *kvec;
		const struct bio_vec *bvec;
	};
	unsigned long nr_segs;
};
static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; ssize_t ret = 0; int do_wakeup = 0; size_t total_len = iov_iter_count(from); // 需要寫入管道的數據長度 ssize_t chars; /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; __pipe_lock(pipe); if (!pipe->readers) { // 讀管道端已經關閉,寫管道無意義,返回-EPIPE send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ // 不是PAGE_SIZE大小的數據 if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (pipe->buffers - 1); // curbuf: the current pipe buffer entry,nrbufs: the number of non-empty pipe buffers in this pipe,buffers: total number of buffers (should be a power of 2),獲取下一個可用buff,這條語句類似取餘,假如管道buff是從0-9,有效數據起始buff是5,有效buff數據是6,那麼存儲管道數據的buff依次爲"5,6,7,8,9,0",下一個可用的buff就是1((5 + 6)/10),把10位清0了,&運算符也是把二進制的高位清0了,實現了一個類似的循環鏈表操作 struct pipe_buffer *buf = pipe->bufs + lastbuf; // 獲取下一個可以buffer的地址(上一行代碼只是獲取了索引,類似數組的下標) const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; // offset: offset of data inside the @page,len: length of data inside the @page,目前數據在page中的有效起始地址 + 有效數據長度 = 下一個可存放數據的地址 (管道是從前往後讀的,並沒規定讀寫大小,有可能只讀取了page的前一部分,中間部分尚未讀取,但是寫的時候必須從中間有效數據後繼續寫) if (ops->can_merge && offset + chars <= PAGE_SIZE) { // 當前需要寫入的數據 + 已有的數據 沒有超過PAGE_SIZE大小,可以拷貝到page裏面 ret = ops->confirm(pipe, buf); if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); // 從from(用戶寫的數據)拷貝到offset處 if (unlikely(ret < chars)) { ret = -EFAULT; // 寫的數據少於預期的,出錯返回(前面已經計算空間足夠,要是寫入與預期不一致,肯定出問題了) goto out; } do_wakeup = 1; buf->len += ret; // 該page已經寫入了ret字節數據,更新有效數據長度 if (!iov_iter_count(from)) goto out; // 拷貝之後,from裏面已經沒有數據需要寫了,全部數據已經寫入到管道了,不需要再寫 } } for (;;) { int bufs; if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } bufs = pipe->nrbufs; // // 
for循換之前寫小於PAGE_SIZE的數據,並沒有寫在新的page裏面,此處獲取當前管道有多少有效bufs if (bufs < pipe->buffers) { // 有效bufs小於管道總buffers int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); // 獲取下一有效buff index,前面已經解釋了,此處實現的類似循環鏈表功能 struct pipe_buffer *buf = pipe->bufs + newbuf; // 獲取下一有效pipe_buffer地址(起始地址+索引) struct page *page = pipe->tmp_page; int copied; if (!page) { page = alloc_page(GFP_HIGHUSER); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } pipe->tmp_page = page; } /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? */ do_wakeup = 1; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); // 前面獲取buff的時候獲取的是完全沒有數據的buff,因此此處拷貝到0偏移處,PAGE_SIZE大小數據 if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; // from剩餘數據不爲0,但是拷貝的數據不足PAGE_SIZE,也就是空間足夠但是數據寫入失敗,出錯,退出拷貝 break; } ret += copied; // 已拷貝的數據增加,需要返回給應用程序,告訴應用程序實際寫了多少數據 /* Insert it into the buffer array */ buf->page = page; // buf->tmp_page -> buf->page buf->ops = &anon_pipe_buf_ops; buf->offset = 0; // 有效數據的偏移地址 buf->len = copied; // 有效數據的長度 buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; buf->flags = PIPE_BUF_FLAG_PACKET; } pipe->nrbufs = ++bufs; // 管道有效bufs數目增加 pipe->tmp_page = NULL; if (!iov_iter_count(from)) break; // from沒有數據,所有數據已經寫入管道 } if (bufs < pipe->buffers) // 有效bufs小於管道總buffers,還有bufs可以寫數據,繼續將用戶數據寫入管道 continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) ret = -EAGAIN; // 剩餘bufs不夠,但是是非阻塞方式調用,返回給應用程序-EAGAIN,表示讓重新嘗試,而不是失敗 break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); // 喚醒等待列表(此處有個疑問,管道通常是一端讀一端寫,是否存在多進程讀寫的情況,按照一端讀,一端寫的情況,被喚醒的肯定只有讀進程,還沒看到過多進程讀寫的情況,細節後續再看,暫時先寫到這裏) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } pipe->waiting_writers++; // buffer不夠,等待buffer被讀取後再繼續寫入,等待寫入計數器加1,再其他任務讀buffer數據之後喚醒等待寫buffer的任務 pipe_wait(pipe); // 
設置當前任務狀態爲TASK_INTERRUPTIBLE,將當前任務添加到等待列表,釋放互斥鎖等,執行調度,切換到其他任務 pipe->waiting_writers--; // 等待寫計數器減1,重新嘗試寫管道 } out: __pipe_unlock(pipe); if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; sb_end_write(file_inode(filp)->i_sb); } return ret; }
3、管道讀取(SyS_read)
/*
 * read(2) entry point.
 *
 * Resolves @fd to its struct file, reads through vfs_read() at the file's
 * current offset and stores the offset back unless the read failed.
 * Returns the vfs_read() result, or -EBADF when @fd has no open file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t nread;
	loff_t offset;

	/* Nothing behind the descriptor: nothing was acquired, just bail out. */
	if (!f.file)
		return -EBADF;

	offset = file_pos_read(f.file);
	nread = vfs_read(f.file, buf, count, &offset);
	if (nread >= 0)
		file_pos_write(f.file, offset);
	fdput_pos(f);

	return nread;
}
static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; do_wakeup = 0; ret = 0; __pipe_lock(pipe); for (;;) { int bufs = pipe->nrbufs; // bufs相關操作參考寫管道函數,原理與寫管道一致 if (bufs) { int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; size_t chars = buf->len; size_t written; int error; if (chars > total_len) chars = total_len; error = ops->confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ } if (bufs) /* More to do? */ // bufs不爲空,繼續讀 continue; if (!pipe->writers) // 管道寫端已經關閉,且bufs爲空,沒有繼續等待的必要,永遠沒法再讀到數據 break; // 沒有等待寫管道的進程 if (!pipe->waiting_writers) { // 有等待寫管道的進程 /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then * we can wait for that data without violating POSIX. 
*/ if (ret) break; // 已經讀取到部分數據,直接返回已讀取的數據 if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; // 非阻塞狀態,且沒有讀到數據,返回-EAGAIN,稍微再試 break; } } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); // 喚醒等待任務鏈表 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(pipe); // 前面管道寫端已關閉會返回,還有數據會繼續讀,沒有已讀取到數據且沒有等待寫管道的進程會返回,非阻塞模式會返回,因此阻塞模式下沒有讀到數據且管道寫端沒有關閉纔會執行這條語句(等待其他進程往管道寫數據),將但前task添加到寫鏈表,切換到其他進程 } __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); return ret; }