本文章源碼基於kernel 5.5版本,主要分析epoll在kernel的實現原理,主要源碼在 kernel/fs/eventpoll.c。
目錄
一,關鍵結構體
關鍵結構體主要有以下2個需要留意,裏邊包含的成員用來幹嘛同學們看代碼應該看得懂,在此只是簡單提一下。
struct eventpoll ,struct epitem,這邊需要留意的主要就這2個結構體。每調用一次epoll_create()就會創建一個eventpoll(一般每個進程只調一次,所以通常是每個進程一個),裏邊有個就緒鏈表rdllist用來存已經就緒的epitem,而epitem裏邊存的就是所監聽的描述符的信息,包括,fd,file,events等。
eventpoll裏邊是用一顆紅黑樹來存epitem的,所以查找,插入,刪除的最壞時間複雜度是O(log(n))。
二,epoll啓動
源碼位置 /fs/eventpoll.c
從這開始,
fs_initcall(eventpoll_init);
/*
 * One-time epoll initialisation, registered through fs_initcall() above.
 * Derives max_user_watches from low memory, initialises poll_loop_ncalls,
 * and creates the slab caches for struct epitem and struct eppoll_entry.
 * Always returns 0.
 */
static int __init eventpoll_init(void)
{
struct sysinfo si;
si_meminfo(&si);// fetch system memory information
/*
* Allows top 4% of lomem to be allocated for epoll watches (per user).
*/
// upper bound on the number of watches; as the comment above says, the
// limit is per user (not per process)
max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
EP_ITEM_COST;
// EP_ITEM_COST: every registered fd consumes one struct epitem and
// one struct eppoll_entry
BUG_ON(max_user_watches < 0);
/*
* Initialize the structure used to perform epoll file descriptor
* inclusion loops checks.
*/
ep_nested_calls_init(&poll_loop_ncalls);// initialise the loop-check bookkeeping
/*
* We can have many thousands of epitems, so prevent this from
* using an extra cache line on 64-bit (and smaller) CPUs
*/
BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);
// create the slab caches that back struct epitem and struct eppoll_entry
/* Allocates slab cache used to allocate "struct epitem" items */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
return 0;
}
好了,初始化完畢,接下來看 epoll_create
三,epoll_create
/*
 * epoll_create system call entry point.
 * Rejects size <= 0 with -EINVAL; otherwise the size value itself is not
 * used and the work is delegated to do_epoll_create() with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
if (size <= 0)
return -EINVAL;
return do_epoll_create(0);
}
用SYSCALL_DEFINE定義了系統調用接口epoll_create,只是對size做了個兼容性的判斷,
具體工作在 do_epoll_create裏邊,接着看
/*
* Open an eventpoll file descriptor.
*
* Allocates a struct eventpoll, reserves an unused fd, creates an anonymous
* "[eventpoll]" file whose private data is the eventpoll, and installs the
* file on the fd. Returns the new fd, or a negative error code; on failure
* the fd and the eventpoll acquired so far are released.
*/
static int do_epoll_create(int flags)
{
int error, fd;
struct eventpoll *ep = NULL;
struct file *file;
/* Check the EPOLL_* constant for consistency. */
BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
if (flags & ~EPOLL_CLOEXEC)
return -EINVAL;
/*
* Create the internal data structure ("struct eventpoll").
*/
error = ep_alloc(&ep);// allocate the eventpoll for this instance (typically one per process, since a process usually calls epoll_create once)
if (error < 0)
return error;
/*
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
// reserve an unused file descriptor
fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
if (fd < 0) {
error = fd;
goto out_free_ep;
}
// create a file instance and pass ep as its private data, so the eventpoll
// can later be recovered from the file; eventpoll_fops is registered as
// the file's operation table
file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
O_RDWR | (flags & O_CLOEXEC));
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_fd;
}
// store the file in ep as well, so the file can be reached from ep;
// the eventpoll and its file now reference each other. After epoll_create
// the caller owns one eventpoll, one fd and one file.
ep->file = file;
// bind fd to file. NOTE(review): the article's author has not studied the
// VFS side; presumably operations on this fd are then routed to the
// handlers in eventpoll_fops (e.g. llseek -> noop_llseek) — confirm.
fd_install(fd, file);
return fd;// hand the fd back to the user process
out_free_fd:
put_unused_fd(fd);
out_free_ep:
ep_free(ep);
return error;
}
這邊可以順便看下 創建eventpoll時做了什麼事情,
/*
 * Allocate and initialise a struct eventpoll.
 * On success stores the new object in *pep and returns 0. If the allocation
 * fails, drops the user reference taken here and returns -ENOMEM.
 */
static int ep_alloc(struct eventpoll **pep)
{
int error;
struct user_struct *user;
struct eventpoll *ep;
user = get_current_user();// take a reference to the current user's info
error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);// zero-initialised allocation
if (unlikely(!ep))
goto free_uid;
// lock initialisation
mutex_init(&ep->mtx);
rwlock_init(&ep->lock);
// wait queue initialisation
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
// ready list initialisation (this is a plain list, not a wait queue)
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;// empty red-black tree; holds the info for the monitored fds
ep->ovflist = EP_UNACTIVE_PTR;
ep->user = user;// remember the current user in the eventpoll
*pep = ep;
return 0;
free_uid:
free_uid(user);
return error;
}
好了,epoll_create 看完了,接着看epoll_ctl,這邊咱們沿着添加描述符的主線去看,
即 EPOLL_CTL_ADD 選項的行爲。
四,epoll_ctl
同樣的,epoll_ctl用SYSCALL_DEFINE定義爲系統調用,
/*
 * epoll_ctl system call entry point (excerpt; only the EPOLL_CTL_ADD path
 * is shown, elided parts are marked with "//......").
 * Validates epfd and the target fd, recovers the eventpoll from epfd's file
 * and, for EPOLL_CTL_ADD, inserts a new epitem for the target.
 * NOTE(review): the error_return / error_fput / error_tgt_fput labels and
 * the locking mentioned in the comments live in the elided parts.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
// copy the caller's epoll_event from user space; it carries the event
// types to watch (read, write, ...)
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
f = fdget(epfd);// look up the file behind the fd that epoll_create returned to user space
if (!f.file)
goto error_return;
/* Get the "struct file *" for the target file */
tf = fdget(fd); // look up the file behind the target (e.g. socket) fd
if (!tf.file)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
// bail out if the target file does not implement poll; that poll hook is
// what ep_insert() invokes later through ep_item_poll()
if (!file_can_poll(tf.file))
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
* the user passed to us _is_ an eventpoll file. And also we do not permit
* adding an epoll file descriptor inside itself.
*/
// reject adding the epoll fd to itself, and reject an epfd that is not an epoll file
error = -EINVAL;
if (f.file == tf.file || !is_file_epoll(f.file))
goto error_tgt_fput;
//......
/*
* At this point it is safe to assume that the "private_data" contains
* our own data structure.
*/
ep = f.file->private_data;// fetch the eventpoll created by epoll_create,
// i.e. user space passes epfd down, the fd yields the file, and the file
// yields the eventpoll that was stored in it at creation time
//......
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
* ep_find() till we release the mutex.
*/
epi = ep_find(ep, tf.file, fd);// search this eventpoll's red-black tree for the epitem of (file, fd); normally NULL on the first EPOLL_CTL_ADD
error = -EINVAL;
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= EPOLLERR | EPOLLHUP;
// create the epitem, which records the target's fd and file plus the
// eventpoll, and insert it into the red-black tree
error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
clear_tfile_check_list();
break;
//......
}
//......
return error;
}
//所以其實epoll_ctl 添加監聽描述符時,主要做的工作是創建epitem,裏邊保存了該socket的fd,file,以及該進程的eventpoll。
(對於eventpoll,我在想有必要每個epitem都保存一個嗎?如果fd比較多的話,加上64位平臺,每個fd多用8字節內存,對於kernel
的內存還是有一定的開銷的,畢竟咱們每個進程一般只創建一個eventpoll,並且eventpoll指針也已經裝在epfd對應的file裏邊了呀(就是
epoll_create創建時裝的)所以我覺得要用的時候可以通過epfd獲取file再通過file->private_data獲取就可以了呢。
不過後面會看到,ep_poll_callback()是在socket那邊喚醒等待隊列時被回調的,當時手上只有wait節點,並沒有epfd,只能通過wait找到epitem,再由epi->ep找到eventpoll,所以這個指針其實是必需的)
接下來看下如何創建epitem並插入紅黑樹的,以及在這裏邊幹了什麼
/*
 * Create an epitem for (tfile, fd), hook it onto the target file's poll
 * wait queue, insert it into ep's red-black tree and, if the target already
 * has events pending, queue it on the ready list and wake waiters.
 * Excerpt: elided parts are marked with "//......".
 * NOTE(review): the error_create_wakeup_source label and the success-path
 * return are in the elided parts; `error` is not assigned on the path shown.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, pwake = 0;
__poll_t revents;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
lockdep_assert_irqs_enabled();// lockdep debugging assertion that interrupts are enabled here; NOTE(review): presumably compiles to nothing when lockdep checking is not configured — confirm
// number of watches currently charged to this user (ep->user), not to the process
user_watches = atomic_long_read(&ep->user->epoll_watches);
// the user is already at the maximum number of watches
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
// allocate an epitem from the slab cache created at init time
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
// epitem initialisation
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;// back-pointer to the owning eventpoll
ep_set_ffd(&epi->ffd, tfile, fd);// record the target's file and fd
epi->event = *event;// record the requested event types
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback */
epq.epi = epi;
// register the queue callback: ep_ptable_queue_proc is stored in epq.pt._qproc
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
* Attach the item to the poll hooks and get current event bits.
* We can safely use the file* here because its usage count has
* been increased by the caller of this function. Note that after
* this operation completes, the poll callback can start hitting
* the new item.
*/
revents = ep_item_poll(epi, &epq.pt, 1);// calls the target file's poll, which in turn calls back ep_ptable_queue_proc() through epq.pt
//......
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
// append epi->fllink to the tail of the target file's f_ep_links list
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
* Add the current item to the RB tree. All RB tree operations are
* protected by "mtx", and ep_insert() is called with "mtx" held.
*/
// insert the new epitem into this eventpoll's red-black tree
ep_rbtree_insert(ep, epi);
//......
/* If the file is already "ready" we drop it inside the ready list */
// if the target already has events pending, queue the item on the
// eventpoll's ready list right away and wake the tasks waiting on ep->wq
// (ep_poll() queues the epoll_wait caller there). This wake-up does not go
// through ep_poll_callback(); that callback sits on the target file's wait
// queue instead.
if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */// TODO (article author): look into what this wake-up does
if (waitqueue_active(&ep->wq))
wake_up(&ep->wq);// TODO (article author)
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
// NOTE(review): the matching write_lock_irq(&ep->lock) is not visible in
// this excerpt; presumably it is in the elided part above — confirm
write_unlock_irq(&ep->lock);
// account one more watch to the user
atomic_long_inc(&ep->user->epoll_watches);
return error;
}
該函數主要是創建一個epitem,調用socket那邊的poll接口,然後socket那邊會回調ep_ptable_queue_proc(),
裝上eventpoll,socket的fd和file,然後插入紅黑樹,
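若監聽的fd已經有事件的話,直接將該fd對應的epitem掛到eventpoll的ready list(rdllist)裏邊,然後通過wake_up(&ep->wq)喚醒正睡在epoll_wait裏的進程(注意這裏並不是調用ep_poll_callback(),ep_poll_callback()是掛在socket的等待隊列上、由socket那邊喚醒時才回調的)。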
好的,接下來看下 ep_item_poll()和 ep_ptable_queue_proc(),ep_poll_callback()等到分析socket那邊產生數據後
通知eventpoll這邊時再分析。
先看 ep_item_poll()
/*
 * Poll the file behind an epitem and return the ready events masked by the
 * events the user asked for.
 * NOTE(review): only the branch for non-epoll target files is shown; the
 * path for nested epoll files (which would use ep/locked/depth) is not part
 * of this excerpt.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
int depth)
{
struct eventpoll *ep;
bool locked;
pt->_key = epi->event.events;// event mask requested from user space (read/write/...)
if (!is_file_epoll(epi->ffd.file))// branch taken when the target is not itself an epoll file (e.g. a socket)
return vfs_poll(epi->ffd.file, pt) & epi->event.events;// continue into vfs_poll()
}
看下vfs_poll() ,在 include/linux/poll.h 裏邊
/*
 * Invoke the file's poll operation.
 * Returns DEFAULT_POLLMASK when the file has no poll hook, otherwise the
 * mask returned by file->f_op->poll().
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
if (unlikely(!file->f_op->poll))
return DEFAULT_POLLMASK;
return file->f_op->poll(file, pt);// dispatch to the file's own poll; for a socket the article follows sock_poll and then the TCP implementation
}
接下來就到了socket的poll這邊了,在 net/socket.c裏邊
// Socket-level poll hook (excerpt). Remember that `wait` carries eventpoll's
// queue callback ep_ptable_queue_proc().
// NOTE(review): `flag` is computed in the elided part.
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
struct socket *sock = file->private_data;
//......
return sock->ops->poll(file, sock, wait) | flag;
}
這邊轉而調用tcp那邊的poll,即 tcp_poll(),那就接着看,源碼在 net/ipv4/tcp.c裏邊,
/*
 * TCP poll implementation (excerpt). The part relevant to epoll is the
 * sock_poll_wait() call, which forwards the poll_table; the computation of
 * `mask` is elided.
 */
__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
__poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
int state;
sock_poll_wait(file, sock, wait);// the key call for this walkthrough; followed below
//......
return mask;
}
/*
 * Unless poll_does_not_wait(p) says otherwise, pass the socket's wait queue
 * (sock->wq.wait) to poll_wait() (excerpt).
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
{
if (!poll_does_not_wait(p)) {
// note sock->wq.wait: this is the queue that gets woken when the socket has events
poll_wait(filp, &sock->wq.wait, p); // followed below
//......
}
}
源碼位置 /include/linux/poll.h
/*
 * Call the poll_table's queue callback with the file and wait queue head,
 * provided the table, its callback and the wait queue are all non-NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p);// for epoll this calls back ep_ptable_queue_proc(),
// passing it the socket's file and the socket's wait queue
}
好了,接下來看下 回到 eventpoll.c 看 ep_ptable_queue_proc()
/*
 * Queue callback installed by ep_insert(): allocates an eppoll_entry, sets
 * ep_poll_callback as its wake function, adds it to the target's wait queue
 * `whead`, and links it on the epitem's pwqlist. If the epitem is already
 * in the error state (nwait < 0) or the allocation fails, nwait is set to -1.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,// whead is supplied by the socket side
poll_table *pt)
{
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
// allocate an eppoll_entry for this target fd
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
// register the wake-up function: ep_poll_callback performs the wake-up
// in place of default_wake_function()
//
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);// ep_poll_callback becomes the wake function of eppoll_entry::wait;
// when the socket has data it wakes whead via __wake_up(), which for this
// pwq->wait entry runs ep_poll_callback; the callback then wakes the user
// task blocked in epoll_wait
pwq->whead = whead;// remember the socket's wait queue head
pwq->base = epi;
if (epi->event.events & EPOLLEXCLUSIVE)
add_wait_queue_exclusive(whead, &pwq->wait);
else
add_wait_queue(whead, &pwq->wait); // add the eppoll_entry's wait node to the socket's wait queue
list_add_tail(&pwq->llink, &epi->pwqlist);// link the eppoll_entry on the epitem's pwqlist so it can be found from the epitem later
epi->nwait++;
} else {
/* We have to signal that an error occurred */
epi->nwait = -1;
}
}
這邊主要做的工作其實就是創建一個eppoll_entry,裝上ep_poll_callback()函數,然後註冊到socket的事件等待隊列裏邊,
後面socket那邊有事件後就會喚醒這個隊列的所有節點。
然後將eppoll_entry添加到epitem裏邊,TODO 爲何要添加目前還沒有理清楚,後面得看下(應該是爲了在移除epitem時,ep_unregister_pollwait()能遍歷epi->pwqlist,把這些eppoll_entry從socket的等待隊列上摘掉並釋放)
好了,至此,epoll_ctl分析完了,接下來看epoll_wait
五, epoll_wait
同樣的,epoll_wait用SYSCALL_DEFINE定義爲系統調用,
/* epoll_wait system call entry point: forwards all arguments to do_epoll_wait(). */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
return do_epoll_wait(epfd, events, maxevents, timeout);
}
//接着看 do_epoll_wait(),
/*
 * Resolve epfd to its eventpoll and wait for events via ep_poll() (excerpt).
 * Returns -EBADF when epfd has no file.
 * NOTE(review): argument validation, the final return and the release of
 * `f` are in the elided parts.
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct fd f;
struct eventpoll *ep;
//......
/* Get the "struct file *" for the eventpoll file */
f = fdget(epfd);// look up the eventpoll's file through epfd
if (!f.file)
return -EBADF;
//......
ep = f.file->private_data;
// the eventpoll stored in file->private_data by epoll_create
/* Time to fish for events ... */
// followed below
error = ep_poll(ep, events, maxevents, timeout);
//......
}
好接下來看ep_poll(),
/*
 * Core of epoll_wait (excerpt): returns events that are already available,
 * otherwise queues the current task on ep->wq and sleeps until events
 * arrive or the timeout expires. Returns what ep_send_events() produced,
 * or 0 when nothing was delivered.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
// a positive timeout is converted to an absolute expiry time
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
slack = select_estimate_accuracy(&end_time);
to = &expires;
*to = timespec64_to_ktime(end_time);
} else if (timeout == 0) {
timed_out = 1;
write_lock_irq(&ep->lock);
eavail = ep_events_available(ep);// check whether events are already available (per the article: the eventpoll's ready list is non-empty)
write_unlock_irq(&ep->lock);
goto send_events;
// timeout == 0 is the non-blocking case: jump straight to send_events and
// deliver whatever is ready. Because timed_out is already 1, the check at
// send_events never jumps back to fetch_events, so this path never sleeps.
}
fetch_events:
//......
/*
* We don't have any available event to return to the caller. We need
* to sleep here, and we will be woken by ep_poll_callback() when events
* become available.
*/
// no events yet: go to sleep; when the socket side later calls back
// ep_poll_callback(), that callback wakes this task
if (!waiter) {
waiter = true;
init_waitqueue_entry(&wait, current);// wait entry for the current task
spin_lock_irq(&ep->wq.lock);
__add_wait_queue_exclusive(&ep->wq, &wait);// queue the wait entry on the eventpoll's wq
spin_unlock_irq(&ep->wq.lock);
}
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);// mark the task TASK_INTERRUPTIBLE before re-checking
//.......
eavail = ep_events_available(ep);
if (eavail)// events became available: no need to sleep, leave the loop
break;
//.......
// schedule out here; after waking up, control returns to the top of the
// loop and leaves through the `if (eavail)` check
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
timed_out = 1;// a zero return is treated as timeout: leave the loop
break;
}
}
__set_current_state(TASK_RUNNING);// awake again: back to running state
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
* more luck.
*/
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)// ep_send_events copies the events and user data out to user space
goto fetch_events;
if (waiter) {
spin_lock_irq(&ep->wq.lock);
__remove_wait_queue(&ep->wq, &wait);// take the wait entry off ep->wq
spin_unlock_irq(&ep->wq.lock);
}
return res;
}
這邊所做的事情主要是,先判斷有沒有事件已經就緒(判斷rdllist是否爲空),若有就緒則直接將
事件和fd返回給用戶空間,沒有就緒的話先把自己調度出去,然後等待socket那邊有事件後通過
回調 ep_poll_callback()來喚醒。
好的,至此,在咱們看ep_send_events()如何將事件給到用戶空間之前,
先看下socket那邊有事件的行爲,如何回調ep_poll_callback(),以及ep_poll_callback()做了什麼事情,
之後再回過頭來看ep_send_events()。socket那邊咱們就挑tcp ipv4的來看吧,
對於一個tcp socket,比如當網卡那邊有數據來臨時,觸發中斷,經過鏈路層,ip層報頭處理後,到了tcp這邊,報頭處理後,
會調用sock_def_readable(),咱們直接看下這個函數,
// Source location: net/core/sock.c
// Wakes the waiters on the socket's wait queue with the "readable" poll
// keys and sends the async notification. Per the surrounding article, the
// TCP receive path calls this once incoming data has been processed.
static void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
// wake the entries on this socket's wait queue, i.e. whoever is watching
// the socket; for epoll these are the entries added by epoll_ctl() through
// ep_ptable_queue_proc()
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
對於 wake_up_interruptible_sync_poll(),具體實現在 /kernel/sched/wait.c __wake_up_common()裏邊,
若隊列裏邊的節點有實現自己的喚醒函數,先回調該函數,再由該函數來執行喚醒工作,
若沒有實現的話,使用默認的kernel/sched/core.c default_wake_function(),
對於eventpoll,自己實現了咱們提了好多次的ep_poll_callback(),好了,接下來看下這個函數實現,
/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
*
* In the excerpt shown: queues the epitem on the eventpoll's ready list if
* it is not already linked, wakes ep->wq if it has waiters, and returns
* ewake, which is set only for EPOLLEXCLUSIVE items whose requested events
* match the reported ones. Elided parts are marked with "//......".
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
struct epitem *epi = ep_item_from_wait(wait);// recover the epitem from the wait entry
struct eventpoll *ep = epi->ep;// and from the epitem, the owning eventpoll
__poll_t pollflags = key_to_poll(key);
unsigned long flags;
int ewake = 0;
//.......
/* If this file is already in the ready list we exit soon */
// append epitem::rdllink to the eventpoll's rdllist
if (!ep_is_linked(epi) &&
list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
ep_pm_stay_awake_rcu(epi);
}
/*
* Wake up ( if active ) both the eventpoll wait list and the ->poll()
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!(pollflags & POLLFREE)) {
switch (pollflags & EPOLLINOUT_BITS) {
case EPOLLIN:
if (epi->event.events & EPOLLIN)
ewake = 1;
break;
case EPOLLOUT:
if (epi->event.events & EPOLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up(&ep->wq);// wake the task(s) that ep_poll() queued on ep->wq
}
//......
return ewake;
}
該函數主要做了2個工作,將該socket對應的epitem添加到eventpoll的rdllist裏邊,
若對應的用戶進程此時還在睡,通過wake_up() 喚醒,
好了,此時eventpoll::rdllist裏邊已經保存了該socket的事件信息,用戶進程(準確的說該進程此時還是內核態)也喚醒了,
咱們可以回到ep_poll()了,這邊接下來就到了 ep_send_events(),
繼續看,
/*
 * Deliver ready events to user space: packs the user buffer and its
 * capacity into an ep_send_events_data and runs ep_send_events_proc over
 * the ready list through ep_scan_ready_list(). Returns esed.res.
 */
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct ep_send_events_data esed;
esed.maxevents = maxevents;
esed.events = events;// remember the user-space epoll_event buffer address
ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);// followed below
return esed.res;
}
接下來看ep_scan_ready_list()
/**
* ep_scan_ready_list - Scans the ready list in a way that makes possible for
* the scan code, to call f_op->poll(). Also allows for
* O(NumReady) performance.
*
* In the excerpt shown: moves the whole ready list onto a private list
* under ep->lock, sets ep->ovflist to NULL, then calls @sproc on the
* private list with @priv and returns its result. Elided parts are marked
* with "//......".
*/
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
__poll_t (*sproc)(struct eventpoll *,
struct list_head *, void *),
void *priv, int depth, bool ep_locked)
// when called from ep_send_events(), priv carries the user-supplied events buffer
{
__poll_t res;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
//......
/*
* Steal the ready list, and re-init the original one to the
* empty list. Also, set ep->ovflist to NULL so that events
* happening while looping w/out locks, are not lost. We cannot
* have the poll callback to queue directly on ep->rdllist,
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
// move the contents of rdllist onto txlist and reset rdllist to empty
write_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, &txlist);
WRITE_ONCE(ep->ovflist, NULL);
write_unlock_irq(&ep->lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);// the key step; from ep_send_events() this is ep_send_events_proc(ep, &txlist, priv)
//......
return res;
}
看下 ep_send_events_proc()
/*
 * sproc callback used by ep_send_events() (excerpt; elided parts are marked
 * with "//......"). Walks the list of ready items passed in `head` and
 * copies each item's event mask and user data to the user-space buffer in
 * the ep_send_events_data passed as `priv`, stopping once maxevents entries
 * have been produced. If a copy to user space fails, the item is re-added
 * to `head` and, when nothing has been delivered yet, esed->res is set to
 * -EFAULT. Always returns 0; the event count is reported through esed->res.
 */
static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
void *priv)
{
struct ep_send_events_data *esed = priv;// recover the ep_send_events_data, which holds the user-space epoll_event buffer address
__poll_t revents;
struct epitem *epi, *tmp;
struct epoll_event __user *uevent = esed->events;// user-space epoll_event cursor; the rest of the function works on it
struct wakeup_source *ws;
poll_table pt;
//.......
// walk the whole list of ready items handed in by the caller
list_for_each_entry_safe(epi, tmp, head, rdllink) {
if (esed->res >= esed->maxevents)
break;
//.......
// copy the ready events and the caller's epoll_data to the user-space epoll_event
if (__put_user(revents, &uevent->events) ||
__put_user(epi->event.data, &uevent->data)) {
// copy failed: re-add the item to head (it is presumably unlinked in the
// elided part above) and keep its wakeup source active
list_add(&epi->rdllink, head);
ep_pm_stay_awake(epi);
if (!esed->res)
esed->res = -EFAULT;
return 0;
}
//......
}
return 0;
}
這邊主要做的事情就是掃描整個就緒鏈表(不是此時的eventpoll::rdllist,該list已經是空的了,在ep_scan_ready_list()裏邊置換給txlist
後傳進來),然後將事件和用戶在epoll_ctl時設置的data(epoll_data,一般裏邊放的就是fd)裝到用戶空間的epoll_event裏邊,然後返回。
好了,到了這邊,就返回到用戶空間(epoll_wait返回),給應用層去處理了。