11.2 epoll系統調用

在完成listen系統調用後，作爲TCP Server的進程就可以等待接受連接請求了。當請求到來時，進程需要調用accept系統調用生成一個新的socket，並用之與客戶端傳輸數據。這時進程需要管理的socket有兩類：1）等待請求到來並與之建立連接的socket；2）已經與客戶端建立的一對一的連接並與之進行數據傳輸的socket。當這些socket的數量很多時，如何及時獲知哪些socket有可讀、可寫等I/O事件通告到來並對其進行處理，就成了會極大影響TCP Server性能的關鍵問題。

11.2.1 epoll模型

Linux epoll是一個效率很高的I/O事件通告機制。下面說明一下TCP是如何使用epoll完成對I/O事件的監控的。使用epoll的模型示例：

/*
 * Skeleton of a TCP server event loop driven by epoll.
 *
 * The listening socket is registered with the epoll instance first; every
 * connection returned by accept() is then registered as well, so a single
 * epoll_wait() call watches both kinds of socket.  Lines consisting of
 * "..." stand for code left out of the sketch (address setup, the
 * buf/data buffers, error handling), so this is not compilable as-is.
 *
 * Returns 0.
 */
int main(void)
{
    struct epoll_event  ev = {0};
    struct epoll_event  events[20] = {0};
    int                 epfd;
    int                 fd;
    int                 listenfd;
    int                 sockfd;
    int                 nfds;
    int                 i;
    int                 rfd;
    ssize_t             rlen;
    ssize_t             wlen;

    listenfd = socket(AF_INET, SOCK_STREAM, 0);
    ...
    bind(listenfd, serveraddr, serveraddrlen);
    ...
    listen(listenfd, 10);
    ...

    epfd = epoll_create(256);

    ev.data.fd = listenfd;
    ev.events = EPOLLIN;
    epoll_ctl(epfd, EPOLL_CTL_ADD, listenfd, &ev); // watch I/O events on listenfd

    while (1) {
        // Wait for event notifications: the process sleeps (using no CPU)
        // while there are none and is woken up to handle them when they arrive.
        nfds = epoll_wait(epfd, events, 20, -1);
        for (i = 0; i < nfds; ++i) {
            if (events[i].events & EPOLLIN) {
                if ((sockfd = events[i].data.fd) < 0) {
                    continue;
                }

                if (sockfd == listenfd) { // readable event on a type-1 socket: a new connection request has arrived
                    printf("Registered vm has changed!\n");
                    sockfd = accept(listenfd, clientaddr, clientaddrlen); // accept the request, yielding a new socket descriptor
                    if (sockfd < 0) {
                        // accept failed: there is no new descriptor to register
                        continue;
                    }
                    ...
                    // Register only the newly accepted socket. Re-adding a
                    // descriptor that is already in the set fails with EEXIST.
                    ev.data.fd = sockfd;
                    ev.events = EPOLLIN|EPOLLET;
                    epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev);
                } else { // readable event on a type-2 (connected) socket
                    rlen = read(sockfd, buf, sizeof(buf));
                }
                ...
            } else if (events[i].events & EPOLLOUT) {    // writable event: sockfd can send data
                sockfd = events[i].data.fd;
                wlen = write(sockfd, data, data_len);
                if (0 <= wlen && wlen < data_len) {    // short write: ask epoll to notify us when sockfd can send again
                    ev.data.fd = sockfd;
                    ev.events = EPOLLOUT|EPOLLET;
                    epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
                }
                ...
            }
            ...
        }
    }

    ...

    return 0;
}

epoll_create用於產生一個epoll的文件描述符，一個epoll文件描述符對應一個文件描述符集合。

epoll_ctl用於控制這個集合中的成員（加入、刪除、變更定製事件等）。在內核中，epoll_ctl會將新的fd加入到一棵紅黑樹中加以管理。

epoll_wait用於等待集合中成員的I/O事件發生；如果所有成員都沒有I/O事件，則保持進程的睡眠狀態；否則，進程會被喚醒，epoll_wait會返回所有發生的事件的信息。下面重點研究epoll是如何使進程睡眠，在有I/O事件時內核又如何喚醒進程的。

11.2.2 epoll_ctl內核代碼

epoll_ctl內核代碼如下：

/*
 * epoll_ctl system call: resolves epfd and fd to their struct file, checks
 * that the target supports poll and that epfd really is an eventpoll file,
 * then performs the add/delete/modify operation selected by op while
 * holding ep->mtx.  Returns the operation's result or a negative errno.
 * (Kernel lines 1837-1849 are elided from this listing.)
 */
1788 SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1789         struct epoll_event __user *, event)
1790 {
1791     int error;
1792     int did_lock_epmutex = 0;
1793     struct file *file, *tfile;
1794     struct eventpoll *ep;
1795     struct epitem *epi;  
1796     struct epoll_event epds;
1797
1798     error = -EFAULT;     
1799     if (ep_op_has_event(op) &&
1800         copy_from_user(&epds, event, sizeof(struct epoll_event)))
1801         goto error_return;
1802
1803     /* Get the "struct file *" for the eventpoll file */
1804     error = -EBADF;      
1805     file = fget(epfd);   
1806     if (!file)
1807         goto error_return;
1808
1809     /* Get the "struct file *" for the target file */
1810     tfile = fget(fd);    
1811     if (!tfile)
1812         goto error_fput;
1813
1814     /* The target file descriptor must support poll */
1815     error = -EPERM;
1816     if (!tfile->f_op || !tfile->f_op->poll)
1817         goto error_tgt_fput;
1818
1819     /* Check if EPOLLWAKEUP is allowed */
1820     if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1821         epds.events &= ~EPOLLWAKEUP;   
1822
1823     /*
1824      * We have to check that the file structure underneath the file descriptor
1825      * the user passed to us _is_ an eventpoll file. And also we do not permit
1826      * adding an epoll file descriptor inside itself.
1827      */
1828     error = -EINVAL;
1829     if (file == tfile || !is_file_epoll(file))
1830         goto error_tgt_fput;
1831
1832     /*
1833      * At this point it is safe to assume that the "private_data" contains
1834      * our own data structure.
1835      */
1836     ep = file->private_data;
...
1850     if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1851         mutex_lock(&epmutex);
1852         did_lock_epmutex = 1;
1853     }
1854     if (op == EPOLL_CTL_ADD) {
1855         if (is_file_epoll(tfile)) {
1856             error = -ELOOP;
1857             if (ep_loop_check(ep, tfile) != 0) {
1858                 clear_tfile_check_list();
1859                 goto error_tgt_fput;
1860             }
1861         } else
1862             list_add(&tfile->f_tfile_llink, &tfile_check_list);
1863     }
1864
1865     mutex_lock_nested(&ep->mtx, 0);
1866
1867     /*
1868      * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
1869      * above, we can be sure to be able to use the item looked up by
1870      * ep_find() till we release the mutex.
1871      */
1872     epi = ep_find(ep, tfile, fd);
1873
1874     error = -EINVAL;
1875     switch (op) {
1876     case EPOLL_CTL_ADD:    //add: insert only if fd is not in the set yet, else -EEXIST
1877         if (!epi) {
1878             epds.events |= POLLERR | POLLHUP;
1879             error = ep_insert(ep, &epds, tfile, fd);
1880         } else
1881             error = -EEXIST;
1882         clear_tfile_check_list();
1883         break;
1884     case EPOLL_CTL_DEL:    //delete: fd must already be in the set, else -ENOENT
1885         if (epi)
1886             error = ep_remove(ep, epi);
1887         else
1888             error = -ENOENT;
1889         break;
1890     case EPOLL_CTL_MOD:    //modify: fd must already be in the set, else -ENOENT
1891         if (epi) {
1892             epds.events |= POLLERR | POLLHUP;
1893             error = ep_modify(ep, epi, &epds);
1894         } else
1895             error = -ENOENT;
1896         break;
1897     }
1898     mutex_unlock(&ep->mtx);
1899
1900 error_tgt_fput:
1901     if (did_lock_epmutex)
1902         mutex_unlock(&epmutex);
1903
1904     fput(tfile);
1905 error_fput:
1906     fput(file);
1907 error_return:
1908
1909     return error;
1910 }

一個socket調用epoll_ctl加入epoll的文件描述符集合時，會調用ep_insert函數：

/*
 * Adds a new item (epi) for the pair (tfile, fd) to the eventpoll ep:
 * allocates and initializes the epitem, hooks it onto the target file's
 * wait queue through the poll callback, inserts it into the RB tree and,
 * if a requested event is already pending, queues it on the ready list and
 * wakes up waiters.  Returns 0 on success or a negative errno.
 */
1231 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1232              struct file *tfile, int fd)
1233 {
1234     int error, revents, pwake = 0;
1235     unsigned long flags;
1236     long user_watches;
1237     struct epitem *epi;
1238     struct ep_pqueue epq;
1239
1240     user_watches = atomic_long_read(&ep->user->epoll_watches);
1241     if (unlikely(user_watches >= max_user_watches))
1242         return -ENOSPC;
1243     if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
1244         return -ENOMEM;
1245
1246     /* Item initialization follow here ... */
1247     INIT_LIST_HEAD(&epi->rdllink);
1248     INIT_LIST_HEAD(&epi->fllink);
1249     INIT_LIST_HEAD(&epi->pwqlist);
1250     epi->ep = ep;
1251     ep_set_ffd(&epi->ffd, tfile, fd);    //record the target's struct file pointer (tfile) and fd in epi->ffd
1252     epi->event = *event;
1253     epi->nwait = 0;
1254     epi->next = EP_UNACTIVE_PTR;
1255     if (epi->event.events & EPOLLWAKEUP) {
1256         error = ep_create_wakeup_source(epi);
1257         if (error)
1258             goto error_create_wakeup_source;
1259     } else {
1260         RCU_INIT_POINTER(epi->ws, NULL);
1261     }
1262
1263     /* Initialize the poll table using the queue callback */
1264     epq.epi = epi;
1265     init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); //point epq.pt._qproc at ep_ptable_queue_proc
1266
1267     /*
1268      * Attach the item to the poll hooks and get current event bits.
1269      * We can safely use the file* here because its usage count has
1270      * been increased by the caller of this function. Note that after
1271      * this operation completes, the poll callback can start hitting
1272      * the new item.
1273      */
1274     revents = ep_item_poll(epi, &epq.pt);  //query current I/O events through epi->ffd.file->f_op->poll; for a socket that is sock_poll, which in turn calls tcp_poll
1275
1276     /*
1277      * We have to check if something went wrong during the poll wait queue
1278      * install process. Namely an allocation for a wait queue failed due
1279      * high memory pressure.
1280      */
1281     error = -ENOMEM;
1282     if (epi->nwait < 0)
1283         goto error_unregister;
1284
1285     /* Add the current item to the list of active epoll hook for this file */
1286     spin_lock(&tfile->f_lock);
1287     list_add_tail(&epi->fllink, &tfile->f_ep_links);
1288     spin_unlock(&tfile->f_lock);
1289
1290     /*
1291      * Add the current item to the RB tree. All RB tree operations are
1292      * protected by "mtx", and ep_insert() is called with "mtx" held.
1293      */
1294     ep_rbtree_insert(ep, epi);
1295
1296     /* now check if we've created too many backpaths */
1297     error = -EINVAL;
1298     if (reverse_path_check())
1299         goto error_remove_epi;
1300
1301     /* We have to drop the new item inside our item list to keep track of it */
1302     spin_lock_irqsave(&ep->lock, flags);
1303
1304     /* If the file is already "ready" we drop it inside the ready list */
1305     if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1306         list_add_tail(&epi->rdllink, &ep->rdllist); //a requested I/O event is already pending: put epi on ep's rdllist
1307         ep_pm_stay_awake(epi);
1308
1309         /* Notify waiting tasks that events are available */
1310         if (waitqueue_active(&ep->wq))
1311             wake_up_locked(&ep->wq);    //wake up a task waiting for event notification, if there is one
1312         if (waitqueue_active(&ep->poll_wait))
1313             pwake++;
1314     }
1315
1316     spin_unlock_irqrestore(&ep->lock, flags);
1317
1318     atomic_long_inc(&ep->user->epoll_watches);
1319
1320     /* We have to call this outside the lock */
1321     if (pwake)
1322         ep_poll_safewake(&ep->poll_wait);    //wake up tasks that registered interest in I/O events of the epoll fd itself
1323
1324     return 0;
1325
1326 error_remove_epi:
1327     spin_lock(&tfile->f_lock);
1328     if (ep_is_linked(&epi->fllink))
1329         list_del_init(&epi->fllink);
1330     spin_unlock(&tfile->f_lock);
1331
1332     rb_erase(&epi->rbn, &ep->rbr);
1333
1334 error_unregister:
1335     ep_unregister_pollwait(ep, epi);
1336
1337     /*
1338      * We need to do this because an event could have been arrived on some
1339      * allocated wait queue. Note that we don't care about the ep->ovflist
1340      * list, since that is used/cleaned only inside a section bound by "mtx".
1341      * And ep_insert() is called with "mtx" held.
1342      */
1343     spin_lock_irqsave(&ep->lock, flags);
1344     if (ep_is_linked(&epi->rdllink))
1345         list_del_init(&epi->rdllink);
1346     spin_unlock_irqrestore(&ep->lock, flags);
1347
1348     wakeup_source_unregister(ep_wakeup_source(epi));
1349
1350 error_create_wakeup_source:
1351     kmem_cache_free(epi_cache, epi);
1352
1353     return error;
1354 }

tcp_poll函數：

 /*
  * Poll handler for a TCP socket: registers the caller on the socket's wait
  * queue via sock_poll_wait() and returns a bit mask of the POLL* events
  * that are currently pending.  (Kernel lines 449-476 are elided from this
  * listing.)
  */
 433 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 434 {
 435     unsigned int mask;
 436     struct sock *sk = sock->sk;
 437     const struct tcp_sock *tp = tcp_sk(sk);
 438
 439     sock_poll_wait(file, sk_sleep(sk), wait);//through the function pointer in wait this calls ep_ptable_queue_proc, installed earlier by ep_insert
 440     if (sk->sk_state == TCP_LISTEN)
 441         return inet_csk_listen_poll(sk);    //listening socket: return its readable status, i.e. whether a new connection has arrived
 442
 443     /* Socket is not locked. We are protected from async events
 444      * by poll logic and correct handling of state changes
 445      * made by other threads is impossible in any case.
 446      */
 447
 448     mask = 0;
...
 477     if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 478         mask |= POLLHUP;
 479     if (sk->sk_shutdown & RCV_SHUTDOWN)
 480         mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 481
 482     /* Connected or passive Fast Open socket? */
 483     if (sk->sk_state != TCP_SYN_SENT &&
 484         (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk != NULL)) {
 485         int target = sock_rcvlowat(sk, 0, INT_MAX);
 486
 487         if (tp->urg_seq == tp->copied_seq &&
 488             !sock_flag(sk, SOCK_URGINLINE) &&
 489             tp->urg_data)
 490             target++;
 491
 492         /* Potential race condition. If read of tp below will
 493          * escape above sk->sk_state, we can be illegally awaken
 494          * in SYN_* states. */
 495         if (tp->rcv_nxt - tp->copied_seq >= target)
 496             mask |= POLLIN | POLLRDNORM;    //unread data is available: report a readable event
 497
 498         if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 499             if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 500                 mask |= POLLOUT | POLLWRNORM; //the socket has room to write: report a writable event
 501             } else {  /* send SIGIO later */
 502                 set_bit(SOCK_ASYNC_NOSPACE,
 503                     &sk->sk_socket->flags);
 504                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 505
 506                 /* Race breaker. If space is freed after
 507                  * wspace test but before the flags are set,
 508                  * IO signal will be lost.
 509                  */
 510                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))    //the send buffer has space after all
 511                     mask |= POLLOUT | POLLWRNORM;
 512             }
 513         } else
 514             mask |= POLLOUT | POLLWRNORM;
 515
 516         if (tp->urg_data & TCP_URG_VALID)    //urgent data is available to read
 517             mask |= POLLPRI;
 518     }
 519     /* This barrier is coupled with smp_wmb() in tcp_reset() */
 520     smp_rmb();
 521     if (sk->sk_err)
 522         mask |= POLLERR; //an error has occurred on the socket
 523
 524     return mask;
 525 }

ep_ptable_queue_proc函數：

/*
 * Poll-table queue callback installed by ep_insert: allocates an
 * eppoll_entry, sets ep_poll_callback as its wake function and links it
 * onto both the target's wait queue head (whead) and the epitem's pwqlist.
 * On allocation failure (or if an earlier call already failed) it sets
 * epi->nwait to -1 so the caller can detect the error.
 */
1058 static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
1059                  poll_table *pt)
1060 {
1061     struct epitem *epi = ep_item_from_epqueue(pt);//recover the struct epitem *epi allocated in ep_insert
1062     struct eppoll_entry *pwq;
1063
1064     if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
1065         init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);//set pwq->wait.func to ep_poll_callback
1066         pwq->whead = whead;
1067         pwq->base = epi;
1068         add_wait_queue(whead, &pwq->wait);//queue pwq on whead, which is sk_sleep(sk) when called from tcp_poll
1069         list_add_tail(&pwq->llink, &epi->pwqlist); //also link pwq into epi's pwqlist
1070         epi->nwait++;    
1071     } else {
1072         /* We have to signal that an error occurred */
1073         epi->nwait = -1;
1074     }
1075 }

epoll_ctl返回後，進程會調用epoll_wait函數。

11.2.3 epoll_wait內核代碼

epoll_wait內核代碼如下：

/*
 * epoll_wait system call: validates maxevents and the user buffer, checks
 * that epfd refers to an eventpoll file, then delegates to ep_poll().
 * Returns ep_poll()'s result, or -EINVAL / -EFAULT / -EBADF when a check
 * fails.
 */
1916 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1917         int, maxevents, int, timeout)
1918 {
1919     int error;
1920     struct fd f;
1921     struct eventpoll *ep;
1922
1923     /* The maximum number of event must be greater than zero */
1924     if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1925         return -EINVAL;  
1926
1927     /* Verify that the area passed by the user is writeable */
1928     if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
1929         return -EFAULT;  
1930
1931     /* Get the "struct file *" for the eventpoll file */
1932     f = fdget(epfd);     
1933     if (!f.file)
1934         return -EBADF;   
1935
1936     /*
1937      * We have to check that the file structure underneath the fd
1938      * the user passed to us _is_ an eventpoll file.
1939      */
1940     error = -EINVAL;
1941     if (!is_file_epoll(f.file))        
1942         goto error_fput;
1943
1944     /*
1945      * At this point it is safe to assume that the "private_data" contains
1946      * our own data structure.
1947      */
1948     ep = f.file->private_data;
1949
1950     /* Time to fish for events ... */
1951     error = ep_poll(ep, events, maxevents, timeout);    //the core function: all of the waiting and event delivery happens here
1952
1953 error_fput:
1954     fdput(f);
1955     return error;
1956 }

分析核心函數ep_poll：

/*
 * Core of epoll_wait: if no events are available, puts the current task on
 * ep->wq and sleeps until events arrive, the timeout expires or a signal is
 * pending; then hands available events to ep_send_events().  A timeout of 0
 * skips the sleep entirely.  Returns ep_send_events()'s result, 0 when
 * nothing was delivered, or -EINTR when interrupted by a signal.
 */
1553 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1554            int maxevents, long timeout)
1555 {
1556     int res = 0, eavail, timed_out = 0;
1557     unsigned long flags;
1558     long slack = 0;      
1559     wait_queue_t wait;   
1560     ktime_t expires, *to = NULL;
1561
1562     if (timeout > 0) {   
1563         struct timespec end_time = ep_set_mstimeout(timeout);
1564
1565         slack = select_estimate_accuracy(&end_time);
1566         to = &expires;   
1567         *to = timespec_to_ktime(end_time);
1568     } else if (timeout == 0) {
1569         /*
1570          * Avoid the unnecessary trip to the wait queue loop, if the
1571          * caller specified a non blocking operation.
1572          */
1573         timed_out = 1;   
1574         spin_lock_irqsave(&ep->lock, flags);
1575         goto check_events;
1576     }
1577
1578 fetch_events:
1579     spin_lock_irqsave(&ep->lock, flags);
1580
1581     if (!ep_events_available(ep)) {    //ep_events_available(ep) is true when the ep->rdllist queue is not empty
1582         /*
1583          * We don't have any available event to return to the caller.
1584          * We need to sleep here, and we will be wake up by
1585          * ep_poll_callback() when events will become available.
1586          */
1587         init_waitqueue_entry(&wait, current);   
1588         __add_wait_queue_exclusive(&ep->wq, &wait);//join ep's wq wait queue and wait to be woken up
1589
1590         for (;;) {
1591             /*
1592              * We don't want to sleep if the ep_poll_callback() sends us
1593              * a wakeup in between. That's why we set the task state
1594              * to TASK_INTERRUPTIBLE before doing the checks.
1595              */
1596             set_current_state(TASK_INTERRUPTIBLE);
1597             if (ep_events_available(ep) || timed_out) //an event has been reported or the timeout expired
1598                 break;
1599             if (signal_pending(current)) {
1600                 res = -EINTR;
1601                 break;
1602             }
1603
1604             spin_unlock_irqrestore(&ep->lock, flags);
1605             if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))//schedule away here, giving up the CPU; the task runs again only on timeout or when woken up
1606                 timed_out = 1;
1607
1608             spin_lock_irqsave(&ep->lock, flags);
1609         }
1610         __remove_wait_queue(&ep->wq, &wait);
1611
1612         set_current_state(TASK_RUNNING);
1613     }
1614 check_events:
1615     /* Is it worth to try to dig for events ? */
1616     eavail = ep_events_available(ep);
1617
1618     spin_unlock_irqrestore(&ep->lock, flags);
1619
1620     /*
1621      * Try to transfer events to user space. In case we get 0 events and
1622      * there's still timeout left over, we go trying again in search of
1623      * more luck.
1624      */
1625     if (!res && eavail &&
1626         !(res = ep_send_events(ep, events, maxevents)) && !timed_out) //deliver the pending events to user space
1627         goto fetch_events;
1628
1629     return res;
1630 }

當一個socket有I/O事件到來時，以可讀事件爲例，內核會調用sk->sk_data_ready，這個指針指向sock_def_readable：

/*
 * Data-ready handler for a socket (the article notes sk->sk_data_ready
 * points here): wakes up sleepers on the socket's wait queue with a
 * readable-event key and then calls sk_wake_async().
 */
2157 static void sock_def_readable(struct sock *sk, int len)
2158 {
2159     struct socket_wq *wq;
2160
2161     rcu_read_lock();     
2162     wq = rcu_dereference(sk->sk_wq);
2163     if (wq_has_sleeper(wq)) //if some task is waiting for event notification, wake it up
2164         wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2165                         POLLRDNORM | POLLRDBAND);
2166     sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2167     rcu_read_unlock();   
2168 }

wake_up_interruptible_sync_poll封裝了__wake_up_sync_key，__wake_up_sync_key調用__wake_up_common：

/*
 * Walks the wait queue q and calls each entry's wake function.  The walk
 * stops once nr_exclusive entries flagged WQ_FLAG_EXCLUSIVE have reported
 * a successful wakeup.
 */
3159 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3160             int nr_exclusive, int wake_flags, void *key)
3161 {
3162     wait_queue_t *curr, *next;
3163
3164     list_for_each_entry_safe(curr, next, &q->task_list, task_list) { //walk every entry on the wait queue
3165         unsigned flags = curr->flags;
3166
3167         if (curr->func(curr, mode, wake_flags, key) &&
3168                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) //call each entry's wake function; for epoll this is ep_poll_callback, installed in ep_ptable_queue_proc
3169             break;   
3170     }
3171 }

ep_poll_callback函數：

/*
 * Wake function run from the target file's wait queue when an event
 * occurs: after filtering on the requested event mask it queues the epitem
 * on ep->rdllist (or chains it on ep->ovflist while events are being
 * transferred to user space) and wakes up tasks sleeping on ep->wq and
 * ep->poll_wait.  Always returns 1.
 */
969 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
 970 {
 971     int pwake = 0;
 972     unsigned long flags;
 973     struct epitem *epi = ep_item_from_wait(wait);    //recover the epi that ep_ptable_queue_proc attached to this wait entry
 974     struct eventpoll *ep = epi->ep;
 975
 976     if ((unsigned long)key & POLLFREE) {
 977         ep_pwq_from_wait(wait)->whead = NULL;
 978         /*
 979          * whead = NULL above can race with ep_remove_wait_queue()
 980          * which can do another remove_wait_queue() after us, so we
 981          * can't use __remove_wait_queue(). whead->lock is held by
 982          * the caller.   
 983          */
 984         list_del_init(&wait->task_list);
 985     }
 986
 987     spin_lock_irqsave(&ep->lock, flags);
 988
 989     /*
 990      * If the event mask does not contain any poll(2) event, we consider the
 991      * descriptor to be disabled. This condition is likely the effect of the
 992      * EPOLLONESHOT bit that disables the descriptor when an event is received,
 993      * until the next EPOLL_CTL_MOD will be issued.
 994      */
 995     if (!(epi->event.events & ~EP_PRIVATE_BITS))
 996         goto out_unlock;
 997
 998     /*
 999      * Check the events coming with the callback. At this stage, not
1000      * every device reports the events in the "key" parameter of the
1001      * callback. We need to be able to handle both cases here, hence the
1002      * test for "key" != NULL before the event match test.
1003      */
1004     if (key && !((unsigned long) key & epi->event.events))
1005         goto out_unlock;
1006
1007     /*
1008      * If we are transferring events to userspace, we can hold no locks
1009      * (because we're accessing user memory, and because of linux f_op->poll()
1010      * semantics). All the events that happen during that period of time are
1011      * chained in ep->ovflist and requeued later on.
1012      */
1013     if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
1014         if (epi->next == EP_UNACTIVE_PTR) {
1015             epi->next = ep->ovflist;
1016             ep->ovflist = epi;
1017             if (epi->ws) {
1018                 /*
1019                  * Activate ep->ws since epi->ws may get
1020                  * deactivated at any time.
1021                  */
1022                 __pm_stay_awake(ep->ws);
1023             }
1024
1025         }
1026         goto out_unlock;
1027     }
1028
1029     /* If this file is already in the ready list we exit soon */
1030     if (!ep_is_linked(&epi->rdllink)) {    //if epi is not already linked on the ready list, append it to this ep's rdllist
1031         list_add_tail(&epi->rdllink, &ep->rdllist);
1032         ep_pm_stay_awake_rcu(epi);
1033     }
1034
1035     /*
1036      * Wake up ( if active ) both the eventpoll wait list and the ->poll()
1037      * wait list.
1038      */
1039     if (waitqueue_active(&ep->wq))
1040         wake_up_locked(&ep->wq);    //wake up a task waiting for event notification, i.e. one that joined the wait queue at line 1588 of ep_poll
1041     if (waitqueue_active(&ep->poll_wait))
1042         pwake++;
1043
1044 out_unlock:
1045     spin_unlock_irqrestore(&ep->lock, flags);
1046
1047     /* We have to call this outside the lock */
1048     if (pwake)
1049         ep_poll_safewake(&ep->poll_wait);
1050
1051     return 1;
1052 }

喚醒進程後，ep_poll會調用ep_send_events將發生的事件收集到一起返回給用戶態：

/*
 * Packs the user-space events buffer and maxevents into an
 * ep_send_events_data and passes ep_send_events_proc to
 * ep_scan_ready_list(); returns whatever ep_scan_ready_list() returns.
 */
1514 static int ep_send_events(struct eventpoll *ep,
1515               struct epoll_event __user *events, int maxevents)
1516 {
1517     struct ep_send_events_data esed;
1518
1519     esed.maxevents = maxevents;
1520     esed.events = events;
1521
1522     return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1523 }

ep_scan_ready_list中會調用ep_send_events_proc函數：

/*
 * Callback handed to ep_scan_ready_list(): takes items off the ready list
 * `head`, re-polls each one and copies the resulting events to the
 * user-space array in priv (a struct ep_send_events_data), up to
 * esed->maxevents entries.  Items that are neither EPOLLONESHOT nor EPOLLET
 * are put back on ep->rdllist.  Returns the number of events copied, or
 * -EFAULT if the very first copy to user space fails.
 */
1434 static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1435                    void *priv)
1436 {
1437     struct ep_send_events_data *esed = priv;
1438     int eventcnt;
1439     unsigned int revents;
1440     struct epitem *epi;  
1441     struct epoll_event __user *uevent;
1442     struct wakeup_source *ws;
1443     poll_table pt;
1444
1445     init_poll_funcptr(&pt, NULL);
1446
1447     /*
1448      * We can loop without lock because we are passed a task private list.
1449      * Items cannot vanish during the loop because ep_scan_ready_list() is
1450      * holding "mtx" during this call.
1451      */
1452     for (eventcnt = 0, uevent = esed->events;
1453          !list_empty(head) && eventcnt < esed->maxevents;) {//walk the queue of items whose events have fired
1454         epi = list_first_entry(head, struct epitem, rdllink);
1455
1456         /*
1457          * Activate ep->ws before deactivating epi->ws to prevent
1458          * triggering auto-suspend here (in case we reactive epi->ws
1459          * below).
1460          *
1461          * This could be rearranged to delay the deactivation of epi->ws
1462          * instead, but then epi->ws would temporarily be out of sync
1463          * with ep_is_linked().
1464          */
1465         ws = ep_wakeup_source(epi);
1466         if (ws) {
1467             if (ws->active)
1468                 __pm_stay_awake(ep->ws);
1469             __pm_relax(ws);
1470         }
1471
1472         list_del_init(&epi->rdllink);
1473
1474         revents = ep_item_poll(epi, &pt);//call the poll function set for the socket (tcp_poll) to check whether an event the user cares about is pending
1475
1476         /*
1477          * If the event mask intersect the caller-requested one,
1478          * deliver the event to userspace. Again, ep_scan_ready_list()
1479          * is holding "mtx", so no operations coming from userspace
1480          * can change the item.
1481          */       
1482         if (revents) {//an event of interest is pending: copy the event information back to the user-space epoll_event array
1483             if (__put_user(revents, &uevent->events) ||
1484                 __put_user(epi->event.data, &uevent->data)) {
1485                 list_add(&epi->rdllink, head);
1486                 ep_pm_stay_awake(epi);
1487                 return eventcnt ? eventcnt : -EFAULT;
1488             }
1489             eventcnt++;
1490             uevent++;
1491             if (epi->event.events & EPOLLONESHOT)
1492                 epi->event.events &= EP_PRIVATE_BITS;
1493             else if (!(epi->event.events & EPOLLET)) {
1494                 /*
1495                  * If this file has been added with Level
1496                  * Trigger mode, we need to insert back inside
1497                  * the ready list, so that the next call to
1498                  * epoll_wait() will check again the events
1499                  * availability. At this point, no one can insert
1500                  * into ep->rdllist besides us. The epoll_ctl()
1501                  * callers are locked out by
1502                  * ep_scan_ready_list() holding "mtx" and the
1503                  * poll callback will queue them in ep->ovflist.
1504                  */
1505                 list_add_tail(&epi->rdllink, &ep->rdllist);
1506                 ep_pm_stay_awake(epi);
1507             }
1508         }
1509     }
1510
1511     return eventcnt;
1512 }

epoll_wait返回後，進程就可以遍歷epoll_event數組來查詢發生了哪些事件。

wake_up_locked封裝了__wake_up_locked函數，__wake_up_locked封裝了__wake_up_common函數。這次在__wake_up_common函數中curr->func指向的函數是在ep_poll的1587行調用的init_waitqueue_entry中設置的default_wake_function，它會調用try_to_wake_up函數喚醒進程。在__wake_up_common中遍歷ep->wq的等待隊列時，第一個節點的喚醒函數執行完畢後就會退出循環，以避免驚羣效應。這時，同時等待一個epfd中I/O事件的進程中會有一個被喚醒，執行相應的操作，然後繼續調用epoll_wait等待事件通告，直到再次被喚醒。

綜上所述，事件通知的函數調用過程爲：

sock_def_readable->wake_up_interruptible_sync_poll->__wake_up_sync_key->__wake_up_common->
ep_poll_callback->wake_up_locked->__wake_up_locked->__wake_up_common->default_wake_function->try_to_wake_up

整個事件通知機制中，epoll相當於一個聯絡員，它監聽各個socket；socket有事件發生時通知epoll，epoll再去喚醒關心socket事件的進程。當有多於一個進程關心socket的I/O事件時，它們會在epoll中排隊，而不是在socket的等待隊列中排隊。有事件通告時epoll只喚醒隊列中的一個進程，其它進程會在下次事件到來時被喚醒。

Remy1119

發佈了79 篇原創文章 · 獲贊 46 · 訪問量 22萬+

私信關注

11.2 epoll系統調用

11.2.1 epoll模型

11.2.2 epoll_ctl內核代碼

11.2.3 epoll_wait內核代碼

【安裝部署】Apache SeaTunnel 和 Web快速安裝詳解

一個.NET開源的功能豐富、靈活易用的 Windows 窗口增強神器

11.3 TCP內核同步

10.2 發送緩存管理

9.6 堅持（Persist）定時器

OpenSSL-Async mode

12.2 擁塞控制簡介

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結