從epoll源碼分析它的使用

首先來看看epoll_create的真身

/*
 * epoll_create(2): legacy entry point.
 *
 * The size argument is only checked for being positive; its value is not
 * forwarded anywhere. The real work is delegated to sys_epoll_create1()
 * with no flags.
 *
 * Returns whatever sys_epoll_create1(0) returns, or -EINVAL when
 * size <= 0.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
    /* Any positive size is accepted and then ignored. */
    if (size > 0) {
        return sys_epoll_create1(0);
    }

    return -EINVAL;
}

再來看看epoll_create1的真身

/*
 * epoll_create1(2): create a new epoll instance.
 *
 * Allocates an eventpoll via ep_alloc(), reserves an unused descriptor,
 * creates an "[eventpoll]" anonymous-inode file carrying the eventpoll,
 * and installs that file on the descriptor.
 *
 * Returns the new descriptor on success, or a negative errno value:
 * -EINVAL for any flag other than EPOLL_CLOEXEC, otherwise the error
 * reported by the step that failed.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    struct eventpoll *ep = NULL;
    struct file *file;
    int f_flags;
    int fd;
    int error;

    /* The epoll flag is reused directly as an open flag below. */
    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

    /* EPOLL_CLOEXEC is the only flag accepted. */
    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;

    error = ep_alloc(&ep);
    if (error < 0)
        return error;

    /* Same open flags are used for both the descriptor and the file. */
    f_flags = O_RDWR | (flags & O_CLOEXEC);

    fd = get_unused_fd_flags(f_flags);
    if (fd < 0) {
        error = fd;
        goto err_free_ep;
    }

    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, f_flags);
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto err_put_fd;
    }

    /* Record the backing file in the eventpoll, then publish the fd. */
    ep->file = file;
    fd_install(fd, file);
    return fd;

    /* Unwind in reverse order of acquisition. */
err_put_fd:
    put_unused_fd(fd);
err_free_ep:
    ep_free(ep);
    return error;
}

1. 對epoll來講，目前唯一有效的flag只有EPOLL_CLOEXEC
2. ep_alloc初始化spinlock_t鎖，mutex鎖
3. 每次epoll_create1一個epollfd，內核就會分配一個eventpoll 與之對應
/*
 * Per-epoll-instance state. epoll_create1() obtains one of these from
 * ep_alloc() for every epoll descriptor it creates.
 */
struct eventpoll
{
spinlock_t lock;

/*
 * Author's note: this mutex is held while an fd is added, modified or
 * removed, when epoll_wait returns, and while data is handed from kernel
 * space to user space, so using one epoll instance from several threads
 * is safe; the kernel does the protection.
 */
struct mutex mtx;

/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq;

/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait;

/* Every epitem that has been triggered is placed on this list */
struct list_head rdllist;

/*
 * Root of the red-black tree. Every monitored epitem lives in this tree,
 * so each tree node can be thought of as an epitem.
 */
struct rb_root rbr;

/*
 * This is a single linked list that chains all the "struct epitem" that
 * happened while transferring ready events to userspace w/out
 * holding ->lock.
 */
struct epitem *ovflist;

/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;

/* The user that created the eventpoll descriptor */
struct user_struct *user;

/* Set by epoll_create1() to the anonymous-inode file backing this instance */
struct file *file;

/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
3. 因爲epollfd不像socket那樣有一個真正的文件與之對應，所以內核會通過anon_inode_getfile分配一個匿名inode的file結構（eventpoll作爲參數傳入），再用fd_install把這個file安裝到新分配的fd上，這個fd就是返回給用戶的epollfd
/*
 * NOTE(review): only two members are listed here; this appears to be an
 * abbreviated excerpt of struct file rather than the full definition.
 */
struct file{
/* The eventpoll is stored here; epoll_ctl reads it back via file->private_data */
void *private_data;
struct list_head f_ep_links;
};
這樣，通過epollfd找到它在內核中的file，然後通過file找到了存儲的eventpoll
4. struct epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
struct rb_node rbn;

/* When this item is triggered it is linked onto the rdllist of its eventpoll */
struct list_head rdllink;

/*
* Works together "struct eventpoll"->ovflist in keeping the
* single linked chain of items.
*/
struct epitem *next;

/* The fd and the actual struct file this epitem refers to */
struct epoll_filefd ffd;

/* Number of active wait queue attached to poll operations */
int nwait;

/* List containing poll wait queues */
struct list_head pwqlist;

/* The eventpoll this epitem belongs to */
struct eventpoll *ep;

/* List header used to link this item to the "struct file" items list */
struct list_head fllink;

/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source *ws;

/* The structure that describe the interested events and the source fd */
/* i.e. the events this epitem is interested in */
struct epoll_event event;
};
/*
 * Pairs a struct file pointer with its descriptor number; embedded in
 * struct epitem as the ffd member.
 */
struct epoll_filefd{
struct file *file;
int fd;
};
再來看看epoll_ctl的真身

/*
 * epoll_ctl(2): add, remove or modify the target descriptor fd in the
 * interest set of the epoll instance referred to by epfd.
 *
 * @epfd:  descriptor of the epoll instance
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @fd:    target descriptor
 * @event: user-space event description; only read when
 *         ep_op_has_event(op) is true
 *
 * Returns the result of ep_insert()/ep_remove()/ep_modify(), or a
 * negative errno value:
 *   -EFAULT  event could not be copied from user space
 *   -EBADF   epfd or fd does not refer to an open file
 *   -EPERM   the target file has no poll operation
 *   -EINVAL  epfd is not an epoll file, epfd and fd are the same file,
 *            or op is not one of the three operations
 *   -ELOOP   adding the target epoll file would create a loop
 *   -EEXIST  EPOLL_CTL_ADD for a target that is already present
 *   -ENOENT  EPOLL_CTL_DEL/EPOLL_CTL_MOD for a target that is not present
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error;
    int did_lock_epmutex = 0;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    error = -EFAULT;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;

    /* Get the "struct file *" for the eventpoll file */
    error = -EBADF;
    /*
     * Map the epoll descriptor to its struct file; the eventpoll is
     * later fetched from this file.
     */
    file = fget(epfd);
    if (!file)
        goto error_return;

    /* Get the "struct file *" for the target file */
    tfile = fget(fd);
    if (!tfile)
        goto error_fput;

    /* The target file descriptor must support poll */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto error_tgt_fput;

    /*
     * Check if EPOLLWAKEUP is allowed.
     *
     * epds is only filled in by copy_from_user() above when the
     * operation carries an event, so it must not be inspected otherwise:
     * reading it would use uninitialized stack data and could trigger a
     * needless capability check.
     */
    if (ep_op_has_event(op) &&
        (epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
        epds.events &= ~EPOLLWAKEUP;

    /*
     * We have to check that the file structure underneath the file descriptor
     * the user passed to us _is_ an eventpoll file. And also we do not permit
     * adding an epoll file descriptor inside itself.
     */
    error = -EINVAL;
    /* An epoll instance cannot monitor itself */
    if (file == tfile || !is_file_epoll(file))
        goto error_tgt_fput;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    /* This is where the eventpoll is recovered from the file */
    ep = file->private_data;

    /*
     * When we insert an epoll file descriptor, inside another epoll file
     * descriptor, there is the chance of creating closed loops, which are
     * better be handled here, than in more critical paths. While we are
     * checking for loops we also determine the list of files reachable
     * and hang them on the tfile_check_list, so we can check that we
     * haven't created too many possible wakeup paths.
     *
     * We need to hold the epmutex across both ep_insert and ep_remove
     * b/c we want to make sure we are looking at a coherent view of
     * epoll network.
     */
    if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
        mutex_lock(&epmutex);
        did_lock_epmutex = 1;
    }
    if (op == EPOLL_CTL_ADD) {
        if (is_file_epoll(tfile)) {
            error = -ELOOP;
            if (ep_loop_check(ep, tfile) != 0) {
                clear_tfile_check_list();
                goto error_tgt_fput;
            }
        } else
            list_add(&tfile->f_tfile_llink, &tfile_check_list);
    }

    mutex_lock_nested(&ep->mtx, 0);

    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     */
    /*
     * At the API level a given fd can only be added once; in the
     * red-black tree that corresponds to a single epitem.
     */
    epi = ep_find(ep, tfile, fd);

    /* Any op other than the three handled below leaves -EINVAL in place */
    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            /* POLLERR and POLLHUP are always added to the requested events */
            epds.events |= POLLERR | POLLHUP;
            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        clear_tfile_check_list();
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }
    mutex_unlock(&ep->mtx);

    /* Cleanup labels release resources in reverse order of acquisition */
error_tgt_fput:
    if (did_lock_epmutex)
        mutex_unlock(&epmutex);

    fput(tfile);
error_fput:
    fput(file);
error_return:

    return error;
}

這裏我們可以很清楚地看到EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD操作都是有加鎖保護的（持有ep->mtx）。ep_insert內部還使用了spinlock_t鎖：它首先查看eventpoll中的user成員，檢查是否超過了允許的最大監聽數量，然後分配一個epitem，並設置回調ep_ptable_queue_proc。接着ep_insert會調用被監聽fd的poll方法，poll方法內部會調用到ep_ptable_queue_proc，這個回調把epitem掛到waitqueue上，並把喚醒回調設置爲ep_poll_callback，這個waitqueue是fd所持有的。以後fd上有事件觸發時，ep_poll_callback就會被調用，它把觸發的epitem放到之前說的eventpoll的rdllist中。最後我們的epoll_wait就是遍歷這個rdllist，如果有事件觸發，就開始從內核態拷貝數據給用戶態，這裏也使用了spinlock_t鎖。拷貝完之後，這裏還體現了ET和LT的區別：如果是ET，epitem不會再進入rdllist，除非fd再次發生了狀態改變，ep_poll_callback被調用；如果是LT，不管你還有沒有激活的事件或者有效的數據，epitem都會被重新插入到rdllist，下一次epoll_wait的時候會重新檢查它，事件仍然存在才會再次返回給你。
總結：
1. 我們不是一定非要在主線程中listen之後完成accept、recv，然後把數據丟給工作線程池。因爲在多線程中EPOLL_CTL_ADD、EPOLL_CTL_DEL、EPOLL_CTL_MOD都是安全的，我們完全可以讓線程池來代替主線程做accept、recv，當然這個線程池應該是CPU密集的，數量最好是CPU核數。這樣主線程只做一件事情，監聽就行了，連接管理就交給這個線程池來做，最後數據處理還是給工作線程池。
2. 對比select，每次調用select時都要把fd集合從用戶態拷貝到內核態，每次都要重複拷貝，而epoll只是在EPOLL_CTL_ADD調用了一次，也就是隻拷貝了一次
3. 對比select，每次調用select的返回都需要在內核遍歷傳進來的fd集合，而epoll內部是通過紅黑樹結構查找速度更快，並且觸發的事件都會通過回調函數放到rdllist，而epoll_wait返回僅僅只是從rdllist拿已經觸發的事件。select和epoll都會睡眠和喚醒的狀態切換，但是select在喚醒的時候需要去遍歷，而epoll只需要判斷鏈表是否爲空，也節約了CPU消耗
4. 對比select，select支持的文件描述符默認是1024，就算修改配置，後面遍歷的速度也會越來越慢，沒有紅黑樹快。而epoll支持的文件描述符數量是一個進程能夠打開的最大文件描述符數目，1G內存大概可以提供10萬個
5. 聯繫著名的“驚羣”現象，多線程中epoll_wait會不會因爲同一個fd的事件觸發而觸發了多個線程去處理？由於epoll_wait從rdllist拿事件是加鎖了的，所以不會。

從epoll源碼分析它的使用

python使用原始套接字發送二層包（鏈路層幀）

URL安全的Base64算法

linux c使用system調用shell腳本

源碼安裝pip

python使用epoll實現的服務端例子

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結