Linux epoll source code analysis

This article is based on kernel 5.5 and analyses how epoll is implemented in the kernel. The relevant source is mainly in fs/eventpoll.c.

Contents

1. Key data structures

2. epoll initialization

3. epoll_create

4. epoll_ctl

5. epoll_wait


1. Key data structures

Two structures are worth keeping in mind here. What their members are used for should be clear from the code, so this is only a quick overview.

They are struct eventpoll and struct epitem. An eventpoll is created for each call to epoll_create() (normally one per process); it contains a ready list, rdllist, onto which epitems are queued when their descriptors have pending events. An epitem holds the information of one monitored descriptor: its fd, its struct file, the requested events, and so on.

Inside the eventpoll, all epitems are kept in a red-black tree, so lookup, insertion and deletion have a worst-case time complexity of O(log n).
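For orientation, here is an abridged sketch of the two structures (simplified from fs/eventpoll.c in kernel 5.5; several members, the unions and the original comments are omitted, and the annotations are mine):

struct eventpoll {
	struct mutex mtx;             /* serializes epoll_ctl() / ep_insert() etc.   */
	wait_queue_head_t wq;         /* tasks sleeping in epoll_wait()              */
	wait_queue_head_t poll_wait;  /* used when the epoll fd is itself polled     */
	struct list_head rdllist;     /* ready list: epitems with pending events     */
	rwlock_t lock;                /* protects rdllist and ovflist                */
	struct rb_root_cached rbr;    /* red-black tree of all monitored fds         */
	struct epitem *ovflist;       /* overflow chain used while draining rdllist  */
	struct user_struct *user;     /* owner, for the max_user_watches limit       */
	struct file *file;            /* the anonymous file backing the epoll fd     */
	/* ... */
};

struct epitem {
	struct rb_node rbn;           /* node in eventpoll::rbr                      */
	struct list_head rdllink;     /* node in eventpoll::rdllist when ready       */
	struct epitem *next;          /* chaining on eventpoll::ovflist              */
	struct epoll_filefd ffd;      /* the monitored {struct file *, fd} pair      */
	int nwait;                    /* number of attached poll wait queues         */
	struct list_head pwqlist;     /* list of eppoll_entry wait entries           */
	struct eventpoll *ep;         /* back-pointer to the owning eventpoll        */
	struct list_head fllink;      /* node in file->f_ep_links                    */
	struct epoll_event event;     /* requested events and user data              */
	/* ... */
};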

 

2. epoll initialization

Source location: fs/eventpoll.c

It starts from here:
fs_initcall(eventpoll_init);


static int __init eventpoll_init(void)
{

	struct sysinfo si;

	si_meminfo(&si); // get system memory information
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	// set the per-user limit on how many fds can be watched
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;

// EP_ITEM_COST accounts for the fact that each registered fd
// consumes one struct epitem plus one struct eppoll_entry

	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls); // initialize the nested-call list

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	// create the slab caches used to allocate struct epitem and struct eppoll_entry
	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
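As a rough sanity check on the limit computed above: (si.totalram - si.totalhigh) / 25 pages shifted by PAGE_SHIFT is 4% of low memory in bytes, and dividing by EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry), roughly 200 bytes on 64-bit) gives the per-user watch limit — on a box with about 4 GiB of low memory that works out to somewhere around a million watches. The computed value is exported as the fs.epoll.max_user_watches sysctl, so it can be read back from user space; the path below is the real sysctl file, the little reader program itself is just an illustration:

#include <stdio.h>

/* Read back the limit that eventpoll_init() computed; the kernel exposes it
 * via /proc/sys/fs/epoll/max_user_watches. */
int main(void)
{
	long watches = 0;
	FILE *f = fopen("/proc/sys/fs/epoll/max_user_watches", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%ld", &watches) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("max_user_watches = %ld\n", watches);
	return 0;
}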

That completes initialization. Next, epoll_create.
 

3. epoll_create

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}

epoll_create is defined as a system call with SYSCALL_DEFINE1; it only performs a backwards-compatibility check on size (the value itself is otherwise ignored).
The real work happens in do_epoll_create(), so let's continue there.


/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep); // create an eventpoll for this call (typically one per process, since a process usually calls epoll_create only once)
	if (error < 0)
		return error;
	/*
	 * Creates all the items needed to setup an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	// grab an unused file descriptor
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	// create an anonymous file instance, stashing ep in its private_data so the eventpoll
	// can later be recovered from the file; also register the file operations eventpoll_fops for it
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	// store the file in ep as well, so the file can also be found from the eventpoll.
	// At this point the eventpoll and its file reference each other: after epoll_create the process owns one eventpoll, one fd and one file
	ep->file = file;

	// install the file into the process's fd table under fd, so that later operations on the fd
	// are routed to this file; e.g. calling llseek on the fd ends up in the registered eventpoll_fops::noop_llseek
	fd_install(fd, file);
	return fd; // hand the fd back to the user process

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

While we're here, let's see what creating the eventpoll involves:
 

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user(); // take a reference on the current user
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL); // allocate the eventpoll
	if (unlikely(!ep))
		goto free_uid;

	// initialize the locks
	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	// initialize the wait queues
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);

	// initialize the ready list (rdllist)
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED; // initialize the red-black tree that will hold the monitored fds
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user; // remember the owning user in the eventpoll

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}


That's it for epoll_create. Next up is epoll_ctl; we will follow the main path of adding a descriptor,
i.e. the behaviour of the EPOLL_CTL_ADD operation.

4. epoll_ctl

Like epoll_create, epoll_ctl is defined as a system call with SYSCALL_DEFINE:

SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;
	struct eventpoll *tep = NULL;

	error = -EFAULT;
	// copy the epoll_event passed from user space, i.e. which events (read, write, ...) to monitor on the fd
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	error = -EBADF;
	f = fdget(epfd); // use the fd created in epoll_create (returned to user space and now passed back down) to get its file
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);  // get the file of the target (e.g. socket) fd
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	// if the target file does not implement the poll interface, bail out (epoll relies on calling that poll; we will see the call shortly, in ep_insert())
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(&epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	// passing the epoll fd as its own target is not allowed
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

    //......

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data; // retrieve the eventpoll created by epoll_create:
	// user space passes epfd back down; from epfd we get the file, and from the file we get the eventpoll that was stored in it at creation time

    //......

	/*
	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd); // look up the epitem for this {file, fd} in the red-black tree; for the first EPOLL_CTL_ADD it is normally not there, so this returns NULL

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= EPOLLERR | EPOLLHUP;
		// create an epitem recording the target's fd, file and the eventpoll, then insert it into the red-black tree
			error = ep_insert(ep, &epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		if (full_check)
			clear_tfile_check_list();
		break;

        //......
	
    }

    //......
	return error;
}

So when epoll_ctl adds a descriptor to monitor, its main job is to create an epitem that stores the target's fd, its file, and the process's eventpoll.
(Regarding the eventpoll pointer, I wonder whether every epitem really needs its own copy. With many fds on a 64-bit platform that is an extra 8 bytes of kernel memory per fd (a million watched fds would spend about 8 MB just on these back-pointers), while a process usually creates only one eventpoll, and its pointer is already stored in the file behind epfd (put there by epoll_create). So whenever it is needed it could simply be recovered via epfd -> file -> private_data.)
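As a side note, the ep_find() lookup used above walks the red-black tree keyed on the {struct file *, fd} pair: it orders first by the file pointer and then by the fd. Roughly, it looks like this (a lightly abridged sketch of the code in the same file; the comments are mine):

struct epoll_filefd {
	struct file *file;
	int fd;
};

/* compare two {file, fd} keys: order by file pointer first, then by fd */
static int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);            /* ffd.file = file; ffd.fd = fd;      */
	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;            /* found the epitem for this {file, fd} */
			break;
		}
	}
	return epir;
}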


Next, let's see how the epitem is created and inserted into the red-black tree, and what else happens along the way:

static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	lockdep_assert_irqs_enabled(); // lockdep annotation asserting that interrupts are enabled (compiles to nothing when lockdep is off)

	// number of fds this user is already watching
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	// already at the maximum number of watches
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;
	// allocate an epitem, taking memory from the slab cache created at init time
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	// initialize the epitem
	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep; // store the owning eventpoll
	ep_set_ffd(&epi->ffd, tfile, fd); // store the target socket's fd and file
	epi->event = *event; // store the requested event mask
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	// register the callback here: ep_ptable_queue_proc is stored in epq.pt._qproc
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1); // calls the target socket's poll; we look at the socket side below (it ends up calling back into ep_ptable_queue_proc())

    //......

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	// link epi->fllink onto tfile->f_ep_links, i.e. append it to the target file's f_ep_links list
	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	// insert the new epitem into the red-black tree created by epoll_create (one per eventpoll)
	ep_rbtree_insert(ep, epi);

    //......

	/* If the file is already "ready" we drop it inside the ready list */
	// if the target already has events pending there is no need to sleep:
	// put it straight onto the eventpoll's ready list and, if anyone is waiting, wake them via wake_up(&ep->wq)
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		// (this wakes any process already sleeping in epoll_wait; see the ep_poll analysis below)
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq); // wake up processes sleeping in epoll_wait
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	// increment the number of fds this user is watching
	atomic_long_inc(&ep->user->epoll_watches);


	return error;
}

This function mainly creates an epitem, calls the target socket's poll interface (which calls back into ep_ptable_queue_proc()),
fills in the eventpoll plus the socket's fd and file, and inserts the epitem into the red-black tree.
If the monitored fd already has pending events, the epitem is put straight onto the eventpoll's ready list and any process sleeping in epoll_wait is woken via ep->wq.

Next let's look at ep_item_poll() and ep_ptable_queue_proc(); ep_poll_callback() will be analysed later,
when we look at how the socket side notifies eventpoll once data arrives.

First, ep_item_poll():

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events; // the event mask user space asked for (read/write/...)
	if (!is_file_epoll(epi->ffd.file)) // an ordinary (non-epoll) target takes this branch
		return vfs_poll(epi->ffd.file, pt) & epi->event.events; // go into vfs_poll()

	//...... (the branch for a nested epoll fd is omitted here)
}

Now vfs_poll(), in include/linux/poll.h:
 

static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        if (unlikely(!file->f_op->poll))
                return DEFAULT_POLLMASK;
        return file->f_op->poll(file, pt); // call the file's own poll, here the socket's; next we follow the socket side — let's take TCP, the others are similar
}

Next we end up in the socket layer's poll, in net/socket.c.

// remember: wait carries eventpoll's callback ep_ptable_queue_proc()

static __poll_t sock_poll(struct file *file, poll_table *wait)
{
	struct socket *sock = file->private_data;

    //......

	return sock->ops->poll(file, sock, wait) | flag;
}

This in turn calls the TCP-side poll, tcp_poll(); the source is in net/ipv4/tcp.c, so let's keep going:
 

__poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask;
	struct sock *sk = sock->sk;
	const struct tcp_sock *tp = tcp_sk(sk);
	int state;

	sock_poll_wait(file, sock, wait); // this is the interesting part, follow it in

    //......

	return mask;
}

static inline void sock_poll_wait(struct file *filp, struct socket *sock,
				  poll_table *p)
{
	if (!poll_does_not_wait(p)) {
        // note sock->wq.wait: this is the wait queue that will be woken later when events arrive
		poll_wait(filp, &sock->wq.wait, p); // follow this one level deeper
        //......
	}
}

Source location: include/linux/poll.h
 

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p); // this calls back into eventpoll's ep_ptable_queue_proc(),
        // passing it the socket's file and the socket's wait queue
}

Good. Now back in eventpoll.c, let's look at ep_ptable_queue_proc():
 

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, // whead is the wait queue passed in from the socket side
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	// allocate an eppoll_entry for this monitored socket fd as well
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		// register the wake-up callback: ep_poll_callback will perform the wake-up
		// instead of the default default_wake_function()
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); // hook ep_poll_callback into eppoll_entry::wait;
		// when the socket has data it wakes whead with __wake_up(), and for this
		// pwq->wait entry that means calling ep_poll_callback, which in turn wakes
		// the user process that called epoll_wait
		pwq->whead = whead; // remember the socket's wait queue
		pwq->base = epi;
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait); // add the eppoll_entry's wait entry onto the socket's wait queue
		list_add_tail(&pwq->llink, &epi->pwqlist); // and add the eppoll_entry to the epitem's pwqlist, so it can be found again later
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

 

The main work here is to allocate an eppoll_entry, arm it with ep_poll_callback(), and register it on the socket's event wait queue;
later, when the socket has events, it wakes every entry on that queue.
The eppoll_entry is also linked onto the epitem's pwqlist; that list is what later lets epoll find these wait entries again and unregister them from the socket's queue (for example when the fd is removed or the epoll instance is torn down).
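To make the mechanism concrete, here is a small self-contained user-space model of the pattern (a toy, not kernel code; all names here are invented for illustration): wait-queue entries carry a function pointer, and waking the queue simply invokes each entry's function — which is exactly how the socket ends up calling ep_poll_callback().

#include <stdio.h>

struct wait_entry;
typedef int (*wake_fn)(struct wait_entry *we, unsigned events);

/* toy wait-queue entry: a wake callback plus a link to the next entry */
struct wait_entry {
	wake_fn func;              /* like wait_queue_entry::func              */
	void *private;             /* context for the callback                 */
	struct wait_entry *next;
};

struct wait_queue_head {
	struct wait_entry *head;
};

/* the analogue of add_wait_queue(): hook an entry onto the queue */
static void add_entry(struct wait_queue_head *wq, struct wait_entry *we)
{
	we->next = wq->head;
	wq->head = we;
}

/* the analogue of __wake_up(): walk the queue and invoke each entry's callback */
static void wake_up_all(struct wait_queue_head *wq, unsigned events)
{
	for (struct wait_entry *we = wq->head; we; we = we->next)
		we->func(we, events);
}

/* the analogue of ep_poll_callback(): runs when the "socket" reports events */
static int my_poll_callback(struct wait_entry *we, unsigned events)
{
	printf("wake callback for %s, events=%#x\n", (const char *)we->private, events);
	return 1;
}

int main(void)
{
	struct wait_queue_head sock_wq = { 0 };
	struct wait_entry pwq = { .func = my_poll_callback, .private = "epitem #1" };

	add_entry(&sock_wq, &pwq);     /* what ep_ptable_queue_proc() does       */
	wake_up_all(&sock_wq, 0x1);    /* what the socket does when data arrives */
	return 0;
}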

That completes epoll_ctl; next up, epoll_wait.
 

5. epoll_wait

Like the others, epoll_wait is defined as a system call with SYSCALL_DEFINE:
 

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}

// next, do_epoll_wait():
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	//......

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd); // use epfd to get the eventpoll's file object
	if (!f.file)
		return -EBADF;

	//......
	
	ep = f.file->private_data;
	// pull the corresponding eventpoll out of file->private_data; remember, it was stored there by epoll_create


	/* Time to fish for events ... */
	// follow it in
	error = ep_poll(ep, events, maxevents, timeout);
	//......
}

Now let's look at ep_poll():
 

static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res = 0, eavail, timed_out = 0;
	u64 slack = 0;
	bool waiter = false;
	wait_queue_entry_t wait;
	ktime_t expires, *to = NULL;


	// if a timeout was given, convert it
	if (timeout > 0) {
		struct timespec64 end_time = ep_set_mstimeout(timeout);

		slack = select_estimate_accuracy(&end_time);
		to = &expires;
		*to = timespec64_to_ktime(end_time);
	} else if (timeout == 0) {

		timed_out = 1;

		write_lock_irq(&ep->lock);
		eavail = ep_events_available(ep); // check whether events are already pending: a non-empty eventpoll::rdllist means there are
		write_unlock_irq(&ep->lock);

		goto send_events;
		// with a zero timeout we jump straight to send_events: any already-pending events
		// are returned to user space and epoll_wait returns immediately, without ever sleeping in fetch_events
	}

fetch_events:

	//......

	/*
	 * We don't have any available event to return to the caller.  We need
	 * to sleep here, and we will be woken by ep_poll_callback() when events
	 * become available.
	 */
	// no events yet: we must schedule ourselves out and sleep for a while; once events arrive, the socket side calls ep_poll_callback(), which wakes this process
	if (!waiter) {
		waiter = true;
		init_waitqueue_entry(&wait, current); // put the current task into the wait entry

		spin_lock_irq(&ep->wq.lock);
		__add_wait_queue_exclusive(&ep->wq, &wait); // add the wait entry to the eventpoll's wait queue wq
		spin_unlock_irq(&ep->wq.lock);
	}

	for (;;) {

		set_current_state(TASK_INTERRUPTIBLE); // mark the task TASK_INTERRUPTIBLE so it can be woken up

		//.......

		eavail = ep_events_available(ep);
		if (eavail) // if rdllist is non-empty by now there is no need to sleep, break out
			break;
		//.......

		// schedule ourselves out here; when we are woken we come back to the top of
		// the loop and break out at the if (eavail) check above
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1; // the timeout expired, so break out directly
			break;
		}
	}

	__set_current_state(TASK_RUNNING); // after waking up, set ourselves back to the running state

send_events:
	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out) // hand the events and fds back to user space and return
		goto fetch_events;

	if (waiter) {
		spin_lock_irq(&ep->wq.lock);
		__remove_wait_queue(&ep->wq, &wait); // take our wait entry back off ep->wq
		spin_unlock_irq(&ep->wq.lock);
	}

	return res;
}

 

The work here: first check whether any events are already pending (i.e. whether rdllist is non-empty); if so, the events and fds are
returned to user space right away. Otherwise the process schedules itself out and sleeps until the socket side wakes it through
ep_poll_callback(). (A timeout of 0 makes epoll_wait return immediately after that check, a negative timeout blocks indefinitely, and a positive timeout is interpreted as milliseconds.)

Before we look at how ep_send_events() hands events to user space, let's first see what the socket side does when events occur:
how ep_poll_callback() gets invoked and what it does. After that we will come back to ep_send_events().
On the socket side we will again stick with TCP over IPv4.

For a TCP socket, when data arrives at the NIC an interrupt fires; once the link-layer and IP headers have been processed the packet
reaches TCP, and after the TCP header has been handled the kernel calls sock_def_readable(). Let's look at that function directly.
// source location: net/core/sock.c

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	
	// this wakes every node on the socket's wait queue, i.e. every waiter monitoring this socket;
	// for epoll these are the entries added earlier, when epoll_ctl registered the descriptor
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

wake_up_interruptible_sync_poll() is implemented in kernel/sched/wait.c, in __wake_up_common():
if a wait-queue entry has installed its own wake-up function, that function is called and performs the wake-up;
otherwise the default default_wake_function() from kernel/sched/core.c is used.
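The core of __wake_up_common() is essentially the following loop (abridged; the bookmark argument, its handling and the surrounding locking are left out, and the comments are mine):

	wait_queue_entry_t *curr, *next;

	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
		unsigned flags = curr->flags;
		int ret;

		// for epoll entries curr->func is ep_poll_callback();
		// otherwise it is usually default_wake_function()
		ret = curr->func(curr, mode, wake_flags, key);
		if (ret < 0)
			break;
		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}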

eventpoll installs its own: the ep_poll_callback() we have mentioned so many times. Let's finally look at its implementation.

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	struct epitem *epi = ep_item_from_wait(wait); // recover the epitem from the wait entry
	struct eventpoll *ep = epi->ep; // and the owning eventpoll from the epitem
	__poll_t pollflags = key_to_poll(key);
	unsigned long flags;
	int ewake = 0;

	 //.......

	/* If this file is already in the ready list we exit soon */
	// here epitem::rdllink is added to the ready list rdllist
	if (!ep_is_linked(epi) &&
	    list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
		ep_pm_stay_awake_rcu(epi);
	}

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */

	if (waitqueue_active(&ep->wq)) {
		if ((epi->event.events & EPOLLEXCLUSIVE) &&
					!(pollflags & POLLFREE)) {
			switch (pollflags & EPOLLINOUT_BITS) {
			case EPOLLIN:
				if (epi->event.events & EPOLLIN)
					ewake = 1;
				break;
			case EPOLLOUT:
				if (epi->event.events & EPOLLOUT)
					ewake = 1;
				break;
			case 0:
				ewake = 1;
				break;
			}
		}
		wake_up(&ep->wq); // wake the user process sleeping in epoll_wait
	}

	//......
	return ewake;
}

 

This function does two main things: it adds the socket's epitem to the eventpoll's rdllist, and, if the corresponding user process
is still asleep, wakes it with wake_up(). At this point eventpoll::rdllist holds the event information for this socket, and the
user process (strictly speaking it is still in kernel mode) has been woken.

We can now return to ep_poll(); the next step there is ep_send_events(), so let's
continue with that.

static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct ep_send_events_data esed;

	esed.maxevents = maxevents;
	esed.events = events; // save the address of the user-space epoll_event array

	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); // follow it in
	return esed.res;
}

Next, ep_scan_ready_list():
 

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
				  // priv carries the user-supplied events pointer
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	//......

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	// splice rdllist over into txlist and reset rdllist to an empty list
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv); // the interesting part: this is ep_send_events_proc(ep, &txlist, priv)

	//......

	return res;
}

 

Now ep_send_events_proc():
 

static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct ep_send_events_data *esed = priv; // cast back to the ep_send_events_data, which carries the user-space epoll_event address
	__poll_t revents;
	struct epitem *epi, *tmp;
	struct epoll_event __user *uevent = esed->events; // the user-space epoll_event array; the rest of the function revolves around it
	struct wakeup_source *ws;
	poll_table pt;

	//.......

	// walk the whole (spliced) ready list
	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (esed->res >= esed->maxevents)
			break;

	    //.......

		// copy the events and the user data (typically the fd) into the user-space epoll_event
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		//......
	}

	return 0;
}

The main thing done here is to walk the whole ready list (not eventpoll::rdllist at this point, which is already empty; the list was
spliced into txlist inside ep_scan_ready_list() and passed in here), copy each item's events and user data into the user-space epoll_event array, and return.

At this point we return to user space (epoll_wait returns) and the application takes over.
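To tie it all together, here is a minimal user-space program exercising the path analysed above; a pipe stands in for the socket, but the flow is the same: epoll_create() runs do_epoll_create(), EPOLL_CTL_ADD goes through ep_insert()/ep_ptable_queue_proc(), and epoll_wait() sleeps in ep_poll() until ep_poll_callback() puts the fd on the ready list:

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int pipefd[2];
	if (pipe(pipefd) < 0) {
		perror("pipe");
		return 1;
	}

	int epfd = epoll_create(1);          /* do_epoll_create(): eventpoll + anon file + fd */
	if (epfd < 0) {
		perror("epoll_create");
		return 1;
	}

	struct epoll_event ev = { .events = EPOLLIN, .data.fd = pipefd[0] };
	/* EPOLL_CTL_ADD: ep_insert() builds the epitem and registers ep_poll_callback
	 * on the pipe's wait queue via ep_ptable_queue_proc() */
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev) < 0) {
		perror("epoll_ctl");
		return 1;
	}

	if (write(pipefd[1], "x", 1) != 1)   /* makes the read end ready: ep_poll_callback fires */
		perror("write");

	struct epoll_event ready[8];
	int n = epoll_wait(epfd, ready, 8, 1000); /* ep_poll() -> ep_send_events() */
	for (int i = 0; i < n; i++)
		printf("fd %d ready, events=%#x\n", ready[i].data.fd, ready[i].events);

	close(epfd);
	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}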
