深入epoll技術原理分析

閱讀之前請先了解Linux內核的wakeup&callback機制以及前文的select與poll技術分析

epoll技術

爲了解決select&poll技術存在的兩個性能問題,epoll做了如下設計。對於大內存數據拷貝問題,epoll通過epoll_create函數創建epoll空間(相當於一個容器管理),在內核中只存儲一份數據來維護N個socket事件的變化,並通過epoll_ctl函數來實現對socket事件的增刪改操作;這份數據由內核持有,每次等待時無需再把完整的fd集合從用戶空間拷貝到內核空間,只有就緒的事件纔會在epoll_wait返回時寫回用戶空間(見下文代碼中的__put_user),從而避免了大內存數據的反覆拷貝導致的性能問題。對於輪詢等待事件,epoll通過epoll_wait的方式來實現對socket事件的監聽,將高頻的等待操作wait與低頻的socket註冊操作分離開,同時會把監聽到就緒的socket事件添加到就緒隊列中,也就保證喚醒輪詢時拿到的事件都是已就緒的。現對epoll技術分析如下:

epoll技術定義

// Create an epoll instance (the "epoll space" that holds the monitored fds) and
// return a file descriptor referring to it. Whichever call is used, registered
// items are kept in the red-black tree that ep_find/ep_rbtree_insert operate on.
int epoll_create(int size);    // legacy form: size is only a hint (ignored since Linux 2.6.8, but must be > 0)
int epoll_create1(int flags);  // current form: flags is 0 or EPOLL_CLOEXEC

// Register / modify / remove an fd on an epoll instance
int epoll_ctl(int epfd,                         // fd of the epoll instance created above
              int op,                           // operation: EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
              int fd,                           // the fd being registered
              struct epoll_event *event);       // events of interest for fd, plus user data
struct epoll_event {
	__poll_t events;
	__u64 data;
} EPOLL_PACKED;

// Wait for events; plays the same role as the select/poll call
int epoll_wait(int epfd,                        // fd of the epoll instance
               struct epoll_event *events,      // output buffer that receives the ready events
               int maxevents,                   // maximum number of events the buffer can hold
               int timeout);                    // timeout in milliseconds

epoll技術實現細節

epoll_ctl函數處理socket描述符fd註冊問題,關注epoll_ctl的ADD方法

// Core excerpt of do_epoll_ctl (most of the function body is elided)
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	// ...

	// Look up the epitem stored for (tf.file, fd). Per the original note, epitems
	// live in a red-black tree and are linked into it when they are added.
	epi = ep_find(ep, tf.file, fd);
	
	// Dispatch on the requested operation.
	// NOTE(review): the original note says the ADD path runs under the mtx lock;
	// the locking code itself is not part of this excerpt.
	switch (op) {
	case EPOLL_CTL_ADD:
	    // Register the fd (the real calls are kept below as commented-out code)
		// epds->events |= EPOLLERR | EPOLLHUP;
		// error = ep_insert(ep, epds, tf.file, fd, full_check);
		break;
	case EPOLL_CTL_DEL:
	    // Delete: remove the epitem from the container that stores it
		break;
	// Modify an already-registered fd; per the original note this is not permitted
	// for items registered with EPOLLEXCLUSIVE
	case EPOLL_CTL_MOD:
	    // Modify: update the events monitored for the existing epitem
            //error = ep_modify(ep, epi, epds);			
		break;
	}
	
	// Resource-release logic elided
}

EPOLL_CTL_ADD核心代碼邏輯

// Insert path used by EPOLL_CTL_ADD (excerpt; elided parts are marked with "...").
// Returns 0 on the success path shown here; error paths are elided.
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
    // ...
	struct epitem *epi;
	struct ep_pqueue epq;
	
	// Fill in the new epitem: back-pointer to the owning eventpoll, the (file, fd)
	// pair being monitored, and a copy of the caller's event.
	// NOTE(review): the allocation of epi is not shown in this excerpt.
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;
	
	// If the caller requested EPOLLWAKEUP, create a wakeup source for this item;
	// otherwise initialise epi->ws to NULL
	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error)
			goto error_create_wakeup_source;
	} else {
		RCU_INIT_POINTER(epi->ws, NULL);
	}

	// Bind the epitem to a poll queue and install ep_ptable_queue_proc as the
	// queue callback of its poll table.
	// Per the original note: every added fd gets its own epitem and ep_pqueue,
	// and each ep_pqueue is tied to one callback.
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

       // Poll the item once, handing over the poll table so the queue callback can be
       // registered during the call; the result is the set of requested events that
       // are already ready
	revents = ep_item_poll(epi, &epq.pt, 1);

       // Insert the epitem into the red-black tree
	ep_rbtree_insert(ep, epi);

	// If events are already pending and the item is not linked on a ready list yet,
	// queue it on ep->rdllist and wake tasks sleeping on ep->wq; a wakeup of
	// ep->poll_wait is only counted here (pwake) and performed further down
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

    // ...

	// A wakeup of ep->poll_wait was deferred above: perform it now
	if (pwake)
	    ep_poll_safewake(&ep->poll_wait);

	return 0;

// error-handling labels (goto targets) elided ...
}

上述的代碼中存在兩個核心邏輯(註冊&喚醒邏輯)

// Registration logic: ep_ptable_queue_proc, the poll-table queue callback.
// Hooks the epitem that owns poll table `pt` onto the wait queue `whead`, so that
// ep_poll_callback is the function invoked when that wait queue is woken.
// On failure nothing is queued and epi->nwait is set to -1 as the error marker.
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	// Recover the epitem from the poll table embedded in its ep_pqueue
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	// Proceed only if no earlier error was recorded (nwait >= 0) and a new
	// eppoll_entry can be allocated from pwq_cache
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		// Wait entry whose wake function is ep_poll_callback
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		// EPOLLEXCLUSIVE items are queued as exclusive waiters
		if (epi->event.events & EPOLLEXCLUSIVE)
			add_wait_queue_exclusive(whead, &pwq->wait);
		else
			add_wait_queue(whead, &pwq->wait);
		// Track the entry on the epitem and count it
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

喚醒邏輯

// Poll a single epitem and return the subset of its requested events
// (epi->event.events) that are currently reported as ready.
// `pt` is the poll table handed to the underlying poll; `depth` is forwarded to
// ep_scan_ready_list when the monitored file is itself an epoll file.
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	// Tell the poll implementation which events this item is interested in
	pt->_key = epi->event.events;
	// Ordinary (non-epoll) file: ask the file itself via vfs_poll
	if (!is_file_epoll(epi->ffd.file))
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;

	// The monitored file is another epoll instance: register on its poll_wait
	// queue and scan its ready list with ep_read_events_proc
	ep = epi->ffd.file->private_data;
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	// When the poll table carries ep_ptable_queue_proc, ep_scan_ready_list is told
	// the eventpoll is already locked and does not take ep->mtx itself
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);

	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}

// Inside poll_wait, the queue callback stored in the poll table is invoked;
// in the epoll case that callback is ep_ptable_queue_proc
p->_qproc(filp, wait_address, p);



// ep_read_events_proc: callback that checks whether an eventpoll is readable.
// Walks the list `head`, polling each item; returns EPOLLIN | EPOLLRDNORM as soon
// as one item reports events, and 0 if none does. Items that report nothing are
// removed from the list along the way. `priv` points to the current nesting depth.
static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;
	int depth = *(int *)priv;

	// Poll table with a NULL queue callback; one level deeper than the caller
	init_poll_funcptr(&pt, NULL);
	depth++;

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt, depth)) {
			return EPOLLIN | EPOLLRDNORM;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * caller requested events goes. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

// Scan helper: runs `sproc` over a private snapshot (txlist) of ep->rdllist.
//   ep        - eventpoll whose ready list is scanned
//   sproc     - callback invoked as sproc(ep, &txlist, priv); its result is returned
//   priv      - opaque pointer forwarded to sproc
//   depth     - lock subclass passed to mutex_lock_nested
//   ep_locked - when true, ep->mtx is neither taken nor released here
// While sproc runs, newly arriving events are collected on ep->ovflist and merged
// back into ep->rdllist afterwards, together with whatever is left on txlist.
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);
	write_unlock_irq(&ep->lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	return res;
}

在上述的epoll_ctl技術代碼實現的細節中存在着兩個邏輯,即socket描述符註冊與喚醒邏輯,主要體現在兩個核心方法上,即ep_ptable_queue_proc & ep_item_poll,對此分析如下：

註冊邏輯：
- 在epoll空間中創建一個epitem的中間層,初始化一系列epitem的屬性，同時將新增加的socket描述符包裝到epitem下的epoll_filefd中，同時添加喚醒任務wakeup，同時將epitem的內部ep容器指向epoll空間
- 其次在進行item事件的輪詢中，通過隊列回調的方式將epitem綁定到隊列節點entry上,並在entry節點上綁定epoll的回調函數來喚醒業務處理
- 最後是將epitem插入以epoll空間爲根節點的紅黑樹中，後續內核可以通過fd查找到對應的epitem，通過epitem也就可以找到epoll空間引用
喚醒邏輯：
- 在item事件輪詢中，通過輪詢檢測epoll空間中的等待隊列是否有對應的節點entry可讀，如果有退出循環，並且從當前註冊的epitem開始輪詢遍歷查詢就緒的entry節點並將就緒entry節點的socket描述符添加到ready_list上
- 其次在上述註冊的邏輯之後，會檢查當前的epitem的ready list節點,如果存在ready_list,會將epoll空間的等待隊列喚醒,讓執行處理的read_process添加到就緒隊列中，讓cpu能夠進行調度
epoll_wait等待邏輯

// Call chain: epoll_wait -> do_epoll_wait -> ep_poll; this excerpt focuses on the
// core function ep_poll (elided parts are marked with "...")
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	// ...
	fetch_events: // check whether the epoll instance has ready events
		// ...
		for (;;) {
		// ...
		// Check whether any fd event is ready on this eventpoll
		eavail = ep_events_available(ep);
		if (eavail)
			// events are available: leave the loop
			break;
		// A pending signal interrupts the wait with -EINTR
		if (signal_pending(current)) {
			res = -EINTR;
			break;
		}
		// Sleep (the original note describes this as schedule()); a zero return
		// value is treated as the timeout having expired
		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
			timed_out = 1;
			break;
		}
	}
	// ...
	send_events: // events are ready: deliver them to the user-space events buffer
		//...
		ep_send_events(ep, events, maxevents);
	
}

從上述可以看出等待處理邏輯主要有fetch_events以及send_events,現分析如下：
- 循環檢查當前epoll空間是否有就緒事件，如果有將跳出循環，如果沒有將執行schedule的方法進入休眠等待再次輪詢，原理與select/poll一致
- 其次當有就緒事件的時候,循環遍歷將監聽變化的事件拷貝到用戶空間中,並且會將就緒事件socket添加到epitem的就緒隊列ready_list上

最後基於上述的分析做一個分析小結

解決大內存且頻繁copy問題
- 首先,epoll通過epoll_create創建epoll空間,同時在創建空間的同時將epoll空間拷貝到內存中,此後epoll對socket描述的註冊監聽通過epoll空間來進行操作,僅一次拷貝
- 其次,epoll註冊將拆分爲ADD/MOD/DEL三個操作,分別只對相應的操作進行處理,大大降低頻繁調用的次數,相比select/poll機制，由原先高頻率的註冊等待轉換爲高頻等待，低頻註冊的處理邏輯
- 接着,還有一點就是每次註冊都通過建立一個epitem結構體對socket相關的fd以及file進行封裝，並且epitem的ep容器通過指針引用指向epoll空間,即每次新增加一個socket描述符的時候,不需要操作整個集合,而是通過單個epitem進行操作，相比fdset較爲輕量級
解決只對就緒隊列進行喚醒循環遍歷
- 首先，我們可以看到在註冊的過程中，epoll通過epitem將socket描述符存儲到epoll_file中，同時將喚醒邏輯read_process也綁定到epitem，這樣當處於喚醒狀態就會被觸發執行，然後再將當前epitem存儲到隊列entry節點上，併爲entry節點綁定回調函數，最後將entry節點添加到ep的等待隊列上
- 其次，在進行wait等待過程中，內核在執行file.poll()後會將等待隊列上的節點添加到輪詢等待中poll wait，處於半喚醒狀態，也就是當前是就緒狀態但還沒喚醒，同時會將喚醒的socket描述符添加到epoll空間的ready list中
- 接着，每當有一個item被喚醒的時候就會退出上述的輪詢遍歷並持續設置當前的item處於喚醒狀態，然後epoll空間開始遍歷item（單鏈表存儲）並執行回調函數通知，如果item爲就緒狀態，就將epoll空間的readylist拷貝到當前喚醒節點的epitem的ready list中
- 最後，會更新監聽變化的事件狀態，返回到用戶進程，用戶進程這個時候獲取到ready list中的描述符均爲可就緒狀態
epoll其他技術
- epoll支持併發執行，上述的休眠與喚醒邏輯都有加鎖操作
- 其次，對就緒狀態的ready_list的處理屬於無鎖操作，因此爲了保證併發執行的安全性，epoll採用的加鎖方式是全局鎖

邊緣觸發與水平觸發

邊緣觸發與水平觸發定義

水平觸發
- socket接收數據的緩衝區不爲空的時候，則一直觸發讀事件，相當於"不斷地詢問數據是否可讀"
- socket發送數據的緩衝區不全滿的時候，則一直觸發寫事件，相當於"不斷地詢問是否有區域可以讓數據寫入"
  本質上就是一個不斷進行交流的過程，如下圖所示：
邊緣觸發
- socket接收數據的緩衝區發生變化，則觸發讀取事件，也就是當空的接收數據的socket緩衝區這個時候有數據傳送過來的時候觸發
- socket發送數據的緩衝區發生變化，則觸發寫入事件，也就是當滿的發送數據的socket緩衝區這個時候剛刷新數據初期的時候觸發
  本質上就是socket緩衝區變化而觸發，如下圖所示：
上述的觸發事件會調用epoll_wait方法，也就是
- 水平觸發會多次調用epoll_wait
- 邊緣觸發在socket緩衝區不發生改變的情況下就不會再調用epoll_wait

水平觸發與邊緣觸發代碼

// Trigger modes as handled by the loop below: level-triggered is the default
// (neither EPOLLET nor EPOLLONESHOT set), EPOLLET selects edge-triggered, and
// EPOLLONESHOT masks the item's event bits down to EP_PRIVATE_BITS after delivery
list_for_each_entry_safe(epi, tmp, head, rdllink) {
		// Stop once the user buffer is full
		if (esed->res >= esed->maxevents)
			break;
        
        // Wakeup-source handling: if the item's source is active, keep ep->ws
        // awake first, then relax the item's own source
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

        // Unlink this epitem from the ready list
		list_del_init(&epi->rdllink);

        
        // Poll the item again to collect the events that are ready right now;
        // skip it if nothing is reported
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		// Copy the event to user space; on failure put the item back on the
		// list, record -EFAULT if nothing was delivered yet, and stop
		if (__put_user(revents, &uevent->events) ||
		    __put_user(epi->event.data, &uevent->data)) {
			list_add(&epi->rdllink, head);
			ep_pm_stay_awake(epi);
			if (!esed->res)
				esed->res = -EFAULT;
			return 0;
		}
		esed->res++;
		uevent++;
	    
	 
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			 // Level-triggered mode: re-add the item to the ready list so that the
			 // next epoll_wait call can see that events are still available
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}

水平觸發：遍歷epoll下的等待隊列的每個entry，喚醒entry節點之後從ready_list移除當前socket事件，然後再輪詢當前item收集可用的事件，最後添加到ready_list以便於調用epoll_wait的時候能夠檢查到socket事件可用
邊緣觸發：遍歷epoll下的等待隊列的每個entry，喚醒entry節點之後從ready_list移除當前socket事件，再輪詢當前item收集可用的事件然後喚醒執行的業務處理read_process

深入epoll技術原理分析

epoll技術

邊緣觸發與水平觸發

10分鐘搞定Mysql主從部署配置

如何使用 JS 判斷用戶是否處於活躍狀態

「Pygors跨平臺GUI」2：安裝MinGW-w64、MSYS2還是WSL2

[轉帖]

python列出centos7內存使用前50的進程信息

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

一鍵自動化博客發佈工具,用過的人都說好(掘金篇)

lightdb數據庫超時相關控制參數

lightdb秒級增加列和刪除列（not null帶默認值）

Java ThreadPoolShutdown

CPU高速緩存與內存屏障

synchronized基於JVM規範的工作原理(一)

Java內存模型之可見性分析

關鍵字volatile的使用與原子性問題

併發原子性技術之CAS機制

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結