epoll機制相比select/poll機制能更有效地實現描述符的多路複用(支持更多的描述符,處理效率更高[具體機制這邊不展開了]),本文從編程的角度做一個介紹。
epoll接口函數
頭文件: #include<sys/epoll.h>
可以通過man epoll查看對應的幫助信息
最大描述符限制:/proc/sys/fs/epoll/max_user_watches
創建epoll實例
int epoll_create(int size);
int epoll_create1(int flag);
epoll_create中,size只是給內核的一個大小提示,並不是隊列中的最大數;自Linux 2.6.8起,size參數被忽略,但必須大於0
epoll_create1中flag取值如下:
- 0:epoll_create1 == epoll_create (size argument is dropped)
- EPOLL_CLOEXEC:含義同open函數的O_CLOEXEC選項;當進程調用execve執行新程序時,該描述符會被自動關閉
p.s: 當使用完畢時,需要調用close關閉epoll實例句柄
管理epoll事件
int epoll_ctl (int epfd, int op, int fd, struct epoll_event *event);
參數說明:
- epfd: epoll_create返回的epoll實例
- op: 對應的操作
- fd: 監聽的fd
- event: 監聽的事件
其中op取值如下:
- EPOLL_CTL_ADD:添加監聽的事件
- EPOLL_CTL_DEL:刪除監聽的事件
- EPOLL_CTL_MOD:修改監聽的事件
struct epoll_event定義如下:
/* User data attached to a monitored descriptor. This is a union, so the
   members share storage and only one of them is meaningful at a time.
   The server example in this article stores the descriptor in `fd`. */
typedef union epoll_data
{
void *ptr;
int fd;
uint32_t u32;
uint64_t u64;
} epoll_data_t;
/* One entry passed to epoll_ctl() and filled in by epoll_wait(). */
struct epoll_event
{
uint32_t events; /* Epoll events: bit mask of the EPOLL* flags */
epoll_data_t data; /* User data variable */
};
其中events可以包含以下事件類型:
- EPOLLIN: 描述符可讀
- EPOLLOUT: 描述符可寫
- EPOLLRDHUP(since Linux 2.6.17): 流套接字對端關閉連接或者關閉寫端
- EPOLLPRI: 緊急數據可讀
- EPOLLERR: 描述符發生錯誤,該事件由內核一直監聽(比如connect套接字失敗會返回EPOLLERR)
- EPOLLHUP: 文件描述符被掛斷,該事件由內核一直監聽
- EPOLLET: 開啓邊緣觸發,默認是水平觸發
- EPOLLONESHOT: 一個事件發生並讀取之後,fd自動不再監控;若要重新監控需要使用EPOLL_CTL_MOD重新設置
返回值: 成功返回0,失敗返回-1並設置errno
等待epoll事件
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
參數說明:
- epfd: epoll_create返回的epoll實例
- events: 存儲epoll_event的數組地址
- maxevents: 最大事件的數量,需>0
- timeout: 等待的最長時間,單位爲毫秒;-1表示一直阻塞,0表示立即返回
返回值:
成功時返回就緒的監聽文件描述符數;當超出timeout指定的時間後如果無就緒的文件描述符,返回0;發生錯誤時返回-1並設置errno
另外,Linux kernel 2.6.19 引入了epoll_pwait,可以在等待時設置信號掩碼,其使用方式類似pselect
some problems:
- epoll 怎麼判斷是connect請求還是有數據可讀?
ans: 判斷events[i].data.fd == listen_fd
- read 返回值說明:
- return -1 and errno == EAGAIN: 數據已經讀完,沒有可讀數據
- return 0: end of file,對端關閉連接
關於水平觸發(Level-Triggered)和邊緣觸發(Edge-Triggered)
當緩衝區有數據可讀時,ET會觸發一次事件,之後就不會再觸發;而LT只要我們沒有讀完緩衝區的數據,事件就會一直觸發。
推薦使用的epoll ET方式如下:
- 設置fd爲非阻塞
- 當調用read或write讀寫時,在其返回-1,且errno == EAGAIN 後再調用epoll_wait等待
tips:
ET模式應當搭配設置了O_NONBLOCK的fd使用,而LT則同時支持阻塞及非阻塞fd。如果將ET模式應用於阻塞fd,將出現如下問題:
當對端send 2 byte數據,而服務端只讀取了1 byte後再去調用epoll_wait,這時將不產生讀事件。直到對端又有數據發送過來,epoll_wait纔會再次返回
補充:
Q:當又有事件產生時會怎麼樣,原來的數據還在嗎?
A:原來的數據還在socket緩衝區
epoll實例
epoll使用參考:
- 服務端代碼:How to use epoll? A complete example in C, it’s a well-written article.
/*
 * Create a stream (TCP) socket and bind it to `port` on the wildcard
 * address (AI_PASSIVE with a NULL node).
 *
 * port: service name or decimal port string handed to getaddrinfo().
 *
 * Returns the bound socket descriptor (listen() has not been called on it),
 * or -1 after printing a diagnostic to stderr.
 */
static int
create_and_bind (char *port)
{
  struct addrinfo hints;
  struct addrinfo *result, *rp;
  int s;
  int sfd = -1;

  memset (&hints, 0, sizeof hints);
  hints.ai_family = AF_UNSPEC;     /* Return IPv4 and IPv6 choices */
  hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */
  hints.ai_flags = AI_PASSIVE;     /* All interfaces */

  s = getaddrinfo (NULL, port, &hints, &result);
  if (s != 0)
    {
      fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s));
      return -1;
    }

  /* Try each candidate address until one can be bound. */
  for (rp = result; rp != NULL; rp = rp->ai_next)
    {
      sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol);
      if (sfd == -1)
        continue;

      if (bind (sfd, rp->ai_addr, rp->ai_addrlen) == 0)
        {
          /* We managed to bind successfully! */
          break;
        }

      close (sfd);
      sfd = -1; /* never leave a closed descriptor in sfd */
    }

  /* Release the address list on BOTH the success and the failure path;
     previously the failure path returned without freeing it. */
  freeaddrinfo (result);

  if (sfd == -1)
    {
      fprintf (stderr, "Could not bind\n");
      return -1;
    }

  return sfd;
}
/*
 * Switch the descriptor `sfd` to non-blocking mode by OR-ing O_NONBLOCK
 * into its current file status flags.
 *
 * Returns 0 on success, or -1 after reporting the fcntl() failure
 * through perror().
 */
static int
make_socket_non_blocking (int sfd)
{
  int current = fcntl (sfd, F_GETFL, 0);

  if (current == -1)
    {
      perror ("fcntl");
      return -1;
    }

  if (fcntl (sfd, F_SETFL, current | O_NONBLOCK) == -1)
    {
      perror ("fcntl");
      return -1;
    }

  return 0;
}
#define MAXEVENTS 64
/*
 * Edge-triggered epoll server: listens on the port given as argv[1],
 * accepts every incoming connection, and copies whatever the peers send
 * to standard output.
 *
 * Setup failures abort(). The event loop only ends when epoll_wait()
 * fails with something other than EINTR, in which case the resources are
 * released and EXIT_FAILURE is returned.
 */
int
main (int argc, char *argv[])
{
  int sfd, s;
  int efd;
  struct epoll_event event;
  struct epoll_event *events;

  if (argc != 2)
    {
      fprintf (stderr, "Usage: %s [port]\n", argv[0]);
      exit (EXIT_FAILURE);
    }

  sfd = create_and_bind (argv[1]);
  if (sfd == -1)
    abort ();

  s = make_socket_non_blocking (sfd);
  if (s == -1)
    abort ();

  s = listen (sfd, SOMAXCONN);
  if (s == -1)
    {
      perror ("listen");
      abort ();
    }

  efd = epoll_create1 (0);
  if (efd == -1)
    {
      perror ("epoll_create");
      abort ();
    }

  event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET;
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);
  if (s == -1)
    {
      perror ("epoll_ctl");
      abort ();
    }

  /* Buffer where events are returned */
  events = calloc (MAXEVENTS, sizeof event);
  if (events == NULL)
    {
      perror ("calloc");
      abort ();
    }

  /* The event loop */
  while (1)
    {
      int n, i;

      n = epoll_wait (efd, events, MAXEVENTS, -1);
      if (n == -1)
        {
          /* A signal interrupting the wait is not an error: retry. */
          if (errno == EINTR)
            continue;
          perror ("epoll_wait");
          break;
        }

      for (i = 0; i < n; i++)
        {
          if ((events[i].events & EPOLLERR) ||
              (events[i].events & EPOLLHUP) ||
              (!(events[i].events & EPOLLIN)))
            {
              /* An error has occurred on this fd, or the socket is not
                 ready for reading (why were we notified then?) */
              fprintf (stderr, "epoll error\n");
              close (events[i].data.fd);
              continue;
            }
          else if (sfd == events[i].data.fd)
            {
              /* We have a notification on the listening socket, which
                 means one or more incoming connections. Drain them all,
                 because edge-triggered mode will not notify again. */
              while (1)
                {
                  /* sockaddr_storage is large enough for any address
                     family; a plain struct sockaddr is too small for an
                     IPv6 peer and getnameinfo() would read past it. */
                  struct sockaddr_storage in_addr;
                  socklen_t in_len;
                  int infd;
                  char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];

                  in_len = sizeof in_addr;
                  infd = accept (sfd, (struct sockaddr *) &in_addr, &in_len);
                  if (infd == -1)
                    {
                      if ((errno == EAGAIN) ||
                          (errno == EWOULDBLOCK))
                        {
                          /* We have processed all incoming
                             connections. */
                          break;
                        }
                      else
                        {
                          perror ("accept");
                          break;
                        }
                    }

                  s = getnameinfo ((struct sockaddr *) &in_addr, in_len,
                                   hbuf, sizeof hbuf,
                                   sbuf, sizeof sbuf,
                                   NI_NUMERICHOST | NI_NUMERICSERV);
                  if (s == 0)
                    {
                      printf("Accepted connection on descriptor %d "
                             "(host=%s, port=%s)\n", infd, hbuf, sbuf);
                    }

                  /* Make the incoming socket non-blocking and add it to the
                     list of fds to monitor. */
                  s = make_socket_non_blocking (infd);
                  if (s == -1)
                    abort ();

                  event.data.fd = infd;
                  event.events = EPOLLIN | EPOLLET;
                  s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event);
                  if (s == -1)
                    {
                      perror ("epoll_ctl");
                      abort ();
                    }
                }
              continue;
            }
          else
            {
              /* We have data on the fd waiting to be read. Read and
                 display it. We must read whatever data is available
                 completely, as we are running in edge-triggered mode
                 and won't get a notification again for the same
                 data. */
              int done = 0;

              while (1)
                {
                  ssize_t count;
                  char buf[512];

                  count = read (events[i].data.fd, buf, sizeof buf);
                  if (count == -1)
                    {
                      /* If errno == EAGAIN, that means we have read all
                         data. So go back to the main loop. */
                      if (errno != EAGAIN)
                        {
                          perror ("read");
                          done = 1;
                        }
                      break;
                    }
                  else if (count == 0)
                    {
                      /* End of file. The remote has closed the
                         connection. */
                      done = 1;
                      break;
                    }

                  /* Write the buffer to standard output */
                  s = write (1, buf, count);
                  if (s == -1)
                    {
                      perror ("write");
                      abort ();
                    }
                }

              if (done)
                {
                  printf ("Closed connection on descriptor %d\n",
                          events[i].data.fd);
                  /* Closing the descriptor will make epoll remove it
                     from the set of descriptors which are monitored. */
                  close (events[i].data.fd);
                }
            }
        }
    }

  /* Only reached when epoll_wait() failed: release everything. */
  free (events);
  close (efd);
  close (sfd);
  return EXIT_FAILURE;
}
- 客戶端代碼:
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Open a TCP connection to `port` on the local host.
 *
 * port: service name or decimal port string handed to getaddrinfo();
 *       NULL is rejected.
 *
 * Returns the connected socket descriptor, or -1 on failure (a diagnostic
 * is printed to stderr except for the NULL-port case).
 */
int create_and_connect(char * port)
{
    if(NULL == port)
    {
        return -1;
    }

    struct addrinfo hints;
    memset(&hints, 0, sizeof hints);
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    /* No AI_PASSIVE here: that flag asks for wildcard addresses meant for
       bind(). With a NULL node and the flag cleared, getaddrinfo() returns
       the loopback address, which is what connect() needs. */

    struct addrinfo *result;
    int ret = getaddrinfo(NULL, port, &hints, &result);
    if(ret != 0)
    {
        fprintf(stderr, "getaddrinfo error: %s\n", gai_strerror(ret));
        return -1;
    }

    /* Try each candidate address until one accepts the connection. */
    struct addrinfo *rp;
    int cfd = -1;
    for(rp = result; rp != NULL; rp = rp->ai_next)
    {
        cfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
        if(-1 == cfd)
        {
            continue;
        }

        //client connect
        if(0 == connect(cfd, rp->ai_addr, rp->ai_addrlen))
        {
            break;
        }

        close(cfd);
        cfd = -1; /* never leave a closed descriptor in cfd */
    }

    /* Release the address list on both the success and failure paths;
       previously the failure path returned without freeing it. */
    freeaddrinfo(result);

    if(-1 == cfd)
    {
        fprintf(stderr, "connect to port failed!\n");
        return -1;
    }

    return cfd;
}
/*
 * Test client: connects to the port given as argv[1] and sends the same
 * greeting twice, one second apart.
 *
 * Returns 0 when both sends succeed and -1 otherwise; a wrong argument
 * count exits with status -1.
 */
int main(int argc, char *argv[])
{
    if(argc != 2)
    {
        fprintf(stderr, "Usage: %s [port]\n", argv[0]);
        exit(-1);
    }

    //client sends something
    int cfd = create_and_connect(argv[1]);
    if(-1 == cfd)
    {
        fprintf(stderr, "create_and_connect failed\n");
        return -1;
    }

    const char *pData = "Client hello!";
    size_t dataLen = strlen(pData);
    int rc = 0;

    if(-1 == send(cfd, pData, dataLen, 0))
    {
        perror("send");
        rc = -1;
    }
    else
    {
        /* Pause so the two messages reach the server as separate events. */
        sleep(1);
        if(-1 == send(cfd, pData, dataLen, 0))
        {
            perror("send");
            rc = -1;
        }
    }

    /* Close the socket explicitly instead of relying on process exit. */
    close(cfd);
    return rc;
}
mac下的epoll
mac os不支持epoll,其使用kqueue實現(類似epoll),頭文件 sys/event.h
link:https://zhuanlan.zhihu.com/p/21375144
epoll源碼實現
參考博客:Linux epoll 詳解