netpoller
背景介紹
I/O多路複用模型(I/O Multiplexing):
select
阻塞,直到有FD準備好,FD數量有FD_SETSIZE
限制
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
struct timeval *timeout);
// readfds,writefds,exceptfds 需要檢查的FDs
// nfds 比最大的FD大1(減少內核比較次數)
// timeout 最大等待時間
// fd_set 位掩碼表示FD集合
poll
和select類似,在傳遞FD方式上不同
int poll(struct pollfd fds[], nfds_t nfds, int timeout);
// fds 需要檢查的FDs和事件(events,revents),結果寫入fds
// nfds fds中的FD數量
// timeout 最大等待時間
struct pollfd {
int fd; /* File descriptor */
short events; /* Requested events bit mask */
short revents; /* Returned events bit mask */
};
epoll
區分水平觸發
和邊緣觸發
// 創建epoll實例:紅黑樹,就緒隊列,返回epoll實例的FD
int epoll_create(int size);
int epoll_create1(int flag); // 與epoll_create類似,可通過flag對epfd有一定的控制
// FD註冊
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);
// epfd epoll_create返回的FD
// op 操作類型 EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL
// fd 操作對象FD
// ev 關注的事件類型(位掩碼)
struct epoll_event {
uint32_t events; /* epoll events (bit mask) */
epoll_data_t data; /* User data */
};
typedef union epoll_data {
void *ptr; /* Pointer to user-defined data */
int fd; /* File descriptor */
uint32_t u32; /* 32-bit integer */
uint64_t u64; /* 64-bit integer */
} epoll_data_t;
// 獲取就緒的FD
int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
// evlist,maxevents指定返回的event的存儲空間和數量
// 自動銷燬
epoll相對於其他I/O多路複用模型的優勢:
1.不需要每次調用傳遞FD(用戶態/內核態數據拷貝),epoll_ctl將FD添加到內核數據空間中;
2.epoll_wait的效率更高,不需要對比所有的FD,只需要從就緒隊列中獲取數據即可;
常見I/O模型:阻塞,非阻塞,I/O多路複用,信號驅動,異步I/O
分析實例:File.Read
在unix/linux平臺上,netpoller是基於epoll模型來實現的,以下分析也是限定於此;
以簡單的文件讀取(unix|linux平臺)爲例,分析從代碼層面開始是怎麼一步步使用netpoller的。
## 用戶代碼
func main() {
f, err := os.Open("test.txt")
if err != nil {
log.Fatalln(err)
}
buf := make([]byte, 10)
f.Read(buf)
log.Println(string(buf))
}
使用os.Open
創建一個File實例,核心是獲取到文件描述符(pfd)
type File struct {
*file // os specific
}
// unix平臺實現
// ## os/file.go
type file struct {
pfd poll.FD
name string
dirinfo *dirInfo // nil unless directory being read
nonblock bool // whether we set nonblocking mode
stdoutOrErr bool // whether this is stdout or stderr
}
/*os/file.go*/
func (f *File) Read(b []byte) (n int, err error) {
if err := f.checkValid("read"); err != nil {
return 0, err
}
n, e := f.read(b)
return n, f.wrapErr("read", e)
}
/*os/file_unix.go(unix平臺)*/
func (f *File) read(b []byte) (n int, err error) {
n, err = f.pfd.Read(b)
runtime.KeepAlive(f)
return n, err
}
調用Read
函數,實際上調用了底層的FD.Read
,其實現如下:
/*internal/poll/fd_unix.go*/
func (fd *FD) Read(p []byte) (int, error) {
if err := fd.readLock(); err != nil {
return 0, err
}
defer fd.readUnlock()
if len(p) == 0 {
// If the caller wanted a zero byte read, return immediately
// without trying (but after acquiring the readLock).
// Otherwise syscall.Read returns 0, nil which looks like
// io.EOF.
// TODO(bradfitz): make it wait for readability? (Issue 15735)
return 0, nil
}
if err := fd.pd.prepareRead(fd.isFile); err != nil {
return 0, err
}
if fd.IsStream && len(p) > maxRW {
p = p[:maxRW]
}
for {
// 系統調用 -- 讀取數據
n, err := syscall.Read(fd.Sysfd, p)
// 出現錯誤 -- 可能是一般錯誤也可能是數據未準備好的特殊錯誤(EAGAIN)
if err != nil {
n = 0
// syscall.EAGAIN錯誤 & 可以使用netpoller
// 比如:pollable := kind == kindOpenFile || kind == kindPipe || kind == kindNonBlock
// 那麼進入非阻塞的I/O流程
if err == syscall.EAGAIN && fd.pd.pollable() {
if err = fd.pd.waitRead(fd.isFile); err == nil {
continue
}
}
// On MacOS we can see EINTR here if the user
// pressed ^Z. See issue #22838.
if runtime.GOOS == "darwin" && err == syscall.EINTR {
continue
}
}
err = fd.eofError(n, err)
return n, err
}
}
/*internal/poll/fd_poll_runtime.go*/
func (pd *pollDesc) waitRead(isFile bool) error {
return pd.wait('r', isFile)
}
func (pd *pollDesc) wait(mode int, isFile bool) error {
if pd.runtimeCtx == 0 {
return errors.New("waiting for unsupported file type")
}
res := runtime_pollWait(pd.runtimeCtx, mode)
return convertErr(res, isFile)
}
/*runtime/netpoll.go*/
//go:linkname poll_runtime_pollWait internal/poll.runtime_pollWait
func poll_runtime_pollWait(pd *pollDesc, mode int) int {
err := netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
}
// As for now only Solaris and AIX use level-triggered IO.
if GOOS == "solaris" || GOOS == "aix" {
netpollarm(pd, mode)
}
for !netpollblock(pd, int32(mode), false) {
err = netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
}
// Can happen if timeout has fired and unblocked us,
// but before we had a chance to run, timeout has been reset.
// Pretend it has not happened and retry.
}
return 0
}
// 關鍵步驟
// 這是一個阻塞調用:如果IO準備好則退出,否則掛起
// returns true if IO is ready, or false if timedout or closed
// waitio - wait only for completed IO, ignore errors
func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
gpp := &pd.rg
if mode == 'w' {
gpp = &pd.wg
}
// 如果已經準備好,return true
// 如果沒有準備好,gopark
// set the gpp semaphore to WAIT
for {
old := *gpp
if old == pdReady {
*gpp = 0
return true
}
if old != 0 {
throw("runtime: double wait")
}
if atomic.Casuintptr(gpp, 0, pdWait) {
break
}
}
// waitio -- false
// 掛起當前goroutine
// need to recheck error states after setting gpp to WAIT
// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
if waitio || netpollcheckerr(pd, mode) == 0 {
gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
}
// 等待IO準備好,goroutine被喚醒,代碼從這裏恢復執行
// 狀態置爲pdReady
// 代碼退出到系統調用 -- 獲取數據
// be careful to not lose concurrent READY notification
old := atomic.Xchguintptr(gpp, 0)
if old > pdWait {
throw("runtime: corrupted polldesc")
}
return old == pdReady
}
以上的代碼,我們看到一個簡單的File.Read
是怎樣一步一步將當前FD添加到epoll模型,並將當前G掛起;
那麼,進行更加深入的分析,我們就要知道以下幾個問題:
- Go是如何初始化epoll?(epoll_create)
- 文件FD是如何添加/刪除到epoll中?(epoll_ctl)
- Go中是怎麼獲取到IO準備就緒事件的?(epoll_wait)
- Go是怎麼喚醒對應被掛起的goroutine?(重點)
需要提前明確的是:Go和epoll實例交互依舊是通過三個固定函數進行的,以系統調用的方式實現;
1.初始化
/*internal/poll/fd_poll_runtime.go*/
// 被動初始化(once)
var serverInit sync.Once
func (pd *pollDesc) init(fd *FD) error {
serverInit.Do(runtime_pollServerInit)
ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))
if errno != 0 {
if ctx != 0 {
runtime_pollUnblock(ctx)
runtime_pollClose(ctx)
}
return syscall.Errno(errno)
}
pd.runtimeCtx = ctx
return nil
}
/*runtime/netpoll.go*/
//go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit
func poll_runtime_pollServerInit() {
// 根據不同平臺,會有不同的netpoll實現
netpollinit()
atomic.Store(&netpollInited, 1)
}
/*runtime/netpoll_epoll.go*/
var (
// 全局的epoll文件描述符
epfd int32 = -1 // epoll descriptor
)
func netpollinit() {
// 先調用epollcreate1創建一個epoll實例,flag爲_EPOLL_CLOEXEC優化了epfd在競爭和跨線程的使用
epfd = epollcreate1(_EPOLL_CLOEXEC)
if epfd >= 0 {
return
}
// epollcreate1如果不成功,則使用epollcreate,size=1024(size參數在之後2.6.8之後已經被忽略)
epfd = epollcreate(1024)
if epfd >= 0 {
closeonexec(epfd)
return
}
println("runtime: epollcreate failed with", -epfd)
throw("runtime: netpollinit failed")
}
epoll_create1:https://linux.die.net/man/2/epoll_create1
在runtime層,如果有pollDesc被初始化則會被動的進行netpoller的初始化,然後調用平臺相關的netpollinit的實現;
首先epollcreate
和epollcreate1
的實現在彙編中(runtime/sys_linux_amd64.s),我們可以忽略;
同時,在Go中有一個全局的epfd,所以可以說runtime只會創建一個epoll實例來管理所有的IO事件;
2.添加和刪除
netpoller初始化後,就涉及到如何向epoll中添加和刪除FD,當然我們知道底層肯定是通過系統調用epoll_ctl來實現的;
添加
/*runtime/netpoll.go*/
// 參考: func (pd *pollDesc) init(fd *FD) error
ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))
//go:linkname poll_runtime_pollOpen internal/poll.runtime_pollOpen
func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
// pollDesc鏈表單獨管理,不能被GC釋放,複用
pd := pollcache.alloc()
lock(&pd.lock)
if pd.wg != 0 && pd.wg != pdReady {
throw("runtime: blocked write on free polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
throw("runtime: blocked read on free polldesc")
}
// 初始化
pd.fd = fd
pd.closing = false
pd.rseq++
pd.rg = 0
pd.rd = 0
pd.wseq++
pd.wg = 0
pd.wd = 0
unlock(&pd.lock)
var errno int32
// 註冊到netpoller,調用平臺實現
errno = netpollopen(fd, pd)
return pd, int(errno)
}
/*runtime/netpoll_epoll.go*/
func netpollopen(fd uintptr, pd *pollDesc) int32 {
var ev epollevent
// 普通數據 | 邊緣觸發
// 參考下表
ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLET
*(**pollDesc)(unsafe.Pointer(&ev.data)) = pd
return -epollctl(epfd, _EPOLL_CTL_ADD, int32(fd), &ev)
}
刪除
/*internal/poll/fd_poll_runtime.go*/
// close detaches pd from the runtime poller, if it was attached.
func (pd *pollDesc) close() {
	ctx := pd.runtimeCtx
	if ctx == 0 {
		return
	}
	runtime_pollClose(ctx)
	pd.runtimeCtx = 0
}
/*runtime/netpoll.go*/
func poll_runtime_pollClose(pd *pollDesc) {
if !pd.closing {
throw("runtime: close polldesc w/o unblock")
}
if pd.wg != 0 && pd.wg != pdReady {
throw("runtime: blocked write on closing polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
throw("runtime: blocked read on closing polldesc")
}
// 從netpoller中刪除,調用平臺實現
netpollclose(pd.fd)
// 釋放pollDesc對象
pollcache.free(pd)
}
/*runtime/netpoll_epoll.go*/
// netpollclose removes fd from the global epoll instance via the
// assembly-backed epollctl syscall wrapper (EPOLL_CTL_DEL).
func netpollclose(fd uintptr) int32 {
	// The DEL operation does not use the event argument, but the
	// wrapper takes one; pass a zero value.
	var unused epollevent
	return -epollctl(epfd, _EPOLL_CTL_DEL, int32(fd), &unused)
}
pollcache: 是通過鏈表結構實現的,alloc對應LPOP(沒有就創建一個),free對應LPUSH,以達到複用pollDesc對象的目的;
對netpoller中FD的操作僅發生在pollDesc初始化和關閉時,在epoll的實現中,通過epoll_ctl系統調用實現;
![epoll事件類型對照表](http://storage.aaronzz.xyz/docs/epoll_event_type.jpg)
3.事件獲取
/*runtime/netpoll_epoll.go*/
// polls for ready network connections
// returns list of goroutines that become runnable
// block == true 一直阻塞 block == false 不阻塞
func netpoll(block bool) gList {
if epfd == -1 {
return gList{}
}
waitms := int32(-1)
if !block {
waitms = 0
}
// 單次最多返回128個事件
var events [128]epollevent
retry:
// 系統調用
// epfd 全局的epoll文件描述符
// events 返回的事件列表
n := epollwait(epfd, &events[0], int32(len(events)), waitms)
if n < 0 {
if n != -_EINTR {
println("runtime: epollwait on fd", epfd, "failed with", -n)
throw("runtime: netpoll failed")
}
goto retry
}
var toRun gList
for i := int32(0); i < n; i++ {
ev := &events[i]
if ev.events == 0 {
continue
}
var mode int32
// 讀就緒 -- 和添加FD時event類型對應
if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 {
mode += 'r'
}
// 寫就緒 -- 和添加FD時event類型對應
if ev.events&(_EPOLLOUT|_EPOLLHUP|_EPOLLERR) != 0 {
mode += 'w'
}
if mode != 0 {
pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
// 查找可喚醒的goroutines
netpollready(&toRun, pd, mode)
}
}
// epoll_wait在某些信號下也會返回
if block && toRun.empty() {
goto retry
}
return toRun
}
1.Go中獲取epoll事件只有阻塞和非阻塞兩種模式(非timeout);
2.events的最大長度爲128;
在非阻塞模式下,調用epoll_wait獲取關注的event,並喚醒對應的goroutine;需要注意的是EPOLLERR也會喚醒,爲了讓錯誤能傳遞出來;
在阻塞模式下,重複非阻塞模式下的流程,直到有goroutine被喚醒爲止;
runtime中通過調用netpoll來主動獲取已經就緒的epoll_event,常見的觸發時機有以下幾個場景:
1.sysmon函數定時觸發
sysmon作爲調度器很重要的一環,會循環處理調度任務,包括調用netpoll函數;
if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
list := netpoll(false) // non-blocking - returns list of goroutines
if !list.empty() {
incidlelocked(-1)
injectglist(&list)
incidlelocked(1)
}
}
2.P查找可運行的G時
如果P在本地和全局隊列中沒有找到可用的G,會觸發netpoll;
func findrunnable() (gp *g, inheritTime bool)
if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 {
if list := netpoll(false); !list.empty() { // non-blocking
gp := list.pop()
injectglist(&list)
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
traceGoUnpark(gp, 0)
}
return gp, false
}
}
3.STW恢復時
STW期間可能已經有IO準備就緒,所以在STW結束後,會立即觸發netpoll;
func startTheWorldWithSema(emitTraceEvent bool) int64 {
_g_ := getg()
_g_.m.locks++ // disable preemption because it can be holding p in a local var
if netpollinited() {
list := netpoll(false) // non-blocking
injectglist(&list)
}
... ...
}
4.關聯被掛起的G
在netpoll
函數中會調用netpollready
來喚醒對應goroutine,那麼從epoll的event到goroutine它們是怎麼關聯起來的呢?
/*runtime/netpoll_epoll.go*/
func netpoll(block bool) gList {
...
if mode != 0 {
// epoll_event的用戶數據部分爲pollDesc
pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
netpollready(&toRun, pd, mode)
}
/*runtime/netpoll.go*/
func netpollready(toRun *gList, pd *pollDesc, mode int32) {
var rg, wg *g
if mode == 'r' || mode == 'r'+'w' {
rg = netpollunblock(pd, 'r', true)
}
if mode == 'w' || mode == 'r'+'w' {
wg = netpollunblock(pd, 'w', true)
}
if rg != nil {
toRun.push(rg)
}
if wg != nil {
toRun.push(wg)
}
}
// 查找可喚醒的G
func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g {
gpp := &pd.rg
if mode == 'w' {
gpp = &pd.wg
}
for {
// old保存了關聯的G
old := *gpp
if old == pdReady {
return nil
}
if old == 0 && !ioready {
// Only set READY for ioready. runtime_pollWait
// will check for timeout/cancel before waiting.
return nil
}
var new uintptr
if ioready {
new = pdReady
}
// 將pdRead保存在rg/wg
if atomic.Casuintptr(gpp, old, new) {
if old == pdReady || old == pdWait {
old = 0
}
// 返回關聯的G
return (*g)(unsafe.Pointer(old))
}
}
}
到此,會有一個疑問:爲什麼說wg
和rg
分別保存了關聯的G?因爲從之前的代碼中我們看到這兩個字段也會存pdWait等狀態。那麼,我們來梳理一下這兩個字段在整個流程中的變化;
## 1.pollDesc初始化 wg,rg == 0
/*runtime/netpoll.go*/
// 參考: func (pd *pollDesc) init(fd *FD) error
ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))
// 參考 func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int)
pd.rg = 0
pd.wg = 0
## 2.讀取數據得到EAGAIN,進行wait wg,rg --> pdWait --> G
/*runtime/netpoll.go*/
// 參考 func netpollblock(pd *pollDesc, mode int32, waitio bool) bool
for {
old := *gpp
if old == pdReady {
*gpp = 0
return true
}
if old != 0 {
throw("runtime: double wait")
}
// 更新爲pdWait狀態
if atomic.Casuintptr(gpp, 0, pdWait) {
break
}
}
if waitio || netpollcheckerr(pd, mode) == 0 {
// 掛起
// lock參數正好爲rg或者wg的地址
gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
}
// gopark定義
// unlockf爲回調函數,傳入參數爲lock
func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int)
// 進過gopark操作後回調
// gp當前goroutine
// gpp rg或者wg的地址
func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
// 關鍵步驟:將當前G保存到rg或者wg中
r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
if r {
// Bump the count of goroutines waiting for the poller.
// The scheduler uses this to decide whether to block
// waiting for the poller if there is nothing else to do.
atomic.Xadd(&netpollWaiters, 1)
}
return r
}
## 3.之後爲等待IO就緒,進入G的喚醒流程
所以,runtime之所以可以快速找到對應的G,有以下幾個關鍵步驟:
- 等待IO就緒,掛起當前G之後將當前G保存到rg或者wg;
- 添加FD到epoll實例中,用戶自定義數據部分爲整個pollDesc;
- IO就緒,返回的epoll_event信息包含pollDesc,同時也就能快速找到對應的rg或者wg;
總結
基於netpoller,Go語言讓程序員可以用阻塞的思想來編寫IO操作的代碼,但是底層卻自動實現了多路複用的機制;
G對象直接和FD關聯,且在整個流程都攜帶了G的地址信息,可以快速查找並喚醒相應的G;
同時,由於goroutine相較於線程有天生的優勢,調度開銷小;