Golang代碼筆記--netpoller

netpoller

背景介紹

I/O多路複用模型(I/O Multiplexing):

select

阻塞,直到有FD準備好,FD數量有FD_SETSIZE限制

int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds,
 struct timeval *timeout);

// readfds,writefds,exceptfds 需要檢查的FDs
// nfds 比最大的FD大1(減少內核比較次數)
// timeout 最大等待時間
// fd_set 位掩碼表示FD集合

poll

和select類似,在傳遞FD方式上不同

int poll(struct pollfd fds[], nfds_t nfds, int timeout);

// fds 需要檢查的FDs和事件(events,revents),結果寫入fds
// nfds fds中的FD數量
// timeout 最大等待時間

struct pollfd {
 int fd; /* File descriptor */
 short events; /* Requested events bit mask */
 short revents; /* Returned events bit mask */
};

epoll

區分水平觸發和邊緣觸發

// 創建epoll實例:紅黑樹,就緒隊列,返回epoll實例的FD
int epoll_create(int size);
int epoll_create1(int flag); // 與epoll_create類似,對epfd有一定的控制

// FD註冊
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *ev);
// epfd epoll_create返回的FD
// op 操作類型 EPOLL_CTL_ADD EPOLL_CTL_MOD EPOLL_CTL_DEL
// fd 操作對象FD
// ev 關注的事件類型(位掩碼)

struct epoll_event {
 uint32_t events; /* epoll events (bit mask) */
 epoll_data_t data; /* User data */
};

typedef union epoll_data {
 void *ptr; /* Pointer to user-defined data */
 int fd; /* File descriptor */
 uint32_t u32; /* 32-bit integer */
 uint64_t u64; /* 64-bit integer */
} epoll_data_t;

// 獲取就緒的FD
int epoll_wait(int epfd, struct epoll_event *evlist, int maxevents, int timeout);
// evlist,maxevents指定返回的event的存儲空間和數量

// 自動銷燬

epoll相對於其他I/O多路複用模型的優勢:

1.不需要每次調用傳遞FD(用戶態/內核態數據拷貝),epoll_ctl將FD添加到內核數據空間中;

2.epoll_wait的效率更高,不需要對比所有的FD,只需要從就緒隊列中獲取數據即可;

常見I/O模型:阻塞,非阻塞,I/O多路複用,信號驅動,異步I/O

分析實例:File.Read

在unix/linux平臺上,netpoller是基於epoll模型來實現的,以下分析也是限定於此;

以簡單的文件讀取(unix|linux平臺)爲例,分析從代碼層面開始是怎麼一步步使用netpoller的。

## 用戶代碼
// Example user code: open a file and print the bytes actually read.
func main() {
	f, err := os.Open("test.txt")
	if err != nil {
		log.Fatalln(err)
	}
	// Close the descriptor when done (the original example leaked it).
	defer f.Close()
	buf := make([]byte, 10)
	// Read may fill fewer than len(buf) bytes; print only what was read
	// instead of the full buffer padded with NUL bytes.
	n, _ := f.Read(buf)
	log.Println(string(buf[:n]))
}

使用os.Open創建一個File實例,核心是獲取到文件描述符(pfd)

// File represents an open file; the embedded *file holds the
// OS-specific state.
type File struct {
	*file // os specific
}

// Unix implementation.
// ## os/file.go
type file struct {
	pfd         poll.FD  // wraps the system file descriptor and its pollDesc
	name        string
	dirinfo     *dirInfo // nil unless directory being read
	nonblock    bool     // whether we set nonblocking mode
	stdoutOrErr bool     // whether this is stdout or stderr
}

/*os/file.go*/

// Read reads up to len(b) bytes into b after validating the descriptor,
// delegating to the platform-specific read; errors are wrapped with the
// "read" operation name.
func (f *File) Read(b []byte) (n int, err error) {
	if err := f.checkValid("read"); err != nil {
		return 0, err
	}
	n, e := f.read(b)
	return n, f.wrapErr("read", e)
}
/*os/file_unix.go (unix platform)*/

// read delegates to the underlying poll.FD; runtime.KeepAlive keeps f
// (and hence its descriptor) alive until the read returns.
func (f *File) read(b []byte) (n int, err error) {
	n, err = f.pfd.Read(b)
	runtime.KeepAlive(f)
	return n, err
}

調用Read函數,實際上調用了底層的FD.Read,其實現如下:

/*internal/poll/fd_unix.go*/

// Read reads from the descriptor, parking the goroutine via the
// netpoller and retrying when the fd is not yet readable (EAGAIN).
func (fd *FD) Read(p []byte) (int, error) {
	if err := fd.readLock(); err != nil {
		return 0, err
	}
	defer fd.readUnlock()
	if len(p) == 0 {
		// If the caller wanted a zero byte read, return immediately
		// without trying (but after acquiring the readLock).
		// Otherwise syscall.Read returns 0, nil which looks like
		// io.EOF.
		// TODO(bradfitz): make it wait for readability? (Issue 15735)
		return 0, nil
	}
	if err := fd.pd.prepareRead(fd.isFile); err != nil {
		return 0, err
	}
	if fd.IsStream && len(p) > maxRW {
		p = p[:maxRW]
	}
	for {
        // System call: read data from the descriptor.
		n, err := syscall.Read(fd.Sysfd, p)
        // The error may be an ordinary failure, or the special EAGAIN
        // meaning the data is not ready yet.
		if err != nil {
			n = 0
            // On syscall.EAGAIN, if the fd can use the netpoller
            // (e.g. pollable := kind == kindOpenFile || kind == kindPipe || kind == kindNonBlock),
            // enter the non-blocking I/O wait path and retry afterwards.
			if err == syscall.EAGAIN && fd.pd.pollable() {
				if err = fd.pd.waitRead(fd.isFile); err == nil {
					continue
				}
			}

			// On MacOS we can see EINTR here if the user
			// pressed ^Z.  See issue #22838.
			if runtime.GOOS == "darwin" && err == syscall.EINTR {
				continue
			}
		}
		err = fd.eofError(n, err)
		return n, err
	}
}
/*internal/poll/fd_poll_runtime.go*/

// waitRead blocks until the fd is ready for reading.
func (pd *pollDesc) waitRead(isFile bool) error {
	return pd.wait('r', isFile)
}

// wait parks the caller in the runtime poller until the fd is ready for
// mode ('r' or 'w'); runtimeCtx == 0 means the fd was never registered
// with the netpoller.
func (pd *pollDesc) wait(mode int, isFile bool) error {
	if pd.runtimeCtx == 0 {
		return errors.New("waiting for unsupported file type")
	}
	res := runtime_pollWait(pd.runtimeCtx, mode)
	return convertErr(res, isFile)
}
/*runtime/netpoll.go*/

// poll_runtime_pollWait blocks the caller until the fd described by pd
// is ready for mode ('r' or 'w'), or returns a non-zero error code.
//go:linkname poll_runtime_pollWait internal/poll.runtime_pollWait
func poll_runtime_pollWait(pd *pollDesc, mode int) int {
	err := netpollcheckerr(pd, int32(mode))
	if err != 0 {
		return err
	}
	// As for now only Solaris and AIX use level-triggered IO.
	if GOOS == "solaris" || GOOS == "aix" {
		netpollarm(pd, mode)
	}
	for !netpollblock(pd, int32(mode), false) {
		err = netpollcheckerr(pd, int32(mode))
		if err != 0 {
			return err
		}
		// Can happen if timeout has fired and unblocked us,
		// but before we had a chance to run, timeout has been reset.
		// Pretend it has not happened and retry.
	}
	return 0
}

// Key step.
// This is a blocking call: it returns immediately if I/O is ready,
// otherwise it parks the current goroutine until it is woken.
// returns true if IO is ready, or false if timedout or closed
// waitio - wait only for completed IO, ignore errors
func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
	gpp := &pd.rg
	if mode == 'w' {
		gpp = &pd.wg
	}
	
    // If already ready, return true.
    // Otherwise mark the semaphore as waiting and gopark below.
	// set the gpp semaphore to WAIT
	for {
		old := *gpp
		if old == pdReady {
			*gpp = 0
			return true
		}
		if old != 0 {
			throw("runtime: double wait")
		}
		if atomic.Casuintptr(gpp, 0, pdWait) {
			break
		}
	}

    // waitio is false on this code path.
    // Park the current goroutine.
	// need to recheck error states after setting gpp to WAIT
	// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
	// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
	if waitio || netpollcheckerr(pd, mode) == 0 {
		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
	}
    // Execution resumes here once I/O is ready and the goroutine is
    // woken; the state has been set to pdReady and the caller retries
    // the system call to fetch the data.
	// be careful to not lose concurrent READY notification
	old := atomic.Xchguintptr(gpp, 0)
	if old > pdWait {
		throw("runtime: corrupted polldesc")
	}
	return old == pdReady
}

以上的代碼,我們看到一個簡單的File.Read是怎樣一步一步將當前FD添加到epoll模型,並將當前G掛起;

那麼,進行更加深入的分析,我們就要知道以下幾個問題:

  1. Go是如何初始化epoll?(epoll_create)
  2. 文件FD是如何添加/刪除到epoll中?(epoll_ctl)
  3. Go中是怎麼獲取到IO準備就緒事件的?(epoll_wait)
  4. Go是怎麼喚醒對應被掛起的goroutine?(重點)

需要提前明確的是:Go和epoll實例交互依舊是通過三個固定函數進行的,以系統調用的方式實現;

1.初始化

/*internal/poll/fd_poll_runtime.go*/

// Lazy one-time initialization of the poller (sync.Once).
var serverInit sync.Once

// init registers fd with the runtime netpoller and stores the returned
// runtime context in pd.runtimeCtx; the first call also initializes the
// poller itself.
func (pd *pollDesc) init(fd *FD) error {
	serverInit.Do(runtime_pollServerInit)
	ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))
	if errno != 0 {
		if ctx != 0 {
			runtime_pollUnblock(ctx)
			runtime_pollClose(ctx)
		}
		return syscall.Errno(errno)
	}
	pd.runtimeCtx = ctx
	return nil
}

/*runtime/netpoll.go*/

//go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit
func poll_runtime_pollServerInit() {
    // Each platform provides its own netpollinit implementation.
	netpollinit()
	atomic.Store(&netpollInited, 1)
}
/*runtime/netpoll_epoll.go*/

var (
    // The single global epoll file descriptor used by the runtime.
	epfd int32 = -1 // epoll descriptor
)

// netpollinit creates the global epoll instance.
func netpollinit() {
    // First try epollcreate1; the _EPOLL_CLOEXEC flag sets
    // close-on-exec on the new epfd at creation time.
	epfd = epollcreate1(_EPOLL_CLOEXEC)
	if epfd >= 0 {
		return
	}
    // If epollcreate1 is unavailable, fall back to epollcreate with
    // size=1024 (the size argument has been ignored since Linux 2.6.8).
	epfd = epollcreate(1024)
	if epfd >= 0 {
		closeonexec(epfd)
		return
	}
	println("runtime: epollcreate failed with", -epfd)
	throw("runtime: netpollinit failed")
}

epoll_create1:https://linux.die.net/man/2/epoll_create1

在runtime層,如果有pollDesc被初始化則會被動的進行netpoller的初始化,然後調用平臺相關的netpollinit的實現;

首先epollcreate1和epollcreate的實現在彙編中(runtime/sys_linux_amd64.s),我們可以忽略;

同時,在Go中有一個全局的epfd,所以可以說runtime只會創建一個epoll實例來管理所有的IO事件;

2.添加和刪除

netpoller初始化後,就涉及到如何向epoll中添加和刪除FD,當然我們知道底層肯定是通過系統調用epoll_ctl來實現的;

添加

/*runtime/netpoll.go*/
// See: func (pd *pollDesc) init(fd *FD) error
ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))

// poll_runtime_pollOpen allocates a pollDesc for fd and registers the
// fd with the platform poller.
//go:linkname poll_runtime_pollOpen internal/poll.runtime_pollOpen
func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
    // pollDescs live in a dedicated list that is never released to the
    // GC, so entries can be reused.
	pd := pollcache.alloc()
	lock(&pd.lock)
	if pd.wg != 0 && pd.wg != pdReady {
		throw("runtime: blocked write on free polldesc")
	}
	if pd.rg != 0 && pd.rg != pdReady {
		throw("runtime: blocked read on free polldesc")
	}
    // Reset the (possibly reused) pollDesc.
	pd.fd = fd
	pd.closing = false
	pd.rseq++
	pd.rg = 0
	pd.rd = 0
	pd.wseq++
	pd.wg = 0
	pd.wd = 0
	unlock(&pd.lock)

	var errno int32
    // Register with the netpoller via the platform implementation.
	errno = netpollopen(fd, pd)
	return pd, int(errno)
}
/*runtime/netpoll_epoll.go*/

// netpollopen adds fd to the epoll instance in edge-triggered mode and
// stores pd in the event's user-data field for later lookup.
func netpollopen(fd uintptr, pd *pollDesc) int32 {
	var ev epollevent
    // Readable | writable | peer shutdown | edge-triggered.
    // See the event-type table below.
	ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLET
	*(**pollDesc)(unsafe.Pointer(&ev.data)) = pd
	return -epollctl(epfd, _EPOLL_CTL_ADD, int32(fd), &ev)
}

刪除

/*internal/poll/fd_poll_runtime.go*/

// close unregisters the fd from the runtime poller, if it was ever
// registered (runtimeCtx != 0).
func (pd *pollDesc) close() {
	if pd.runtimeCtx == 0 {
		return
	}
	runtime_pollClose(pd.runtimeCtx)
	pd.runtimeCtx = 0
}

/*runtime/netpoll.go*/

// poll_runtime_pollClose removes the fd from the poller and recycles pd.
func poll_runtime_pollClose(pd *pollDesc) {
	if !pd.closing {
		throw("runtime: close polldesc w/o unblock")
	}
	if pd.wg != 0 && pd.wg != pdReady {
		throw("runtime: blocked write on closing polldesc")
	}
	if pd.rg != 0 && pd.rg != pdReady {
		throw("runtime: blocked read on closing polldesc")
	}
    // Remove the fd from the netpoller via the platform implementation.
	netpollclose(pd.fd)
    // Return the pollDesc object to the cache for reuse.
	pollcache.free(pd)
}
/*runtime/netpoll_epoll.go*/

// netpollclose deletes fd from the epoll instance.
func netpollclose(fd uintptr) int32 {
	var ev epollevent
    // epollctl is an assembly-implemented system call wrapper.
	return -epollctl(epfd, _EPOLL_CTL_DEL, int32(fd), &ev)
}

pollcache: 是通過鏈表接口實現的alloc對應LPOP(沒有就創建一個),free對應LPUSH,以達到複用pollDesc對象;

對netpoller中FD的操作僅發生在pollDesc初始化和關閉時,在epoll的實現中,通過epoll_ctl系統調用實現;

[外鏈圖片轉存失敗,源站可能有防盜鏈機制,建議將圖片保存下來直接上傳(img-lS9in7xt-1571323238183)( http://storage.aaronzz.xyz/docs/epoll_event_type.jpg )]

3.事件獲取

/*runtime/netpoll_epoll.go*/

// polls for ready network connections
// returns list of goroutines that become runnable
// block == true: wait indefinitely; block == false: do not block.
func netpoll(block bool) gList {
	if epfd == -1 {
		return gList{}
	}
	waitms := int32(-1)
	if !block {
		waitms = 0
	}
    // At most 128 events are returned per call.
	var events [128]epollevent
retry:
    // System call:
    // epfd   the global epoll file descriptor
    // events receives the ready event list
	n := epollwait(epfd, &events[0], int32(len(events)), waitms)
	if n < 0 {
		if n != -_EINTR {
			println("runtime: epollwait on fd", epfd, "failed with", -n)
			throw("runtime: netpoll failed")
		}
		goto retry
	}
	var toRun gList
	for i := int32(0); i < n; i++ {
		ev := &events[i]
		if ev.events == 0 {
			continue
		}
		var mode int32
        // Read-ready -- matches the event mask registered in netpollopen.
		if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 {
			mode += 'r'
		}
        // Write-ready -- matches the event mask registered in netpollopen.
		if ev.events&(_EPOLLOUT|_EPOLLHUP|_EPOLLERR) != 0 {
			mode += 'w'
		}
		if mode != 0 {
			pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
			// Find the goroutines that can be woken up.
			netpollready(&toRun, pd, mode)
		}
	}
    // epoll_wait may also return on certain signals; in blocking mode
    // keep retrying until some goroutine becomes runnable.
	if block && toRun.empty() {
		goto retry
	}
	return toRun
}

1.Go中獲取epoll事件只有阻塞和非阻塞兩種(非timeout);

2.events的最大長度爲128;

在非阻塞模式下,調用epoll_wait獲取關注的event,並喚醒對應的goroutine;需要注意的是EPOLLERR也會喚醒,爲了讓錯誤能傳遞出來;

在阻塞模式下,重複非阻塞模式下的流程,直到有goroutine被喚醒爲止;

runtime中通過調用netpoll來主動獲取已經就緒的epoll_event,那麼它的觸發時機常見的有以下幾個場景:

1.sysmon函數定時觸發

sysmon作爲調度器很重要的一環,會循環處理調度任務,包括調用netpoll函數;

if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
    atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
    list := netpoll(false) // non-blocking - returns list of goroutines
    if !list.empty() {
        incidlelocked(-1)
        injectglist(&list)
        incidlelocked(1)
    }
}

2.P查找可運行的G時

如果P在本地和全局隊列中沒有找到可用的G,會觸發netpoll;

func findrunnable() (gp *g, inheritTime bool)

if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 {
    if list := netpoll(false); !list.empty() { // non-blocking
        gp := list.pop()
        injectglist(&list)
        casgstatus(gp, _Gwaiting, _Grunnable)
        if trace.enabled {
            traceGoUnpark(gp, 0)
        }
        return gp, false
    }
}

3.STW恢復時

STW期間可能已經有IO準備就緒,所以在STW結束後,會立即觸發netpoll;

func startTheWorldWithSema(emitTraceEvent bool) int64 {
	_g_ := getg()

	_g_.m.locks++ // disable preemption because it can be holding p in a local var
	if netpollinited() {
		list := netpoll(false) // non-blocking
		injectglist(&list)
	}
	... ...
}

4.關聯被掛起的G

netpoll函數中會調用netpollready來喚醒對應goroutine,那麼從epoll的event到goroutine它們是怎麼關聯起來的呢?

/*runtime/netpoll_epoll.go*/
func netpoll(block bool) gList {
   ...
    if mode != 0 {
        // epoll_event的用戶數據部分爲pollDesc
        pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
        netpollready(&toRun, pd, mode)
    }
/*runtime/netpoll.go*/

// netpollready collects onto toRun the goroutine(s) parked on pd that
// the ready mode ('r', 'w', or 'r'+'w') allows to be woken.
func netpollready(toRun *gList, pd *pollDesc, mode int32) {
	var rg, wg *g
	if mode == 'r' || mode == 'r'+'w' {
		rg = netpollunblock(pd, 'r', true)
	}
	if mode == 'w' || mode == 'r'+'w' {
		wg = netpollunblock(pd, 'w', true)
	}
	if rg != nil {
		toRun.push(rg)
	}
	if wg != nil {
		toRun.push(wg)
	}
}

// netpollunblock finds the goroutine (if any) parked on pd for mode and
// returns it so the caller can make it runnable.
func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g {
	gpp := &pd.rg
	if mode == 'w' {
		gpp = &pd.wg
	}

	for {
        // old holds the associated G (or a pdReady/pdWait/0 state value).
		old := *gpp
		if old == pdReady {
			return nil
		}
		if old == 0 && !ioready {
			// Only set READY for ioready. runtime_pollWait
			// will check for timeout/cancel before waiting.
			return nil
		}
		var new uintptr
		if ioready {
			new = pdReady
		}
        // Store pdReady (or 0) back into rg/wg.
		if atomic.Casuintptr(gpp, old, new) {
			if old == pdReady || old == pdWait {
				old = 0
			}
            // Return the associated G (nil if none was parked).
			return (*g)(unsafe.Pointer(old))
		}
	}
}

到此,會有一個疑問:爲什麼說wg和rg分別保存了關聯的G?因爲從之前的代碼中我們看到這兩個字段也會存pdWait等狀態。那麼,我們來梳理一下這兩個字段在整個流程中的變化;

## 1.pollDesc初始化 wg,rg == 0
/*runtime/netpoll.go*/
// 參考: func (pd *pollDesc) init(fd *FD) error
ctx, errno := runtime_pollOpen(uintptr(fd.Sysfd))
// 參考 func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int)
pd.rg = 0
pd.wg = 0

## 2.讀取數據得到EAGAIN,進行wait wg,rg --> pdWait --> G
/*runtime/netpoll.go*/
// 參考 func netpollblock(pd *pollDesc, mode int32, waitio bool) bool
for {
		old := *gpp
		if old == pdReady {
			*gpp = 0
			return true
		}
		if old != 0 {
			throw("runtime: double wait")
		}
    	// 更新爲pdWait狀態
		if atomic.Casuintptr(gpp, 0, pdWait) {
			break
		}
	}
	if waitio || netpollcheckerr(pd, mode) == 0 {
        // 掛起
        // lock參數正好爲rg或者wg的地址
		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
	}

// gopark definition:
// unlockf is a callback invoked with lock as its argument.
func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason waitReason, traceEv byte, traceskip int)

// Callback run by gopark after the goroutine has been parked.
// gp  - the current goroutine
// gpp - the address of rg or wg
func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
    // Key step: store the current G into rg or wg.
	r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
	if r {
		// Bump the count of goroutines waiting for the poller.
		// The scheduler uses this to decide whether to block
		// waiting for the poller if there is nothing else to do.
		atomic.Xadd(&netpollWaiters, 1)
	}
	return r
}

## 3.之後爲等待IO就緒,進入G的喚醒流程

所以,runtime之所以可以快速找到對應的G,有以下幾個關鍵步驟:

  1. 等待IO就緒,掛起當前G之後將當前G保存到rg或者wg;
  2. 添加FD到epoll實例中,用戶自定義數據部分爲整個pollDesc;
  3. IO就緒,返回的epoll_event信息包含pollDesc,同時也就能快速找到對應的rg或者wg;

總結

基於netpoller,Go語言讓程序員可以用阻塞的思想來編寫IO操作的代碼,但是底層卻自動實現了多路複用的機制;

G對象直接和FD關聯,且在整個流程都攜帶了G的地址信息,可以快速查找並喚醒相應的G;

同時,由於goroutine相較於線程有天生的優勢,調度開銷小;

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章