Linux Kernel2.6.9內核源碼分析--select

Linux Kernel2.6.9內核源碼分析–select

需要解決的問題:
通過追蹤內核源碼,查看內核是如何實現select監聽的功能

首先來看下select API的定義和參數:
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
參數說明:
int nfds:是一個整數值, 表示集合中所有文件描述符的範圍,即所有文件描述符的最大值+1
在後面的代碼中可以看到,Linux內核的實現方式是以 0 ~ nfds-1 作爲下標,在進程描述符的files數組中依次檢查各個文件描述符有沒有事件上報
fd_set *readfds, *writefds, *exceptfds:分別代表監聽讀/寫/異常的文件描述符集,實際上是一個由long型數組構成的位圖(bitmap),每個文件描述符對應其中的一個bit.當select返回後,內核會修改集合中的值,從而集合不再代表原始要監聽的文件描述符,因此每次調用select前都需要重新初始化這些文件描述符集.如果某個文件描述符上有事件發生,則將對應fds中的bit設置爲1,沒有事件發生就設置爲0。如readfds集合中第二個文件描述符有讀事件發生,則該位圖中第二個bit爲1,而沒有事件發生的bit都被清0,集合的內容已不再是調用前設定的監聽集合.
struct timeval *timeout:超時時間,超過這個時間後,無論有沒有監聽到事件,都不再阻塞而立刻返回.
返回值:成功時返回有事件發生的文件描述符個數(超時且沒有任何事件時返回0),出錯時返回-1並設置errno(內核系統調用本身返回負的錯誤碼)

再來看下select API對應的系統調用:
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
1.查看參數n是否超過了最大值max_fdset,超過了n就等於max_fdset
2.從slab中分配6 x FDS_BYTES(n)字節的空間(FDS_BYTES(n)是以long爲單位、足以容納n個bit的字節數),分別爲:fds.in, fds.out, fds.ex, fds.res_in, fds.res_out, fds.res_ex,然後將user space的參數inp, outp, exp分別copy到fds.in, fds.out, fds.ex
3.調用do_select,在該函數中會sleep直到有事件發生、收到信號或者timeout //核心函數
4.將fds.res_in, fds.res_out, fds.res_ex 通過set_fd_set copy回user space的inp, outp, exp,從而用戶就知道哪個文件描述符有事件發生.

/*
 * sys_select - system-call entry behind the select() API.
 *
 * NOTE: abridged excerpt; the dotted lines stand for code that the article
 * omitted (declarations and the timeout handling are not shown here).
 *
 * @n:    number of descriptors to examine; clamped to the process's
 *        current max_fdset.
 * @inp, @outp, @exp: user-space read/write/exception sets. They are loaded
 *        into kernel bitmaps before polling and overwritten with the
 *        result bitmaps afterwards.
 * @tvp:  user-space timeout (its handling is in the elided part).
 *
 * In the visible code the return value is: -ENOMEM if the bitmap buffer
 * cannot be allocated, the non-zero code from get_fd_set() if loading a set
 * fails, -EFAULT if writing a result set back fails, otherwise whatever
 * do_select() returned (possibly adjusted by the elided lines).
 */
long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    ...............
	/* max_fdset can increase, so grab it once to avoid race */
	max_fdset = current->files->max_fdset;
	if (n > max_fdset)
		n = max_fdset;
	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words. 
	 */
	ret = -ENOMEM;
	size = FDS_BYTES(n);
	bits = select_bits_alloc(size);
	if (!bits)
		goto out_nofds;
	/* Carve the single allocation into six consecutive bitmaps of `size` bytes each. */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	/* Load the three requested sets; the first failure aborts with its error code. */
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	/* Result bitmaps start empty; do_select() only ever sets bits in them. */
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, &timeout);
    ...........
	/* Publish the results by overwriting the caller's sets with the result bitmaps. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	select_bits_free(bits, size);
out_nofds:
	return ret;
}

再來看 int do_select(int n, fd_set_bits *fds, long *timeout)

  1. void poll_initwait(struct poll_wqueues *pwq) 是將struct poll_wqueues table變量進行初始化:
    /*
     * Per-call bookkeeping for one select(): do_select() initialises one
     * instance with poll_initwait() and releases it with poll_freewait().
     */
    struct poll_wqueues {
    poll_table pt;                   /* handed to f_op->poll(); its qproc member is the registration callback */
    struct poll_table_page * table;  /* storage for the registered wait entries; NULL right after init */
    int error;                       /* checked by do_select(), which returns it when non-zero; 0 after init */
    };
    其中pt 是一個poll_table結構,它內部只有一個函數指針成員qproc,其類型爲:typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
    table.pt.qproc = __pollwait 函數,後面再來分析這個函數
    table.table = NULL,
    table.error = 0
  2. 大循環依次遍歷每個文件描述符,調用file->f_op->poll,並只在第一輪遍歷時傳入table.pt:第一輪遍歷結束後wait被設爲NULL,因爲等待隊列項只需要掛到各個文件的等待隊列上一次,之後進程被喚醒再次遍歷時只需要查詢各文件的狀態,不必重複註冊
    在前一篇博客eventpoll中有解釋到,以本地socket爲例,file->f_op->poll,最終會調用到unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait),其會先調用
    /*
     * poll_wait - invoke the poll table's registration callback, if any.
     *
     * Does nothing when either the poll table @p or @wait_address is NULL,
     * so a poll handler can call it unconditionally even when the caller
     * passed no poll table.
     */
    static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
    {
    if (p && wait_address)
    p->qproc(filp, wait_address, p);
    }
    也就是前面table.pt中的qproc, 即__pollwait 函數
    再來看下__pollwait函數:
    /*
     * __pollwait - registration callback reached through poll_wait().
     *
     * Records @filp and @wait_address in the next free poll_table_entry and
     * queues the current task on @wait_address.
     *
     * NOTE: abridged excerpt. `table` is used below but its declaration (and
     * whatever derives it from @_p) is not shown here - consult the full
     * kernel source before relying on the details.
     */
    void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *_p)
    {

    {
    /* Claim the next free entry slot and advance the cursor past it. */
    struct poll_table_entry * entry = table->entry;
    table->entry = entry+1;
    /* presumably takes a reference on filp for as long as the entry is queued - confirm */
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    /* Build a wait-queue entry for the current task and link it onto wait_address. */
    init_waitqueue_entry(&entry->wait, current);
    add_wait_queue(wait_address,&entry->wait);
    }
    }
    也就是將代表當前進程的等待隊列項(entry->wait)加入到wait_address所指的等待隊列(對本地socket來說就是sk->sk_sleep),在socket狀態變化時,執行等待隊列項的回調函數,喚醒等待的進程.
  3. 調用file->f_op->poll返回後,如果有event,則將res_in/res_out/res_ex中對應的bit設定爲1.
    /*
     * unix_poll - poll handler that the article says file->f_op->poll
     * resolves to for a local (AF_UNIX) socket.
     *
     * First offers the socket's wait queue (sk->sk_sleep) to the caller's
     * poll table via poll_wait(), then builds and returns a POLL* bit mask
     * describing the socket's current state. poll_wait() ignores a NULL
     * @wait, so in that case only the state is reported.
     */
    static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
    {
    	struct sock *sk = sock->sk;
    	unsigned int mask;
    
    	poll_wait(file, sk->sk_sleep, wait);
    	mask = 0;
    
    	/* exceptional events? */
    	if (sk->sk_err)
    		mask |= POLLERR;
    	/* NOTE(review): SHUTDOWN_MASK presumably means both directions are shut down - confirm */
    	if (sk->sk_shutdown == SHUTDOWN_MASK)
    		mask |= POLLHUP;
    
    	/* readable? */
    	if (!skb_queue_empty(&sk->sk_receive_queue) ||
    	    (sk->sk_shutdown & RCV_SHUTDOWN))
    		mask |= POLLIN | POLLRDNORM;
    
    	/* Connection-based need to check for termination and startup */
    	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
    		mask |= POLLHUP;
    
    	/*
    	 * we set writable also when the other side has shut down the
    	 * connection. This prevents stuck sockets.
    	 */
    	if (unix_writable(sk))
    		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
    
    	return mask;
    }
    
/*
 * do_select - core of select(): poll every requested descriptor and sleep
 * until something is ready, the timeout runs out, or a signal is pending.
 *
 * NOTE: abridged excerpt; the dotted line stands for omitted declarations
 * and setup (table, wait, i, retval, __timeout are declared there).
 *
 * @n:       descriptors 0 .. n-1 are examined.
 * @fds:     in/out/ex are the requested bitmaps; res_in/res_out/res_ex
 *           receive one set bit per (descriptor, event class) that is ready.
 * @timeout: on return, *timeout is overwritten with the final __timeout.
 *
 * Returns the number of result bits set, 0 if nothing became ready, or
 * table.error if registration reported an error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    .............
	poll_initwait(&table); // -------> step 1
	wait = &table.pt;
	/* With a zero timeout we never sleep, so there is no point registering on wait queues. */
	if (!__timeout)
		wait = NULL;
	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		/*
		 * Mark ourselves sleeping before scanning; presumably so that a
		 * wake-up arriving during the scan is not lost by the later
		 * schedule_timeout() - confirm against the wait-queue docs.
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
		/*
		 * One outer iteration per bitmap word (__NFDBITS descriptors).
		 * `i` is the descriptor number and is advanced inside the body.
		 */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			/* Nothing requested in this word: skip all of its descriptors at once. */
			if (all_bits == 0) {
				i += __NFDBITS;
				continue;
			}

			/* Walk the word bit by bit; `bit` is the mask for descriptor i. */
			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;
				file = fget(i);
				if (file) {
					f_op = file->f_op;
					/* Files without a poll method report DEFAULT_POLLMASK. */
					mask = DEFAULT_POLLMASK;
					/*
					 * Author's question: why is wait dropped once retval > 0?
					 * Answer: as soon as one event has been found the loop
					 * below breaks without sleeping, so registering on any
					 * further wait queue would be wasted work.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					fput(file);
					/* Report an event only if the caller asked for that class on this fd. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
			}
			/* Result words were zeroed by the caller; store only non-empty ones. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}
		/* Registration is needed only on the first full pass; later passes just re-check state. */
		wait = NULL;
		if (retval || !__timeout || signal_pending(current))  // -------> step 3
			break;
		if(table.error) {
			retval = table.error;
			break;
		}
		/* Sleep; the value returned by schedule_timeout() becomes the new __timeout. */
		__timeout = schedule_timeout(__timeout);
	}
	__set_current_state(TASK_RUNNING);
	poll_freewait(&table);
	*timeout = __timeout;
	return retval;
}
發佈了26 篇原創文章 · 獲贊 0 · 訪問量 2850
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章