【Linux 內核網絡協議棧源碼剖析】accept 函數剖析

好,tcp 協議建立連接的幾個函數到這,就還差個accept 函數,這裏直接貼代碼了,不再向前面那般贅述了。

一、應用層——accept 函數

該函數返回一個已建立連接的可用於數據通信的套接字。

#include <sys/socket.h>
int accept(int sockfd, struct sockaddr *cliaddr, socklen_t *addrlen);
/* Returns: a non-negative descriptor on success, -1 on error. */
/* sockfd is a socket that has already been put into the listening state; it is used to
   listen on a port, and when a client connects to the server the connection uses a port
   number associated with this socket.
   Note that cliaddr and addrlen are result parameters: they are used to return the protocol
   address of the connected client. If the caller is not interested in the client's address,
   they can be passed as NULL. */
二、BSD Socket 層——sock_accept 函數

/*
 *	For accept, we attempt to create a new socket, set up the link
 *	with the client, wake up the client, then return the new
 *	connected fd. We collect the address of the connector in kernel
 *	space and move it to user at the very end. This is buggy because
 *	we open the socket then return an error.
 *
 *	fd is the descriptor of the listening socket. When upeer_sockaddr
 *	is non-NULL, the address reported by the new socket's getname
 *	operation is copied out through upeer_sockaddr / upeer_addrlen.
 *
 *	Returns the descriptor allocated for the new socket, or a negative
 *	errno value (-EBADF, -ENOTSOCK, -EINVAL, -ENOSR, or whatever the
 *	dup / accept operations returned).
 */
static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct file *file;
	struct socket *sock;
	struct socket *newsock;
	char address[MAX_SOCK_ADDR];
	int len;
	int err;
	int newfd;

	/* The descriptor must be in range and refer to an open file. */
	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	/* ... and that file must be backed by a socket. */
	sock = sockfd_lookup(fd, &file);
	if (sock == NULL)
		return -ENOTSOCK;

	/*
	 * Only an unconnected socket that carries SO_ACCEPTCON
	 * (i.e. listen has been done on it) may accept.
	 */
	if (sock->state != SS_UNCONNECTED)
		return -EINVAL;
	if ((sock->flags & SO_ACCEPTCON) == 0)
		return -EINVAL;

	/* Allocate the socket that will represent the accepted connection. */
	newsock = sock_alloc();
	if (newsock == NULL) {
		printk("NET: sock_accept: no more sockets\n");
		/* Was: EAGAIN, but we are out of system resources! */
		return -ENOSR;
	}
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * Initialise the new socket from the listening one, then hand over
	 * to the protocol's accept operation (inet_accept for INET sockets).
	 * A failure in either step releases the new socket.
	 */
	err = sock->ops->dup(newsock, sock);
	if (err >= 0)
		err = newsock->ops->accept(sock, newsock, file->f_flags);
	if (err < 0) {
		sock_release(newsock);
		return err;
	}

	/* Allocate a file descriptor for the new socket's inode. */
	newfd = get_fd(SOCK_INODE(newsock));
	if (newfd < 0) {
		sock_release(newsock);
		return -EINVAL;
	}

	/* Fetch the peer address and copy it out to user space if requested. */
	if (upeer_sockaddr != NULL) {
		newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 1);
		move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen);
	}

	return newfd;
}
三、INET Socket 層——inet_accept 函數

/*
 *	Accept a pending connection. The TCP layer now gives BSD semantics.
 */
/*
 * Called from sock_accept() through newsock->ops->accept.
 * sock is the listening socket; newsock is the socket (already set up by
 * dup in sock_accept) that will be used for communication once the
 * connection is accepted. flags are the file flags of the listening
 * descriptor (O_NONBLOCK is honoured).
 * Returns 0 on success or a negative errno value.
 */
static int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1, *sk2;
	int err;

	sk1 = (struct sock *) sock->data;

	/*
	 * We've been passed an extra socket.
	 * We need to free it up because the tcp module creates
	 * its own when it accepts one.
	 */
	/* If newsock->data already points at a sock, detach and destroy it;
	 * it is replaced below by the sock obtained from the accept. */
	if (newsock->data)
	{
	  	struct sock *sk=(struct sock *)newsock->data;
	  	newsock->data=NULL;
	  	sk->dead = 1;
	  	destroy_sock(sk);	/* destroy the old sock attached to newsock */
	}
  
	if (sk1->prot->accept == NULL)	/* protocol has no accept operation */
		return(-EOPNOTSUPP);

	/* Restore the state if we have been interrupted, and then returned. */
	/* A previous call that was interrupted by a signal while waiting (see
	 * the loop below) parked its sock in sk1->pair; pick that one up first. */
	if (sk1->pair != NULL ) 
	{
		sk2 = sk1->pair;
		sk1->pair = NULL;
	} 
	else
	{
		/* No parked sock: ask the transport layer (tcp_accept for TCP)
		 * for the next connection. */
		sk2 = sk1->prot->accept(sk1,flags);
		if (sk2 == NULL) 
		{
			/* The lower layer reports its failure through sk1->err. */
			if (sk1->err <= 0)
				printk("Warning sock.c:sk1->err <= 0.  Returning non-error.\n");
			err=sk1->err;
			sk1->err=0;
			return(-err);
		}
	}
	/* Link the BSD-level socket and the INET-level sock to each other. */
	newsock->data = (void *)sk2;	/* sk2 is the sock obtained above */
	sk2->sleep = newsock->wait;	/* wait queue used when sleeping on sk2 */
	sk2->socket = newsock;		/* back pointer to the owning socket */
	newsock->conn = NULL;
	if (flags & O_NONBLOCK) 
		return(0);

	cli(); /* avoid the race. */
	/* Wait while the connection is still in the middle of the handshake
	 * (SYN received, not yet established). */
	while(sk2->state == TCP_SYN_RECV) 
	{
		interruptible_sleep_on(sk2->sleep);
		/* Woken by an unblocked signal: undo the linkage and park sk2. */
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk1->pair = sk2;	/* remembered so the next call handles it first */
			sk2->sleep = NULL;
			sk2->socket=NULL;
			newsock->data = NULL;
			return(-ERESTARTSYS);
		}
	}
	sti();
	/* The handshake did not end in ESTABLISHED and an error is pending:
	 * report it and destroy the new sock. */
	if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) 
	{
		err = -sk2->err;
		sk2->err=0;
		sk2->dead=1;	/* ANK */
		destroy_sock(sk2);	/* destroy the newly created sock */
		newsock->data = NULL;
		return(err);
	}
	newsock->state = SS_CONNECTED;	/* the connection is established */
	return(0);
}
四、傳輸層——tcp_accept 函數

/*
 *	This will accept the next outstanding connection. 
 */
/*
 * Call chain: accept -> sock_accept -> inet_accept -> tcp_accept (TCP).
 * sk is the listening sock passed down from accept; the return value is
 * the sock of a connection that can be used for data transfer, taken from
 * the listening sock's queue via tcp_dequeue_established().
 * On failure NULL is returned and the reason is left in sk->err
 * (EINVAL, EAGAIN or ERESTARTSYS).
 */
static struct sock *tcp_accept(struct sock *sk, int flags)
{
	struct sock *newsk;
	struct sk_buff *skb;
  
  /*
   * We need to make sure that this socket is listening,
   * and that it has something pending.
   */

	if (sk->state != TCP_LISTEN)	/* not a listening socket: refuse */
	{
		sk->err = EINVAL;
		return(NULL); 
	}
	/* The socket is listening. */
	/* Avoid the race. */
	cli();
	sk->inuse = 1;	/* mark the sock as in use (acts as its lock) */
	
	/* Look for a queued connection that has completed; loop until one is found. */
	while((skb = tcp_dequeue_established(sk)) == NULL)
	{
		/* Nothing completed yet. */
		if (flags & O_NONBLOCK)	/* non-blocking: give up immediately */
		{
			sti();
			/* Per the original annotation, packets arriving while the sock
			 * is in use are parked on back_log and release_sock() feeds them
			 * back through the receive path -- helper not shown here. */
			release_sock(sk);
			sk->err = EAGAIN;
			return(NULL);
		}
     
		release_sock(sk);	/* drop the lock before sleeping (see note above) */
		interruptible_sleep_on(sk->sleep);
		/* Woken by an unblocked signal rather than by a connection. */
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk->err = ERESTARTSYS;
			return(NULL);
		}
		sk->inuse = 1;	/* re-take the lock before looking again */
  	}
	sti();

	/*
	 *	Now all we need to do is return skb->sk. 
	 */
	newsk = skb->sk;	/* the sock of the completed connection */

	kfree_skb(skb, FREE_READ);	/* the queued sk_buff is no longer needed */
	sk->ack_backlog--;	/* one fewer connection pending on the listener */
	release_sock(sk);	/* unlock; the original sock keeps listening */
	return(newsk);
}
好,定位到 tcp_dequeue_established 函數:

/*
 *	Remove a completed connection and return it. This is used by
 *	tcp_accept() to get connections from the queue.
 *
 *	The sk_buff chosen by tcp_find_established() is unlinked from its
 *	queue but not freed; NULL is returned when nothing qualifies.
 *	The lookup and unlink run with interrupts disabled, and the caller's
 *	interrupt flags are restored afterwards.
 */
static struct sk_buff *tcp_dequeue_established(struct sock *s)
{
	unsigned long saved_flags;
	struct sk_buff *found;

	save_flags(saved_flags);
	cli();

	found = tcp_find_established(s);
	if (found != NULL) {
		/* Take it off the queue */
		skb_unlink(found);
	}

	restore_flags(saved_flags);
	return found;
}
好,再定位到 tcp_find_established 函數:

/*
 *	Find someone to 'accept'. Must be called with
 *	sk->inuse=1 or cli()
 *
 *	Walks the listening sock's receive_queue and returns the first
 *	sk_buff whose owning sock is in TCP_ESTABLISHED or in any state
 *	numbered TCP_FIN_WAIT1 or above (presumably connections that were
 *	established and have already started closing -- confirm against the
 *	state enum). Returns NULL if the queue is empty or holds no such
 *	entry. Nothing is removed from the queue here.
 */
static struct sk_buff *tcp_find_established(struct sock *s)
{
	/* The queue is a circular list; its head doubles as the end marker. */
	struct sk_buff *head = (struct sk_buff *)&s->receive_queue;
	struct sk_buff *cur = skb_peek(&s->receive_queue);

	if (cur == NULL)
		return NULL;

	for (;;) {
		if (cur->sk->state == TCP_ESTABLISHED || cur->sk->state >= TCP_FIN_WAIT1)
			return cur;

		cur = cur->next;
		if (cur == head)
			break;
	}

	return NULL;
}
看到沒,這裏accept 函數返回的通信套接字是從監聽套接字的 receive_queue 隊列中獲得的,那麼通信套接字與監聽套接字的receive_queue隊列之間的關係,其中的細節又是什麼呢?

其內部實現在 tcp_conn_request 函數中:客戶端調用 connect 主動發起連接,SYN 數據包到達服務器後,由服務器端的接收路徑(tcp_rcv)調用該函數進行處理;服務器是被動打開的一方,客戶端是主動連接的一方。

/*
 *	This routine handles a connection request.
 *	It should make sure we haven't already responded.
 *	Because of the way BSD works, we have to send a syn/ack now.
 *	This also means it will be harder to close a socket which is
 *	listening.
 */
/*
 * sk is the listening sock and skb the received SYN segment.
 * daddr and saddr are named from the remote end's point of view: daddr is
 * the local address and saddr the remote address (they are swapped when
 * stored into the new sock below).
 * seq becomes the new sock's initial write_seq (per the original
 * annotation it is the value returned by tcp_init_seq() in the caller --
 * not visible here).
 * dev supplies the address mask and MTU used below (per the original
 * annotation, the interface the packet was received on).
 *
 * Per the original annotation this is called by tcp_rcv when a SYN arrives.
 * Outline: create a new sock for the connection by copying the listening
 * sock and adjusting fields, build and send a SYN+ACK segment, then append
 * the received SYN sk_buff -- now owned by the new sock -- to the listening
 * sock's receive_queue, where tcp_accept later finds it. The listening sock
 * itself keeps listening.
 */
static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
		 unsigned long daddr, unsigned long saddr,
		 struct options *opt, struct device *dev, unsigned long seq)
{
	struct sk_buff *buff;
	struct tcphdr *t1;
	unsigned char *ptr;
	struct sock *newsk;
	struct tcphdr *th;
	struct device *ndev=NULL;
	int tmp;
	struct rtable *rt;
  
	th = skb->h.th;	/* TCP header of the incoming segment */

	/* If the socket is dead, don't accept the connection. */
	if (!sk->dead) 
	{
  		sk->data_ready(sk,0);	/* notify the sock (presumably wakes a sleeper in accept) */
	}
	else	/* dead listening socket: answer with a reset and drop the segment */
	{
		if(sk->debug)
			printk("Reset on %p: Connect on dead socket.\n",sk);
		tcp_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * Make sure we can accept more.  This will prevent a
	 * flurry of syns from eating up all our memory.
	 */
	/* The pending-connection count has reached its limit: drop the SYN. */
	if (sk->ack_backlog >= sk->max_ack_backlog) 
	{
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}

	/*
	 * We need to build a new sock struct.
	 * It is sort of bad to have a socket without an inode attached
	 * to it, but the wake_up's will just wake up the listening socket,
	 * and if the listening socket is destroyed before this is taken
	 * off of the queue, this will take care of it.
	 */
	/* Allocate the new sock. */
	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
	if (newsk == NULL) 
	{
		/* just ignore the syn.  It will get retransmitted. */
		tcp_statistics.TcpAttemptFails++;
		kfree_skb(skb, FREE_READ);
		return;
	}
	/* Start from a copy of the listening sock, then override what differs. */
	memcpy(newsk, sk, sizeof(*newsk));
	/* Give the new sock its own empty write and receive queues. */
	skb_queue_head_init(&newsk->write_queue);
	skb_queue_head_init(&newsk->receive_queue);
	/* Empty retransmit list. */
	newsk->send_head = NULL;
	newsk->send_tail = NULL;
	skb_queue_head_init(&newsk->back_log);	/* empty back_log (temporary holding queue) */
	/* Per-connection fields of the new sock. */
	newsk->rtt = 0;		/*TCP_CONNECT_TIME<<3*/
	newsk->rto = TCP_TIMEOUT_INIT;
	newsk->mdev = 0;
	newsk->max_window = 0;
	newsk->cong_window = 1;
	newsk->cong_count = 0;
	newsk->ssthresh = 0;
	newsk->backoff = 0;
	newsk->blog = 0;
	newsk->intr = 0;
	newsk->proc = 0;
	newsk->done = 0;
	newsk->partial = NULL;
	newsk->pair = NULL;
	newsk->wmem_alloc = 0;
	newsk->rmem_alloc = 0;
	newsk->localroute = sk->localroute;

	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;

	newsk->err = 0;
	newsk->shutdown = 0;
	newsk->ack_backlog = 0;
	newsk->acked_seq = skb->h.th->seq+1;
	newsk->copied_seq = skb->h.th->seq+1;
	newsk->fin_seq = skb->h.th->seq;
	newsk->state = TCP_SYN_RECV;	/* SYN received, handshake not yet complete */
	newsk->timeout = 0;
	newsk->ip_xmit_timeout = 0;
	newsk->write_seq = seq; 
	newsk->window_seq = newsk->write_seq;
	newsk->rcv_ack_seq = newsk->write_seq;
	newsk->urg_data = 0;
	newsk->retransmits = 0;
	newsk->linger=0;
	newsk->destroy = 0;
	init_timer(&newsk->timer);
	newsk->timer.data = (unsigned long)newsk;
	newsk->timer.function = &net_timer;
	init_timer(&newsk->retransmit_timer);
	newsk->retransmit_timer.data = (unsigned long)newsk;
	newsk->retransmit_timer.function=&retransmit_timer;
	/* Our source port is the segment's destination port and vice versa. */
	newsk->dummy_th.source = skb->h.th->dest;
	newsk->dummy_th.dest = skb->h.th->source;
	
	/*
	 *	Swap these two, they are from our point of view. 
	 */
	 
	newsk->daddr = saddr;
	newsk->saddr = daddr;

	/* Register the new sock under its port number (per the original note,
	 * in the protocol's sock hash table -- put_sock is not shown here). */
	put_sock(newsk->num,newsk);
	newsk->dummy_th.res1 = 0;
	newsk->dummy_th.doff = 6;
	newsk->dummy_th.fin = 0;
	newsk->dummy_th.syn = 0;
	newsk->dummy_th.rst = 0;	
	newsk->dummy_th.psh = 0;
	newsk->dummy_th.ack = 0;
	newsk->dummy_th.urg = 0;
	newsk->dummy_th.res2 = 0;
	/* NOTE(review): these two repeat the assignments made earlier in this
	 * function with the same value. */
	newsk->acked_seq = skb->h.th->seq + 1;
	newsk->copied_seq = skb->h.th->seq + 1;
	newsk->socket = NULL;	/* no BSD socket attached until accept links one */

	/*
	 *	Grab the ttl and tos values and use them 
	 */

	newsk->ip_ttl=sk->ip_ttl;
	newsk->ip_tos=skb->ip_hdr->tos;

	/*
	 *	Use 512 or whatever user asked for 
	 */

	/*
	 * 	Note use of sk->user_mss, since user has no direct access to newsk 
	 */
	/* Look up the route towards the remote address. */
	rt=ip_rt_route(saddr, NULL,NULL);
	
	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
		newsk->window_clamp = rt->rt_window;
	else
		newsk->window_clamp = 0;
		
	/* Pick the MTU: user-set MSS first, then the route's MSS, otherwise
	 * 576 minus headers when the two addresses differ under the mask, or
	 * MAX_WINDOW when they do not. */
	if (sk->user_mss)
		newsk->mtu = sk->user_mss;
	else if(rt!=NULL && (rt->rt_flags&RTF_MSS))
		newsk->mtu = rt->rt_mss - HEADER_SIZE;
	else 
	{
#ifdef CONFIG_INET_SNARL	/* Sub Nets Are Local */
		if ((saddr ^ daddr) & default_mask(saddr))
#else
		if ((saddr ^ daddr) & dev->pa_mask)
#endif
			newsk->mtu = 576 - HEADER_SIZE;
		else
			newsk->mtu = MAX_WINDOW;
	}

	/*
	 *	But not bigger than device MTU 
	 */

	newsk->mtu = min(newsk->mtu, dev->mtu - HEADER_SIZE);

	/*
	 *	This will min with what arrived in the packet 
	 */

	tcp_options(newsk,skb->h.th);
	/* The new sock is set up; now allocate the SYN+ACK reply segment. */
	buff = newsk->prot->wmalloc(newsk, MAX_SYN_SIZE, 1, GFP_ATOMIC);
	if (buff == NULL) 
	{
		sk->err = ENOMEM;
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		/* And this will destroy it */
		release_sock(newsk);
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}
	/* Basic fields of the reply buffer. */
	buff->len = sizeof(struct tcphdr)+4;	/* TCP header plus 4 option bytes */
	buff->sk = newsk;	/* the reply belongs to the new sock */
	buff->localroute = newsk->localroute;

	t1 =(struct tcphdr *) buff->data;

	/*
	 *	Put in the IP header and routing stuff. 
	 */
	/* Per the original annotation this builds the MAC and IP headers;
	 * tmp is added to buff->len and used to advance t1 below. */
	tmp = sk->prot->build_header(buff, newsk->saddr, newsk->daddr, &ndev,
			       IPPROTO_TCP, NULL, MAX_SYN_SIZE,sk->ip_tos,sk->ip_ttl);

	/*
	 *	Something went wrong. 
	 */

	if (tmp < 0) 
	{
		/* NOTE(review): tmp is negative here, whereas the other failure
		 * path above stores a positive errno (ENOMEM) in sk->err and
		 * inet_accept warns when err <= 0 -- confirm the intended sign. */
		sk->err = tmp;
		buff->free = 1;
		kfree_skb(buff,FREE_WRITE);
		newsk->dead = 1;
		newsk->state = TCP_CLOSE;
		release_sock(newsk);
		skb->sk = sk;
		kfree_skb(skb, FREE_READ);
		tcp_statistics.TcpAttemptFails++;
		return;
	}

	buff->len += tmp;
	t1 =(struct tcphdr *)((char *)t1 +tmp);	/* TCP header follows the lower headers */
	/* Start the reply's TCP header from a copy of the incoming one. */
	memcpy(t1, skb->h.th, sizeof(*t1));
	buff->h.seq = newsk->write_seq;
	/*
	 *	Swap the send and the receive. 
	 */
	t1->dest = skb->h.th->source;
	t1->source = newsk->dummy_th.source;
	t1->seq = ntohl(newsk->write_seq++);
	t1->ack = 1;	/* ACK flag: this segment acknowledges the SYN */
	newsk->window = tcp_select_window(newsk);
	newsk->sent_seq = newsk->write_seq;
	t1->window = ntohs(newsk->window);
	t1->res1 = 0;
	t1->res2 = 0;
	t1->rst = 0;
	t1->urg = 0;
	t1->psh = 0;
	t1->syn = 1;	/* SYN flag, sent together with ACK */
	t1->ack_seq = ntohl(skb->h.th->seq+1);	/* acknowledge the peer's sequence number + 1 */
	t1->doff = sizeof(*t1)/4+1;	/* header length in 32-bit words, plus one word of options */
	/* Four option bytes right after the header: 2, 4, then newsk->mtu as
	 * two bytes, high byte first. */
	ptr =(unsigned char *)(t1+1);
	ptr[0] = 2;
	ptr[1] = 4;
	ptr[2] = ((newsk->mtu) >> 8) & 0xff;
	ptr[3] =(newsk->mtu) & 0xff;

	/* Checksum the segment (tcp_send_check is not shown here). */
	tcp_send_check(t1, daddr, saddr, sizeof(*t1)+4, newsk);
	/* Hand the reply to the protocol's queue_xmit for transmission. */
	newsk->prot->queue_xmit(newsk, ndev, buff, 0);
	reset_xmit_timer(newsk, TIME_WRITE , TCP_TIMEOUT_INIT);
	skb->sk = newsk;	/* the received SYN sk_buff now belongs to the new sock */

	/*
	 *	Charge the sock_buff to newsk. 
	 */
	/* Move the buffer's memory accounting from the listening sock to the
	 * new sock. */
	sk->rmem_alloc -= skb->mem_len;
	newsk->rmem_alloc += skb->mem_len;
	
	/* Append the received SYN sk_buff (owned by the new sock) to the
	 * listening sock's receive_queue; tcp_accept obtains the new sock
	 * from this queue. */
	skb_queue_tail(&sk->receive_queue,skb);
	sk->ack_backlog++;	/* one more connection pending on the listener */
	release_sock(newsk);
	tcp_statistics.TcpOutSegs++;
}
經過上面這個函數,我們得到了一個重要信息,就是客戶端向服務器發出連接請求後,服務器端新建了通信套接字,並構造 SYN+ACK 確認數據包回送給客戶端;同時把收到的 SYN 請求數據包(skb)的宿主改爲新建的通信套接字,並將其插入到監聽套接字的 receive_queue 隊列中。而 accept 函數返回的通信套接字,正是通過監聽套接字 receive_queue 隊列中這個數據包的 sk 字段獲得的。

好了,至此整個tcp連接建立過程算是剖析完畢了。







發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章