The Linux connect() System Call

Note: the analysis in this article is based on the linux-2.6.0 kernel.

1. Function prototype

Running man connect shows the following prototype:

#include <sys/types.h>          /* See NOTES */
#include <sys/socket.h>
int connect(int sockfd, const struct sockaddr *addr,socklen_t addrlen);
  • sockfd: the socket file descriptor returned by the socket() system call
  • addr: pointer to the structure holding the address to connect to
  • addrlen: size of that address structure, i.e. sizeof(struct sockaddr); a minimal user-space example follows
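
For illustration, here is a minimal, hedged user-space example of the call (not taken from the kernel source; the address 127.0.0.1 and port 8080 are placeholders). This is the path that enters sys_connect() analysed below:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	/* Create a TCP socket; its fd is the sockfd argument of connect() */
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Fill in the peer address (placeholder values) */
	struct sockaddr_in addr;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(8080);
	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

	/* Enters the kernel via sys_connect() */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		close(fd);
		return 1;
	}

	printf("connected\n");
	close(fd);
	return 0;
}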

2. Kernel implementation

asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err;
	/* Look up the struct socket that corresponds to the file descriptor fd */
	sock = sockfd_lookup(fd, &err);
	if (!sock)
		goto out;
	/* Copy the address from user space into kernel space */
	err = move_addr_to_kernel(uservaddr, addrlen, address);
	if (err < 0)
		goto out_put;
	/* Security module (LSM) hook */
	err = security_socket_connect(sock, (struct sockaddr *)address, addrlen);
	if (err)
		goto out_put;
	/* Call the protocol-specific connect handler; for TCP this resolves
	 * to inet_stream_connect() in inet_stream_ops */
	err = sock->ops->connect(sock, (struct sockaddr *) address, addrlen,
				 sock->file->f_flags);
out_put:
	sockfd_put(sock);
out:
	return err;
}
sock->ops points to inet_stream_ops, which binds inet_stream_connect(), inet_bind() and the other socket operations. inet_stream_ops itself appears in the inetsw_array table, whose entries are bound to their transport protocol type at initialization time: inet_init() walks inetsw_array and calls inet_register_protosw() for each entry. The inetsw_array table and the inet_stream_ops structure used by TCP are shown below.

static struct inet_protosw inetsw_array[] =
{
        {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .capability = -1,
                .no_check =   0,
                .flags =      INET_PROTOSW_PERMANENT,
        },

        {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_UDP,
                .prot =       &udp_prot,
                .ops =        &inet_dgram_ops,
                .capability = -1,
                .no_check =   UDP_CSUM_DEFAULT,
                .flags =      INET_PROTOSW_PERMANENT,
       },
        

       {
               .type =       SOCK_RAW,
               .protocol =   IPPROTO_IP,	/* wild card */
               .prot =       &raw_prot,
               .ops =        &inet_dgram_ops,
               .capability = CAP_NET_RAW,
               .no_check =   UDP_CSUM_DEFAULT,
               .flags =      INET_PROTOSW_REUSE,
       }
};

struct proto_ops inet_stream_ops = {
	.family =	PF_INET,
	.owner =	THIS_MODULE,
	.release =	inet_release,
	.bind =		inet_bind,
	.connect =	inet_stream_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	inet_accept,
	.getname =	inet_getname,
	.poll =		tcp_poll,
	.ioctl =	inet_ioctl,
	.listen =	inet_listen,
	.shutdown =	inet_shutdown,
	.setsockopt =	inet_setsockopt,
	.getsockopt =	inet_getsockopt,
	.sendmsg =	inet_sendmsg,
	.recvmsg =	inet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	tcp_sendpage
};
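
For reference, the registration described above takes place in inet_init() (net/ipv4/af_inet.c). The following is a simplified, paraphrased excerpt rather than a verbatim copy of the 2.6.0 source:

static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	...
	/* Initialize the per-socket-type lists */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/* Register every entry of inetsw_array, binding e.g. SOCK_STREAM/IPPROTO_TCP
	 * to tcp_prot and inet_stream_ops */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);
	...
}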

3. inet_stream_connect()

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int err;
	long timeo;

	lock_sock(sk);

	/* Special handling for AF_UNSPEC: connecting to an address whose
	 * sa_family is AF_UNSPEC asks the kernel to dissolve any existing
	 * association, so the protocol's disconnect handler is invoked and
	 * the socket state is reset accordingly. */
	if (uaddr->sa_family == AF_UNSPEC) {
		err = sk->sk_prot->disconnect(sk, flags);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	/* Dispatch on the socket state */
	switch (sock->state) {
	default:
		err = -EINVAL;
		goto out;
	/* Already connected */
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	/* A connection attempt is already in progress */
	case SS_CONNECTING:
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	/* Not yet connected */
	case SS_UNCONNECTED:
		err = -EISCONN;
		/* The TCP state must be CLOSED */
		if (sk->sk_state != TCP_CLOSE)
			goto out;
		/* sk->sk_prot is tcp_prot (bound via inetsw_array), so this calls tcp_v4_connect() */
		err = sk->sk_prot->connect(sk, uaddr, addr_len);
		if (err < 0)
			goto out;
		/* Mark the socket as connecting */
		sock->state = SS_CONNECTING;

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		/* In the non-blocking case the error code returned to user space is EINPROGRESS */
		err = -EINPROGRESS;
		break;
	}
	/* Blocking sockets get the send timeout here; non-blocking sockets get 0 */
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* A SYN has been sent or received (SYN_SENT / SYN_RECV) */
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Error code is set above */
		/* Non-blocking: return immediately with the error set above.
		 * Blocking: wait for the handshake; if the wait ends with no time left, bail out. */
		if (!timeo || !inet_wait_for_connect(sk, timeo))
			goto out;
		/* Interrupted by a signal: return ERESTARTSYS or EINTR depending on the remaining timeout */
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	/* The socket fell back to CLOSED */
	if (sk->sk_state == TCP_CLOSE)
		goto sock_error;

	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */
	/* Mark the socket as connected */
	sock->state = SS_CONNECTED;
	err = 0;
out:
	release_sock(sk);
	return err;

sock_error:
	err = sock_error(sk) ? : -ECONNABORTED;
	/* Mark the socket as unconnected */
	sock->state = SS_UNCONNECTED;
	if (sk->sk_prot->disconnect(sk, flags))
		sock->state = SS_DISCONNECTING;
	goto out;
}
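
As the switch above shows, connect() on a non-blocking socket returns as soon as the SYN is out, with -EINPROGRESS. A common user-space pattern, sketched here assuming standard POSIX sockets (not taken from the article), is to poll for writability and then read SO_ERROR, which mirrors sk->sk_err:

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* Returns a connected fd, or -1 on failure. ip/port are caller-supplied. */
int connect_nonblock(const char *ip, unsigned short port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

	struct sockaddr_in addr;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	inet_pton(AF_INET, ip, &addr.sin_addr);

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
		return fd;                     /* connected immediately (e.g. loopback) */
	if (errno != EINPROGRESS) {            /* a real failure, not "handshake pending" */
		close(fd);
		return -1;
	}

	/* Wait up to 5 seconds for the three-way handshake to finish */
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	if (poll(&pfd, 1, 5000) <= 0) {
		close(fd);
		return -1;
	}

	/* SO_ERROR reports the outcome recorded in sk->sk_err */
	int err = 0;
	socklen_t len = sizeof(err);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	if (err != 0) {
		close(fd);
		return -1;
	}
	return fd;
}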

4. tcp_v4_connect()

int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;
	/* The supplied length must cover at least a struct sockaddr_in */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;
	/* Only AF_INET addresses are accepted */
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;
	/* Next-hop and destination addresses */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}
	/* Look up a route for this connection and create a routing-cache entry */
	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}
	
	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	/* If the socket has not bound a source IP (the usual case for a client),
	 * use the source address chosen by the route lookup */
	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	/* inet->rcv_saddr is the locally bound IP address, i.e. the source address */
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */

		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}
	/* Destination port and destination address */
	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	/* Move the socket into the SYN_SENT state */
	tcp_set_state(sk, TCP_SYN_SENT);
	/* Bind the source IP/port if necessary and add the sock to the hash tables */
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;
	/* Redo the route lookup with the final ports: if the client did not call
	 * bind(), tcp_v4_hash_connect() above has just picked an ephemeral source
	 * port, so the source port may have changed */
	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	/* Generate the initial sequence number unless one is already set */
	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;
	/* Hand over from the socket layer to the TCP layer: build and send the SYN */
	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	/* On failure, put the socket back into the TCP_CLOSE state */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
In the common case a client does not call bind() to choose its own IP address and port; the kernel does this automatically. Concretely, when connect() is called, the binding of the source IP and port is carried out by **tcp_v4_hash_connect()**.
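
A quick way to observe this automatic binding from user space (an illustrative sketch, not part of the kernel walk-through) is to call getsockname() after connect() succeeds on an unbound socket:

#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* Print the source IP/port that the kernel picked for a connected socket */
static void print_local_endpoint(int fd)
{
	struct sockaddr_in local;
	socklen_t len = sizeof(local);
	char ip[INET_ADDRSTRLEN];

	if (getsockname(fd, (struct sockaddr *)&local, &len) == 0) {
		inet_ntop(AF_INET, &local.sin_addr, ip, sizeof(ip));
		/* e.g. "auto-bound to 192.168.1.10:49322" */
		printf("auto-bound to %s:%u\n", ip, ntohs(local.sin_port));
	}
}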

5. tcp_v4_hash_connect()

static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
 	struct tcp_bind_hashbucket *head;
 	struct tcp_bind_bucket *tb;
	int ret;
	/* No local port bound yet: pick an ephemeral port */
 	if (!snum) {
 		int rover;
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
		struct hlist_node *node;
 		struct tcp_tw_bucket *tw = NULL;

 		local_bh_disable();

 		/* TODO. Actually it is not so bad idea to remove
 		 * tcp_portalloc_lock before next submission to Linus.
 		 * As soon as we touch this place at all it is time to think.
 		 *
 		 * Now it protects single _advisory_ variable tcp_port_rover,
 		 * hence it is mostly useless.
 		 * Code will work nicely if we just delete it, but
 		 * I am afraid in contented case it will work not better or
 		 * even worse: another cpu just will hit the same bucket
 		 * and spin there.
 		 * So some cpu salt could remove both contention and
 		 * memory pingpong. Any ideas how to do this in a nice way?
 		 */
 		spin_lock(&tcp_portalloc_lock);
 		rover = tcp_port_rover;

 		do {
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
 			head = &tcp_bhash[tcp_bhashfn(rover)];
 			spin_lock(&head->lock);

 			/* Does not bother with rcv_saddr checks,
 			 * because the established check is already
 			 * unique enough.
 			 */
 			/* Check whether this port collides with an entry in the bind hash table */
			tb_for_each(tb, node, &head->chain) {
 				if (tb->port == rover) {
 					BUG_TRAP(!hlist_empty(&tb->owners));
 					if (tb->fastreuse >= 0)
 						goto next_port;
					/* No reusable bind-table entry; check whether the four-tuple collides with an established or time-wait socket */
 					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
 						goto ok;
 					goto next_port;
 				}
 			}
 			/* The port does not conflict and can be used: create a tcp_bind_bucket to record it */
 			tb = tcp_bucket_create(head, rover);
 			if (!tb) {
 				spin_unlock(&head->lock);
 				break;
 			}
 			/* fastreuse = -1 marks the bucket as owned by connecting sockets only */
 			tb->fastreuse = -1;
 			goto ok;

 		next_port:
 			spin_unlock(&head->lock);
 		} while (--remaining > 0);
 		tcp_port_rover = rover;
 		spin_unlock(&tcp_portalloc_lock);

 		local_bh_enable();

 		return -EADDRNOTAVAIL;

ok:
 		/* All locks still held and bhs disabled */
 		tcp_port_rover = rover;
 		spin_unlock(&tcp_portalloc_lock);
 		/* Link the sock into the bind hash table */
 		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
 			/* Record the chosen source port in the sock */
 			inet_sk(sk)->sport = htons(rover);
 			__tcp_v4_hash(sk, 0);
 		}
 		spin_unlock(&head->lock);

 		if (tw) {
 			/* Remove the recycled time-wait sock from the bind and established hash tables */
 			tcp_tw_deschedule(tw);
 			tcp_tw_put(tw);
 		}

		ret = 0;
		goto out;
 	}
 	/* A local port has already been bound (via bind()) */
 	head  = &tcp_bhash[tcp_bhashfn(snum)];
 	tb  = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* This sock is the only owner of the port, so there can be no conflict; use it directly */
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		/* Not conclusive: check against the established (and time-wait) sockets */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
Port selection here resembles the bind() system call: the kernel must make sure the chosen port does not conflict with existing sockets, which involves checking several hash tables. The first is the bind hash table, then the established hash table; later kernel versions split time-wait sockets into a hash table of their own, which then also has to be checked, but in this kernel version they still live on a chain of the established table (the second half of tcp_ehash, as the head + tcp_ehash_size lookup below shows). The conflict check against established and time-wait connections is performed by __tcp_v4_check_established().
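
The ephemeral range searched by the loop above (sysctl_local_port_range[0..1]) is exposed to user space as net.ipv4.ip_local_port_range. A small hedged helper to read it from procfs (illustrative only, not part of the kernel code):

#include <stdio.h>

/* Print the low/high bounds of the ephemeral port range the kernel searches */
static void print_local_port_range(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/ip_local_port_range", "r");
	int low, high;

	if (f && fscanf(f, "%d %d", &low, &high) == 2)
		printf("ephemeral port range: %d-%d\n", low, high);
	if (f)
		fclose(f);
}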

6. __tcp_v4_check_established()

static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	/* Hash the four-tuple to locate the established hash bucket */
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;
		/* Does the four-tuple (and bound device) match this time-wait socket? */
		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			/* The four-tuple matches a time-wait socket: consider reusing it */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				/* Reuse is permitted: start write_seq beyond the old connection's snd_nxt (never 0) */
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				/* Inherit the timestamp state from the time-wait socket */
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	/* Record the local port number and the source port */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	/* Link the sock into the established hash chain */
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
Once the source IP and port are bound, control returns to tcp_v4_connect(). The next step is to send the SYN segment, which is done by calling tcp_connect().

7. tcp_connect()

int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;
	/* Initialize the connection parameters of the sock */
	tcp_connect_init(sk);
	/* Allocate an sk_buff for the SYN segment */
	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);
	/* Initialize the skb: SYN flag, sequence numbers, checksum field */
	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(sk, tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	/* Record the transmission timestamp */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->sk_write_queue, buff);
	tcp_charge_skb(sk, buff);
	tp->packets_out++;
	/* Transmit a clone of the SYN; the original stays queued for retransmission */
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	/* Start the retransmission timer so the SYN is resent until answered */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
After the connection parameters have been initialized, alloc_skb() provides the buffer that carries the SYN segment; tcp_connect() then clones it with skb_clone() and hands the clone to tcp_transmit_skb() for transmission, keeping the original on the write queue in case it has to be retransmitted.

8. tcp_transmit_skb()

int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct inet_opt *inet = inet_sk(sk);
		struct tcp_opt *tp = tcp_sk(sk);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* Compute the TCP option space for a SYN segment */
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			/* Account for SACK blocks on an established connection */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		/* Build the TCP header and checksum it:
		 * source port, destination port, sequence and ACK numbers */
		th->source		= inet->sport;
		th->dest		= inet->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		/* Checksum and urgent pointer */
		th->check		= 0;
		th->urg_ptr		= 0;
        
		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}
		/* Fill in the TCP options */
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
		      			      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
        
		tp->af_specific->send_check(sk, th, skb->len, skb);
		/* If this segment carries an ACK, update the delayed-ACK state */
		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb, sk);

		TCP_INC_STATS(TcpOutSegs);
		/* Hand the segment to the IP layer: for IPv4 queue_xmit is ip_queue_xmit() */
		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
tcp_transmit_skb() mainly builds the TCP header and the TCP options; with that, the TCP-layer processing is complete and tp->af_specific->queue_xmit passes the segment down to the IP layer. tp->af_specific points to ipv4_specific, which is assigned in the initialization function tcp_v4_init_sock(), and whose queue_xmit member is ip_queue_xmit(). The ipv4_specific structure looks like this:
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};
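
For completeness, the assignment of tp->af_specific happens in tcp_v4_init_sock(); the following is a paraphrased excerpt (an assumption based on the 2.6.0 tcp_ipv4.c, not a verbatim copy):

static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	...
	sk->sk_state = TCP_CLOSE;
	...
	/* Bind the IPv4-specific function table, so that
	 * tp->af_specific->queue_xmit resolves to ip_queue_xmit() */
	tp->af_specific = &ipv4_specific;
	...
	return 0;
}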