The Linux connect System Call

Note: the analysis in this article is based on linux-2.6.0.

1. Function Prototype

Running man connect shows the following prototype:

#include <sys/types.h>          /* See NOTES */
#include <sys/socket.h>
int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
  • sockfd: the socket's file descriptor, i.e. the fd returned by the socket() system call
  • addr: pointer to the structure holding the address information
  • addrlen: size of the address structure; for an IPv4 address this is sizeof(struct sockaddr_in)
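
For illustration, here is a minimal user-space client using this prototype; the destination 127.0.0.1:8080 is only an example address:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);	/* TCP socket */
	if (fd < 0) {
		perror("socket");
		return EXIT_FAILURE;
	}

	struct sockaddr_in addr;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port   = htons(8080);			/* example port */
	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);

	/* This call enters the kernel through sys_connect(), analyzed below. */
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("connect");
		close(fd);
		return EXIT_FAILURE;
	}

	printf("connected\n");
	close(fd);
	return 0;
}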

2. Kernel Implementation

asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
	struct socket *sock;
	char address[MAX_SOCK_ADDR];
	int err;
	/* Look up the socket structure for the file descriptor fd */
	sock = sockfd_lookup(fd, &err);
	if (!sock)
		goto out;
	/* Copy the address from user space into the kernel */
	err = move_addr_to_kernel(uservaddr, addrlen, address);
	if (err < 0)
		goto out_put;
	/* Security module (LSM) check */
	err = security_socket_connect(sock, (struct sockaddr *)address, addrlen);
	if (err)
		goto out_put;
	/* Invoke the protocol's connect handler */
	/* For TCP this ends up calling inet_stream_connect() from inet_stream_ops */
	err = sock->ops->connect(sock, (struct sockaddr *) address, addrlen,
				 sock->file->f_flags);
out_put:
	sockfd_put(sock);
out:
	return err;
}
sock->ops points to inet_stream_ops, which binds inet_stream_connect(), inet_bind(), and the other per-family handlers. inet_stream_ops itself is referenced from the inetsw_array structure array, whose entries are keyed by socket type and transport protocol. The binding is established at initialization time: inet_init() loops over inetsw_array and calls inet_register_protosw() for each entry (an excerpt of that loop is shown after the array below). Here are the inetsw_array table and the inet_stream_ops structure used by TCP.

static struct inet_protosw inetsw_array[] =
{
        {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .capability = -1,
                .no_check =   0,
                .flags =      INET_PROTOSW_PERMANENT,
        },

        {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_UDP,
                .prot =       &udp_prot,
                .ops =        &inet_dgram_ops,
                .capability = -1,
                .no_check =   UDP_CSUM_DEFAULT,
                .flags =      INET_PROTOSW_PERMANENT,
        },

        {
                .type =       SOCK_RAW,
                .protocol =   IPPROTO_IP,	/* wild card */
                .prot =       &raw_prot,
                .ops =        &inet_dgram_ops,
                .capability = CAP_NET_RAW,
                .no_check =   UDP_CSUM_DEFAULT,
                .flags =      INET_PROTOSW_REUSE,
        }
};
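
For reference, the registration described above happens in inet_init() (net/ipv4/af_inet.c), roughly via the following loop (abbreviated excerpt, reproduced from the 2.6.0 source and possibly differing in minor details):

	/* Hook every inetsw_array entry into the inetsw[] lists, indexed by
	 * socket type, so that a later socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)
	 * finds tcp_prot and inet_stream_ops. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);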

struct proto_ops inet_stream_ops = {
	.family =	PF_INET,
	.owner =	THIS_MODULE,
	.release =	inet_release,
	.bind =		inet_bind,
	.connect =	inet_stream_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	inet_accept,
	.getname =	inet_getname,
	.poll =		tcp_poll,
	.ioctl =	inet_ioctl,
	.listen =	inet_listen,
	.shutdown =	inet_shutdown,
	.setsockopt =	inet_setsockopt,
	.getsockopt =	inet_getsockopt,
	.sendmsg =	inet_sendmsg,
	.recvmsg =	inet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	tcp_sendpage
};

3. The inet_stream_connect Function

int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int err;
	long timeo;

	lock_sock(sk);

	/* Special handling for AF_UNSPEC: calling connect() with the address
	 * family set to AF_UNSPEC dissolves any existing association, so the
	 * protocol's disconnect handler is called and the socket is returned
	 * to the unconnected state. */
	if (uaddr->sa_family == AF_UNSPEC) {
		err = sk->sk_prot->disconnect(sk, flags);
		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
		goto out;
	}

	/* Dispatch on the current socket state */
	switch (sock->state) {
	default:
		err = -EINVAL;
		goto out;
	/* Already connected */
	case SS_CONNECTED:
		err = -EISCONN;
		goto out;
	/* Connection attempt already in progress */
	case SS_CONNECTING:
		err = -EALREADY;
		/* Fall out of switch with err, set for this state */
		break;
	/* Not yet connected */
	case SS_UNCONNECTED:
		err = -EISCONN;
		/* The sock must still be in the closed state */
		if (sk->sk_state != TCP_CLOSE)
			goto out;
		/* Via the same inetsw_array binding, for TCP this ends up calling tcp_v4_connect() */
		err = sk->sk_prot->connect(sk, uaddr, addr_len);
		if (err < 0)
			goto out;
		/* Mark the socket as connecting */
		sock->state = SS_CONNECTING;

		/* Just entered SS_CONNECTING state; the only
		 * difference is that return value in non-blocking
		 * case is EINPROGRESS, rather than EALREADY.
		 */
		/* In the non-blocking case the error code returned is EINPROGRESS */
		err = -EINPROGRESS;
		break;
	}
	/* In blocking mode fetch the send timeout; in non-blocking mode it is 0 */
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/* A SYN has been sent or received */
	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Error code is set above */
		/* Non-blocking: exit immediately; blocking: wait for the connection, exiting if the remaining wait time reaches 0 */
		if (!timeo || !inet_wait_for_connect(sk, timeo))
			goto out;
		/* A signal interrupted us or the timeout was exhausted */
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
	}

	/* Connection was closed by RST, timeout, ICMP error
	 * or another process disconnected us.
	 */
	/* The connection is closed */
	if (sk->sk_state == TCP_CLOSE)
		goto sock_error;

	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
	 * and error was received after socket entered established state.
	 * Hence, it is handled normally after connect() return successfully.
	 */
	/* Mark the socket as connected */
	sock->state = SS_CONNECTED;
	err = 0;
out:
	release_sock(sk);
	return err;

sock_error:
	err = sock_error(sk) ? : -ECONNABORTED;
	/* Back to the unconnected state */
	sock->state = SS_UNCONNECTED;
	if (sk->sk_prot->disconnect(sk, flags))
		sock->state = SS_DISCONNECTING;
	goto out;
}
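
The EINPROGRESS path above is what a non-blocking connect relies on. Below is a minimal user-space sketch (connect_nonblock() is a hypothetical helper name): after connect() returns -1 with errno set to EINPROGRESS, wait for the socket to become writable, then read SO_ERROR to learn whether the handshake succeeded.

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int connect_nonblock(const char *ip, unsigned short port, int timeout_ms)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK);

	struct sockaddr_in addr;
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port   = htons(port);
	inet_pton(AF_INET, ip, &addr.sin_addr);

	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0)
		return fd;			/* connected immediately (rare) */
	if (errno != EINPROGRESS) {		/* real failure */
		close(fd);
		return -1;
	}

	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	if (poll(&pfd, 1, timeout_ms) != 1) {	/* timeout or poll error */
		close(fd);
		return -1;
	}

	int err = 0;
	socklen_t len = sizeof(err);
	getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
	if (err) {				/* handshake failed */
		close(fd);
		return -1;
	}
	return fd;				/* three-way handshake completed */
}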

4. The tcp_v4_connect Function

int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_opt *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;
	/* Validate the address length: it must be at least sizeof(struct sockaddr_in) */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;
	/* Must be the AF_INET address family */
	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;
	/* Next-hop and destination addresses */
	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}
	/* Look up a route for this flow and create a routing cache entry */
	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}
	
	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		/* If the socket is not bound to a source address, use the one
		   returned by the route lookup; clients normally do not bind one */
		inet->saddr = rt->rt_src;
	/* inet->rcv_saddr is the locally bound address, i.e. the source address */
	inet->rcv_saddr = inet->saddr;

	if (tp->ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->ts_recent	    = 0;
		tp->ts_recent_stamp = 0;
		tp->write_seq	    = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save last timestamp seen from
		 * the destination in peer table, when entering state TIME-WAIT
		 * and initialize ts_recent from it, when trying new connection.
		 */

		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->ts_recent_stamp = peer->tcp_ts_stamp;
			tp->ts_recent = peer->tcp_ts;
		}
	}
	/* Destination port and destination address */
	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	/* Move the socket to the SYN_SENT state */
	tcp_set_state(sk, TCP_SYN_SENT);
	/* Bind the local address and port, and insert the sock into the hash tables */
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;
	/* Redo the route lookup with the final ports: if the client did not
	   call bind(), tcp_v4_hash_connect() above picked an ephemeral port,
	   so the source port may have changed */
	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket.  */
	__sk_dst_set(sk, &rt->u.dst);
	tcp_v4_setup_caps(sk, &rt->u.dst);
	tp->ext2_header_len = rt->u.dst.header_len;

	if (!tp->write_seq)
		/* Generate the initial sequence number */
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;
	/* Hand over from the socket layer to the TCP layer: build and send the SYN */
	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	/* If the connect failed, the socket must be put back into TCP_CLOSE */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}
In the common case, a client does not call bind() to pick its own address and port before connecting; the system handles this automatically. Concretely, when connect() is called, tcp_v4_hash_connect() completes the binding of the local address and port.
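
If the client does want to choose its own source address and port, it can call bind() before connect(), in which case tcp_v4_hash_connect() takes the already-bound branch. A hedged sketch (connect_from() is a hypothetical helper; the addresses are up to the caller):

#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int connect_from(int fd, const char *src_ip, unsigned short src_port,
		 const struct sockaddr_in *dst)
{
	struct sockaddr_in src;

	memset(&src, 0, sizeof(src));
	src.sin_family = AF_INET;
	src.sin_port   = htons(src_port);
	inet_pton(AF_INET, src_ip, &src.sin_addr);

	/* Explicitly bind the source address/port before connecting. */
	if (bind(fd, (struct sockaddr *)&src, sizeof(src)) < 0)
		return -1;
	return connect(fd, (const struct sockaddr *)dst, sizeof(*dst));
}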

5. The tcp_v4_hash_connect Function

static int tcp_v4_hash_connect(struct sock *sk)
{
	unsigned short snum = inet_sk(sk)->num;
 	struct tcp_bind_hashbucket *head;
 	struct tcp_bind_bucket *tb;
	int ret;
	/* No local port bound yet */
 	if (!snum) {
 		int rover;
 		int low = sysctl_local_port_range[0];
 		int high = sysctl_local_port_range[1];
 		int remaining = (high - low) + 1;
		struct hlist_node *node;
 		struct tcp_tw_bucket *tw = NULL;

 		local_bh_disable();

 		/* TODO. Actually it is not so bad idea to remove
 		 * tcp_portalloc_lock before next submission to Linus.
 		 * As soon as we touch this place at all it is time to think.
 		 *
 		 * Now it protects single _advisory_ variable tcp_port_rover,
 		 * hence it is mostly useless.
 		 * Code will work nicely if we just delete it, but
 		 * I am afraid in contented case it will work not better or
 		 * even worse: another cpu just will hit the same bucket
 		 * and spin there.
 		 * So some cpu salt could remove both contention and
 		 * memory pingpong. Any ideas how to do this in a nice way?
 		 */
 		spin_lock(&tcp_portalloc_lock);
 		rover = tcp_port_rover;

 		do {
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
 			head = &tcp_bhash[tcp_bhashfn(rover)];
 			spin_lock(&head->lock);

 			/* Does not bother with rcv_saddr checks,
 			 * because the established check is already
 			 * unique enough.
 			 */
 			/* Check whether this port conflicts with a bind hash table entry */
			tb_for_each(tb, node, &head->chain) {
 				if (tb->port == rover) {
 					BUG_TRAP(!hlist_empty(&tb->owners));
 					if (tb->fastreuse >= 0)
 						goto next_port;
					/* No conflict in the bind table; also check the established and time-wait entries */
 					if (!__tcp_v4_check_established(sk,
									rover,
									&tw))
 						goto ok;
 					goto next_port;
 				}
 			}
 			/* The port is free to use: create a tcp_bind_bucket to record it */
 			tb = tcp_bucket_create(head, rover);
 			if (!tb) {
 				spin_unlock(&head->lock);
 				break;
 			}
 			/* Mark the bucket as non-reusable: fastreuse = -1 */
 			tb->fastreuse = -1;
 			goto ok;

 		next_port:
 			spin_unlock(&head->lock);
 		} while (--remaining > 0);
 		tcp_port_rover = rover;
 		spin_unlock(&tcp_portalloc_lock);

 		local_bh_enable();

 		return -EADDRNOTAVAIL;

ok:
 		/* All locks still held and bhs disabled */
 		tcp_port_rover = rover;
 		spin_unlock(&tcp_portalloc_lock);
		/* Insert into the bind hash table */
 		tcp_bind_hash(sk, tb, rover);
		if (sk_unhashed(sk)) {
 			/* Record the chosen source port in the sk */
 			inet_sk(sk)->sport = htons(rover);
 			__tcp_v4_hash(sk, 0);
 		}
 		spin_unlock(&head->lock);

 		if (tw) {
 			/* Remove the time-wait sock from the bind and established hash tables */
 			tcp_tw_deschedule(tw);
 			tcp_tw_put(tw);
 		}

		ret = 0;
		goto out;
 	}
	/* A local port was already bound */
 	head  = &tcp_bhash[tcp_bhashfn(snum)];
 	tb  = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* This sock is the only one bound to the port, so there can be no conflict; use it directly */
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		/* Check for conflicts against the established (and time-wait) entries */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
Choosing the port here resembles the bind() system call: the kernel must make sure the port it picks does not conflict with existing sockets, which involves checking several hash tables. The first is naturally the bind hash table, then the established hash table; later kernel versions split the time-wait sockets into a separate hash table, which then also has to be checked, but in this version they still live inside the established table (in its upper half). The conflict check against established and time-wait connections is performed by __tcp_v4_check_established().
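The scan over the ephemeral port range can be pictured with the following simplified user-space model; pick_local_port() and port_in_use() are hypothetical stand-ins for the rover loop and the hash-table conflict checks:

#include <stdbool.h>

/* Stand-in for the bind/established/time-wait hash lookups. */
static bool port_in_use(int port) { (void)port; return false; }

/* Try each port in [low, high] at most once, starting after *rover. */
int pick_local_port(int low, int high, int *rover)
{
	int remaining = high - low + 1;

	do {
		(*rover)++;
		if (*rover < low || *rover > high)
			*rover = low;		/* wrap around the range */
		if (!port_in_use(*rover))
			return *rover;		/* found a usable port */
	} while (--remaining > 0);

	return -1;				/* like -EADDRNOTAVAIL */
}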

6. The __tcp_v4_check_established Function

static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_opt *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	/* Hash the four-tuple to locate the established hash bucket */
	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;
		/* Does the four-tuple of this time-wait socket match? */
		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used
			   as state holder.

			   If TW bucket has been already destroyed we
			   fall back to VJ's scheme and use initial
			   timestamp retrieved from peer table.
			 */
			/* Everything matches: consider reusing the time-wait socket */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				/* Check whether the time-wait connection can be reused */
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				/* No conflict: inherit the timestamp state from the old connection */
				tp->ts_recent	    = tw->tw_ts_recent;
				tp->ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity. */
	/* Record the bound port number and the source port */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	/* Insert into the established hash table */
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(TimeWaitRecycled);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(TimeWaitRecycled);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}
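
A practical consequence of this four-tuple check, shown as a hedged sketch: with SO_REUSEADDR, two client sockets may bind the same local port and both connect successfully as long as the destinations differ, because the four-tuples stay unique. The destination addresses below are placeholders, and whether the second bind() succeeds also depends on the kernel's bind-conflict rules:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

/* Connect to dst_ip:dst_port from local port lport. */
static int connect_from_port(unsigned short lport,
			     const char *dst_ip, unsigned short dst_port)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int one = 1;
	struct sockaddr_in a;

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	memset(&a, 0, sizeof(a));
	a.sin_family      = AF_INET;
	a.sin_port        = htons(lport);
	a.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(fd, (struct sockaddr *)&a, sizeof(a)) < 0) {
		close(fd);
		return -1;
	}
	a.sin_port = htons(dst_port);
	inet_pton(AF_INET, dst_ip, &a.sin_addr);
	if (connect(fd, (struct sockaddr *)&a, sizeof(a)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

int main(void)
{
	/* Same local port, two different destinations: the four-tuples differ,
	   so __tcp_v4_check_established() finds no conflict. Point the
	   placeholder destinations at real listeners to try this. */
	int a = connect_from_port(6000, "192.0.2.1", 80);
	int b = connect_from_port(6000, "192.0.2.2", 80);
	printf("a=%d b=%d\n", a, b);
	if (a >= 0) close(a);
	if (b >= 0) close(b);
	return 0;
}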
With the local address and port bound, control returns to tcp_v4_connect(). The next step is to send the SYN segment, which is done by calling tcp_connect().

7. The tcp_connect Function

int tcp_connect(struct sock *sk)
{
	struct tcp_opt *tp = tcp_sk(sk);
	struct sk_buff *buff;
	/* Initialize the connection parameters of the sock */
	tcp_connect_init(sk);
	/* Allocate an skb for the SYN */
	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
	if (unlikely(buff == NULL))
		return -ENOBUFS;

	/* Reserve space for headers. */
	skb_reserve(buff, MAX_TCP_HEADER);
	/* Initialize the skb */
	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
	TCP_ECN_send_syn(sk, tp, buff);
	TCP_SKB_CB(buff)->sacked = 0;
	buff->csum = 0;
	TCP_SKB_CB(buff)->seq = tp->write_seq++;
	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;

	/* Send it off. */
	/* Record the transmit timestamp */
	TCP_SKB_CB(buff)->when = tcp_time_stamp;
	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
	__skb_queue_tail(&sk->sk_write_queue, buff);
	tcp_charge_skb(sk, buff);
	tp->packets_out++;
	/* Send the SYN segment */
	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
	TCP_INC_STATS(TcpActiveOpens);

	/* Timer for repeating the SYN until an answer. */
	/* Start the retransmission timer */
	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
	return 0;
}
After the parameters are initialized, alloc_skb() allocates a buffer to hold the SYN and the skb is queued on the socket's write queue; a clone made with skb_clone() is then passed to tcp_transmit_skb() for transmission, so the original stays queued and can be resent by the retransmission timer if no answer arrives.

8. The tcp_transmit_skb Function

int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct inet_opt *inet = inet_sk(sk);
		struct tcp_opt *tp = tcp_sk(sk);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* Build the TCP options for a SYN segment */
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			/* Build the TCP options for an established connection */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		/* Source and destination ports, sequence number, ACK number */
		th->source		= inet->sport;
		th->dest		= inet->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		/* Checksum and urgent pointer */
		th->check		= 0;
		th->urg_ptr		= 0;
        
		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}
		/* TCP options */
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
		      			      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
        
		tp->af_specific->send_check(sk, th, skb->len, skb);
		/* An ACK is being sent: update the delayed-ACK state */
		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb, sk);

		TCP_INC_STATS(TcpOutSegs);
		/* Hand the segment to the IP layer via ip_queue_xmit() */
		err = tp->af_specific->queue_xmit(skb, 0);
		if (err <= 0)
			return err;

		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}
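
A side note on the line *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); in the code above: it writes the 16-bit word at byte offset 12 of the TCP header, packing the data offset (header length in 32-bit words) into the top four bits and the flag bits into the low byte. Below is a minimal user-space sketch of the same packing (the TCPCB_FLAG_SYN value is copied from the kernel source):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPCB_FLAG_SYN 0x02	/* same value as in the 2.6.0 kernel */

int main(void)
{
	int tcp_header_size = 28;	/* 20-byte base header + 8 bytes of options */
	uint8_t flags = TCPCB_FLAG_SYN;

	/* doff (header length in 32-bit words) goes in the top 4 bits of the
	 * halfword at byte offset 12; the flag bits occupy the low byte. */
	uint16_t word = htons(((tcp_header_size >> 2) << 12) | flags);

	printf("doff=%d, packed halfword=0x%04x\n",
	       tcp_header_size >> 2, ntohs(word));	/* prints 0x7002 */
	return 0;
}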
tcp_transmit_skb() mainly builds the TCP header and the TCP options; with that, the TCP layer's work is done, and tp->af_specific->queue_xmit hands the segment to the IP layer. tp->af_specific points to ipv4_specific, which is assigned in the tcp_v4_init_sock() initialization function, and its queue_xmit member is ip_queue_xmit(). The ipv4_specific structure looks like this:
struct tcp_func ipv4_specific = {
	.queue_xmit	=	ip_queue_xmit,
	.send_check	=	tcp_v4_send_check,
	.rebuild_header	=	tcp_v4_rebuild_header,
	.conn_request	=	tcp_v4_conn_request,
	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
	.remember_stamp	=	tcp_v4_remember_stamp,
	.net_header_len	=	sizeof(struct iphdr),
	.setsockopt	=	ip_setsockopt,
	.getsockopt	=	ip_getsockopt,
	.addr2sockaddr	=	v4_addr2sockaddr,
	.sockaddr_len	=	sizeof(struct sockaddr_in),
};