TCP optimization



 
tcp_sack:
tcp_sack - BOOLEAN. Enable selective acknowledgements (SACK). Default: 1.
SACK reduces the number of needlessly retransmitted (duplicate) packets.


/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code. */

struct tcp_skb_cb {
	union {
		struct inet_skb_parm	h4; /* IPv4 control info (shares skb->cb with IPCB) */
	} header;	/* For incoming frames		*/
	__u32		seq;		/* Starting sequence number	*/
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
	__u32		when;		/* used to compute rtt's	*/
	__u8		flags;		/* TCP header flags.		*/
	__u8		sacked;		/* State flags for SACK/FACK.	*/
	__u32		ack_seq;	/* Sequence number ACK'd	*/
};

 

 

To access the IP options from IPCB we read the opt field of struct inet_skb_parm; opt is an embedded struct ip_options (not a pointer).
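
As a rough sketch, the options are reached through the skb control block; the IPCB() macro is the one from include/net/ip.h, while the helper below is made up for illustration:

/* From include/net/ip.h: the IP layer's per-packet state lives in skb->cb. */
#define IPCB(skb) ((struct inet_skb_parm *)((skb)->cb))

/* Hypothetical helper: fetch the parsed IP options of an incoming segment. */
static inline struct ip_options *skb_ip_options(struct sk_buff *skb)
{
	return &IPCB(skb)->opt;		/* opt is embedded, so take its address */
}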

 

int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	...
	if (sk->sk_state == TCP_LISTEN) {
		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
		if (!nsk)
			goto discard;
		...
	}
}

tcp_v4_hnd_req() looks for a matching connection request in the SYN queue of the listening socket.

 


Create a new socket for this connection and return a pointer to the new socket:

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = tcp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct request_sock **prev;
	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
						       iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);
	...
}
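
The elided remainder of tcp_v4_hnd_req() (paraphrased from kernels of this era, not a verbatim excerpt): if nothing is waiting in the SYN queue, it looks the segment up in the established hash, which also holds TIME_WAIT entries.

	/* Paraphrased continuation: look for an already-established child. */
	struct sock *nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo,
						   iph->saddr, th->source,
						   iph->daddr, th->dest,
						   inet_iif(skb));
	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;	/* child socket handles the segment */
		}
		inet_twsk_put(inet_twsk(nsk));
		return NULL;
	}
	return sk;			/* no match: keep processing on the listener */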

struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */
	struct inet_ehash_bucket	*ehash;
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask;
	unsigned int			ehash_locks_mask;
	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash;
	unsigned int			bhash_size;
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 */
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;

	atomic_t			bsockets;
};
struct inet_bind_bucket {
#ifdef CONFIG_NET_NS
 struct net  *ib_net;
#endif
 unsigned short  port;
 signed short  fastreuse;
 int   num_owners;
 struct hlist_node node;
 struct hlist_head owners;
};
------------------------------------
sock.h
 * Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made
 *                   protinfo be just a void pointer, as the
 *                   protocol specific parts were moved to
 *                   respective headers and ipv4/v6, etc now
 *                   use private slabcaches for its socks

/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
      struct request_sock *req,
      struct dst_entry *dst)
Local port reuse:

Sockets bound to different devices can share the same TCP port.

A port can also be shared when the sk->sk_reuse flag is set and none of the sockets bound to it is in TCP_LISTEN state (e.g. FTP data ports).

Sockets bound to different specific local addresses (inet_sk(sk)->rcv_saddr) may likewise share a port.
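
A minimal userspace illustration of the reuse flag (standard sockets API; the helper name is made up):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

/* Set SO_REUSEADDR (which sets sk->sk_reuse in the kernel) before bind(),
 * so the local port can be shared under the rules above. */
static int bind_with_reuse(int fd, unsigned short port)
{
	int one = 1;
	struct sockaddr_in addr;

	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	return bind(fd, (struct sockaddr *)&addr, sizeof(addr));
}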



The congestion avoidance and slow start algorithms require the sender to maintain two variables per connection: a congestion window (cwnd) and a slow start threshold (ssthresh).
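
In struct tcp_sock these live in the following fields (excerpted from kernels of the same era; surrounding members omitted):

struct tcp_sock {
	...
	u32	snd_ssthresh;	/* Slow start size threshold		*/
	u32	snd_cwnd;	/* Sending congestion window		*/
	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
	u32	snd_cwnd_clamp;	/* Do not allow snd_cwnd to grow above this */
	...
};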

 

 

tcp_abc

tcp_abc - INTEGER. Controls Appropriate Byte Count (ABC), defined in RFC 3465.
ABC is a way of increasing the congestion window (cwnd) more slowly
in response to partial acknowledgments.
Possible values are:
	0  increase cwnd once per acknowledgment (no ABC)
	1  increase cwnd once per acknowledgment of a full sized segment
	2  allow cwnd to increase by two if the acknowledgment covers two segments,
	   to compensate for delayed acknowledgments.
Default: 0 (off)

/*
 * Slow start is used when congestion window is less than slow start
 * threshold. This version implements the basic RFC2581 version
 * and optionally supports:
 *  RFC3742 Limited Slow Start     - growth limited to max_ssthresh
 * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */

void tcp_slow_start(struct tcp_sock *tp)
{
	int cnt;	/* how much to increase this time */

 /* RFC3465: ABC Slow start
  * Increase only after a full MSS of bytes is acked
  *
  * TCP sender SHOULD increase cwnd by the number of
  * previously unacknowledged bytes ACKed by each incoming
  * acknowledgment, provided the increase is not more than L
  */
	/* Per RFC 3465: do not grow until a full MSS worth of bytes has been acked */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	/* cwnd has exceeded the configured max_ssthresh */
	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start: grow by max_ssthresh/2 per RTT */
	else
		cnt = tp->snd_cwnd;			/* normal slow start: exponential increase */

	/* RFC3465: ABC — the peer appears to be using delayed ACKs.
	 * We MAY increase by 2 if discovered delayed ack
	 */
	/* If one ACK covers at least two full segments, double the increment */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
		cnt <<= 1;
	tp->bytes_acked = 0;

	/* Add to the accumulated counter; the loop below converts it into cwnd
	 * growth while forcing cwnd to stay below snd_cwnd_clamp */
	tp->snd_cwnd_cnt += cnt;
 while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
  tp->snd_cwnd_cnt -= tp->snd_cwnd;
  if (tp->snd_cwnd < tp->snd_cwnd_clamp)
   tp->snd_cwnd++;
 }
}

 
snd_ssthresh	Slow start threshold: we are in slow start if snd_cwnd is less than this.
snd_cwnd_cnt	A counter used to slow down the rate of increase once we exceed the slow start threshold.
snd_cwnd_clamp	The maximum size that snd_cwnd can grow to.

RFC 3742:

Slow start (cwnd <= max_ssthresh): cwnd grows by one MSS per ACK, as in RFC 2581.

Limited slow start (max_ssthresh < cwnd <= ssthresh): cwnd grows by at most max_ssthresh/2 MSS per RTT.
tcp_max_ssthresh - INTEGER. Enables RFC 3742 limited slow start; 0 (the default) disables it.

SACK:
The first SACK block must cover the most recently received segment; earlier SACK blocks are repeated in subsequent ACKs to improve robustness against lost ACKs.

tcp_fack - BOOLEAN Enable FACK congestion avoidance and fast retransmission.
 The value is not used, if tcp_sack is not enabled.

Receiver side:
When an out-of-order segment is received, TCP must generate a duplicate ACK immediately:
- it should not be delayed;
- it lets the sender know that an out-of-order segment arrived;
- it tells the sender which sequence number is expected.

Sender side ("duplicate ACKs received"):
If the duplicate ACKs are only caused by reordering, just a few of them arrive before the reordered segment is delivered; one or two is normal.

Three or more duplicate ACKs suggest a segment was lost: fast retransmit is started without waiting for the retransmission timer, followed by fast recovery.
Fast recovery avoids falling back into slow start unless an ACK actually times out, because dropping to slow start would abruptly cut the data flow.
Fast Recovery
- counts duplicate ACKs to estimate how much data is still in the network;
- increases cwnd for every duplicate ACK received, so that data keeps flowing;
- allows data to be retransmitted quickly with the window halved.


當收到3個以上重複的ack 將ssthresh 設置爲當前 cwnd 一半
並且 cwnd 爲ssthresh加上3*MSS
每次收到一個重複ack ,cwnd 增加一個MSS並且發送一個分組
當下一個確認新的數據 ack 到了 cwnd= ssthresh 這樣可以達到擁塞避免(因爲擁塞看起來已恢復了)
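
A compact sketch of those steps (illustrative only; the struct and helper names below are made up, this is not kernel code):

struct reno_conn {
	unsigned int cwnd, ssthresh, mss, dup_acks;
};

static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }

static void retransmit_lost_segment(struct reno_conn *c) { (void)c; /* stub */ }
static void try_send_new_segment(struct reno_conn *c)    { (void)c; /* stub */ }

/* Called for every duplicate ACK. */
static void on_dup_ack(struct reno_conn *c)
{
	if (++c->dup_acks == 3) {
		c->ssthresh = umax(c->cwnd / 2, 2 * c->mss);
		retransmit_lost_segment(c);		/* fast retransmit */
		c->cwnd = c->ssthresh + 3 * c->mss;	/* inflate by the three dup ACKs */
	} else if (c->dup_acks > 3) {
		c->cwnd += c->mss;			/* each further dup ACK = one segment left the net */
		try_send_new_segment(c);
	}
}

/* Called when an ACK for new data finally arrives. */
static void on_new_ack(struct reno_conn *c)
{
	if (c->dup_acks >= 3)
		c->cwnd = c->ssthresh;			/* deflate; back to congestion avoidance */
	c->dup_acks = 0;
}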

Congestion avoidance: as long as non-duplicate ACKs are received, the congestion window is additively increased by one MSS every round-trip time. When a packet is lost, the likelihood of duplicate ACKs being received is very high (it is possible, though unlikely, that the stream just underwent extreme packet reordering, which would also prompt duplicate ACKs). Tahoe and Reno differ in how they detect and react to packet loss:

Tahoe: triple duplicate ACKs are treated the same as a timeout. Tahoe performs a fast retransmit, reduces the congestion window to 1 MSS, and resets to the slow-start state.[8]

Reno: if three duplicate ACKs are received (i.e., four ACKs acknowledging the same packet, which are not piggybacked on data and do not change the receiver's advertised window), Reno halves the congestion window, performs a fast retransmit, and enters a phase called Fast Recovery. If an ACK times out, slow start is used, as in Tahoe.[8]


static int tcp_time_to_recover(struct sock *sk)
{
 struct tcp_sock *tp = tcp_sk(sk);
 __u32 packets_out;

 /* Do not perform any recovery during F-RTO algorithm */
 if (tp->frto_counter)
  return 0;

 /* Trick#1: The loss is proven. */
 if (tp->lost_out)
  return 1;

 /* Not-A-Trick#2 : Classic rule... */
 if (tcp_dupack_heuristics(tp) > tp->reordering)
  return 1;

 /* Trick#3 : when we use RFC2988 timer restart, fast
  * retransmit can be triggered by timeout of queue head.
  */
 if (tcp_is_fack(tp) && tcp_head_timedout(sk))
  return 1;

 /* Trick#4: It is still not OK... But will it be useful to delay
  * recovery more?
  */
 packets_out = tp->packets_out;
 if (packets_out <= tp->reordering &&
     tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
     !tcp_may_send_now(sk)) {
  /* We have nothing to send. This connection is limited
   * either by receiver window or by application.
   */
  return 1;
 }

 /* If a thin stream is detected, retransmit after first
  * received dupack. Employ only if SACK is supported in order
  * to avoid possible corner-case series of spurious retransmissions
  * Use only if there are no unsent data.
  */
 if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
     tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
     tcp_is_sack(tp) && !tcp_send_head(sk))
  return 1;

 return 0;
}
u32 lost_out; /* Lost packets   */
u32 fackets_out; /* FACK'd packets   */
u32 sacked_out; /* SACK'd packets   */
left_out is the number of segments that have left the network but have not yet been ACKed.
FACK:

The simplest heuristic: as soon as we decide that some packet is lost, we treat every packet not covered by a SACK block, up to the most forward SACKed sequence, as lost.
That is, lost_out = fackets_out - sacked_out and left_out = fackets_out.
 *  FACK: It is the simplest heuristics. As soon as we decided
 *  that something is lost, we decide that _all_ not SACKed
 *  packets until the most forward SACK are lost. I.e.
 *  lost_out = fackets_out - sacked_out and left_out = fackets_out.
 *  It is absolutely correct estimate, if network does not reorder
 *  packets. And it loses any connection to reality when reordering
 *  takes place. We use FACK by default until reordering
 *  is suspected on the path to this destination.

 

The FACK algorithm is built on congestion-control principles and is designed to work together with the TCP SACK option.

For an isolated segment loss, Reno behaves according to the congestion-control ideal, but under multiple losses it falls short: Reno fails to meet the ideal principles because it lacks a sufficiently accurate estimate of the data outstanding in the network.

The required network state can be obtained by accurately tracking the most forward data held by the receiver, i.e. the highest sequence number that has been correctly received. The goal of the FACK algorithm is to perform precise congestion control during recovery by keeping an accurate estimate of the amount of data outstanding in the network.

To achieve this, FACK tries to preserve TCP's self-clocking and to reduce overall burstiness.

FACK uses the extra information carried by the SACK option to maintain an explicit measure of the total amount of data outstanding in the network.

Reno and Reno+SACK both try to estimate this value by assuming that each duplicate ACK received represents one segment that has left the network.

FACK instead computes it directly by introducing two new state variables, snd.fack and retran_data. The sender must also keep per-segment state about what the receiver holds, so that the SACK information can drive correct retransmissions.

snd.fack is the core variable of FACK's congestion control: it is updated to reflect the most forward data held by the receiver.

Outside of recovery, snd.fack is updated from the acknowledgment number in the TCP header, i.e. snd.fack = snd.una.

In recovery mode the sender keeps updating snd.una from the acknowledged sequence numbers, while snd.fack is advanced from the forward-most sequence reported in the SACK blocks.
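
The estimate itself, as given in the Mathis/Mahdavi FACK paper (the struct below is purely illustrative; the variable names follow the paper, not the Linux sources):

struct fack_state {
	unsigned int snd_nxt;		/* next new sequence number to send */
	unsigned int snd_fack;		/* forward-most data known to be held by the receiver */
	unsigned int retran_data;	/* retransmitted data still unacknowledged */
	unsigned int cwnd;
};

/* awnd is the sender's estimate of the data actually outstanding in the
 * network; FACK (re)transmits only while this stays below cwnd. */
static int fack_may_send(const struct fack_state *s)
{
	unsigned int awnd = s->snd_nxt - s->snd_fack + s->retran_data;

	return awnd < s->cwnd;
}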

 

Congestion avoidance

Congestion avoidance and slow start are independent algorithms with different goals, but when congestion occurs we want to reduce the rate at which packets enter the network, and that is achieved by combining congestion avoidance with slow start.

When congestion is detected (an RTO, or three duplicate ACKs): ssthresh is set to half of the current window size, min(cwnd, advertised window), but to no less than 2 MSS. In addition, if the congestion was signalled by a timeout, cwnd is reset to 1 MSS.

cwnd <= ssthresh: slow start.
cwnd > ssthresh: congestion avoidance.
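
In the kernel (same era as the tcp_slow_start() above) the additive increase looks roughly as follows; snd_cwnd_cnt accumulates ACKs until a full window's worth has been acked, then cwnd grows by one:

void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);			/* "safe" area: grow fast */
	else
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);	/* additive increase */
}

/* cwnd += 1 once snd_cwnd_cnt reaches w, i.e. roughly once per RTT */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
	if (tp->snd_cwnd_cnt >= w) {
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
		tp->snd_cwnd_cnt = 0;
	} else
		tp->snd_cwnd_cnt++;
}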

 

 

 

 

tcp_retries1 (sysctl_tcp_retries1)

The number of retransmissions after which we check whether an intermediate router has gone bad.

Once this count is exceeded, negative_advice (dst_negative_advice) is called.

For IPv4 this is ipv4_negative_advice(), which drops the cached route: sk->sk_dst_cache is set to NULL.

 

tcp_retries2 (sysctl_tcp_retries2)

The maximum number of retransmissions before the connection is given up on. The default is 15, which with exponential backoff corresponds to more than 900 seconds.
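
A paraphrased sketch of how the two thresholds are used in the retransmit timer path (based on tcp_write_timeout() from kernels of this era; simplified, not a verbatim excerpt):

static int write_timeout_sketch(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	/* After tcp_retries1 unanswered retransmissions, suspect the route:
	 * dst_negative_advice() drops the cached dst so it is looked up again. */
	if (icsk->icsk_retransmits >= sysctl_tcp_retries1)
		dst_negative_advice(sk);

	/* After tcp_retries2 (default 15, >900s with exponential backoff),
	 * give up and report the error to the application. */
	if (icsk->icsk_retransmits >= sysctl_tcp_retries2) {
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}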

 

 

 

sysctl_tcp_syn_retries

The maximum number of times a SYN segment is retransmitted before the connection attempt is given up.

 

 

 

sysctl_tcp_orphan_retries

An orphaned socket is killed even before this retry limit is reached in two cases:

1: the number of orphaned sockets exceeds sysctl_tcp_max_orphans;
2: tcp_memory_allocated > sysctl_tcp_mem (global TCP memory pressure).
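
A paraphrased version of those two checks (cf. tcp_out_of_resources()/tcp_too_many_orphans(); simplified, not a verbatim excerpt):

static int orphan_limits_hit(struct sock *sk)
{
	int orphans = percpu_counter_read_positive(&tcp_orphan_count);

	/* Case 1: too many orphaned sockets system-wide. */
	if (orphans > sysctl_tcp_max_orphans)
		return 1;

	/* Case 2: global TCP memory pressure. */
	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	    atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
		return 1;

	return 0;
}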

SYN-ACK  timer

 

The SYN-ACK timer is started by tcp_synq_added() when a connection request arrives and there is no pending connection request in the listening socket's SYN queue to be processed.

A new connection moves from the SYN queue to the accept queue once the three-way handshake completes.
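
A paraphrased sketch of the idea behind tcp_synq_added() (the struct and helper names are simplified stand-ins, not verbatim kernel symbols): the timer only needs to be armed when the queue goes from empty to non-empty.

struct synq { unsigned int qlen; };				/* illustrative stand-in */

static void reset_synack_timer(struct sock *sk, unsigned long timeout);	/* hypothetical */

static inline void synq_added_sketch(struct sock *sk, struct synq *q)
{
	if (q->qlen++ == 0)					/* queue was empty */
		reset_synack_timer(sk, TCP_TIMEOUT_INIT);
}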

 

tcp_synack_retries

Number of times SYNACKs for a passive TCP connection attempt will be retransmitted. Should not be higher than 255. Default value is 5, which corresponds to ~180 seconds.

 

 

tcp_vegas_cong_avoid

Enable the TCP Vegas congestion avoidance algorithm.

The sender anticipates congestion by measuring the achieved bandwidth and adjusts cwnd to change the sending rate accordingly.

TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno.

 

 

TCP_CORK

 

 

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;
 
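
Seen from userspace, the interplay looks like this (standard sockets API; the helper assembling a two-part response is hypothetical):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void send_corked_response(int fd, const void *hdr, size_t hlen,
				 const void *body, size_t blen)
{
	int on = 1, off = 0;

	/* Cork: queue the pieces without sending partial segments. */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	send(fd, hdr, hlen, 0);
	send(fd, body, blen, 0);

	/* Uncork: push whatever is queued (TCP_NODELAY would also force a push). */
	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}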

 

wmem_max rmem_max 

 

 

The maximum sizes for socket buffers declared via the SO_SNDBUF and SO_RCVBUF mechanisms are limited by the values in the /proc/sys/net/core/rmem_max and /proc/sys/net/core/wmem_max files. Note that TCP actually allocates twice the size of the buffer requested in the setsockopt(2) call, and so a succeeding getsockopt(2) call will not return the same size of buffer as requested in the setsockopt(2) call.

 

 

The relevant code:

 

case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
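
A small userspace illustration of that doubling (hypothetical helper; fd is assumed to be an already-created TCP socket):

#include <stdio.h>
#include <sys/socket.h>

static void show_rcvbuf_doubling(int fd, int requested)
{
	int granted;
	socklen_t len = sizeof(granted);

	/* The request is clamped to rmem_max, then doubled by the kernel. */
	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &requested, sizeof(requested));
	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &granted, &len);

	printf("asked for %d bytes, sk_rcvbuf is now %d\n", requested, granted);
}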

 

 tcp_low_latency

 

By default the stack favours higher throughput; if this is set to 1 it favours lower latency and incoming packets are processed immediately (the prequeue is bypassed).

 

 

int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
/* Normally a user-space receive buffer would be installed here so segments can be copied directly via the prequeue; with tcp_low_latency set, this prequeue processing is skipped. */
		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;

The official documentation suggests enabling this for Beowulf compute clusters; I wonder whether it should also be set in MPI parallel-computing environments.
