9.3.1 Why
TCP在發送SYN、FIN以及數據包時爲了保證可靠傳輸,會先將它們放入發送隊列再發送副本到網絡中,一旦發現數據丟失(比如連續收到多個ack_seq號相同的ACK)則重傳發送隊列中的skb。但如果丟失發現機制失效了(比如ACK丟失),就需要由重傳定時器在超時後重傳數據,否則數據傳輸就可能會阻塞。
9.3.2 When
設置重傳定時器的時機有:
(1)調用tcp_check_sack_reneging處理虛假SACK事件時:
1906 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
1907 {
1908 if (flag & FLAG_SACK_RENEGING) {
1909 struct inet_connection_sock *icsk = inet_csk(sk);
1910 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1911
1912 tcp_enter_loss(sk, 1);
1913 icsk->icsk_retransmits++;
1914 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
1915 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1916 icsk->icsk_rto, TCP_RTO_MAX);
1917 return true;
1918 }
1919 return false;
1920 }
虛假SACK是指:最新收到的ACK的ack_seq指向已記錄的SACK塊,這說明記錄的SACK並沒有反映接收方的真實的狀態,也就是說接收方現在已經處於嚴重擁塞的狀態或者在處理上有bug,從而刪除了亂序隊列中的數據(這些數據之前是在SACK選項中發送過來的)。因爲按照正常的邏輯流程,接收的ACK不應該指向已記錄的SACK,而應該指向SACK後面未接收的地方(因爲被SACK的報文是已經放入接收方的亂序隊列中,如果收到缺失的段正常情況下會與亂序報文一起交付接收隊列,從而使ack_seq指向被SACK的報文的後面)。所以接下來就按照超時重傳的方式去處理。
(2)調用tcp_rearm_rto更新RTO時:
2926 void tcp_rearm_rto(struct sock *sk)
2927 {
2928 const struct inet_connection_sock *icsk = inet_csk(sk);
2929 struct tcp_sock *tp = tcp_sk(sk);
2930
2931 /* If the retrans timer is currently being used by Fast Open
2932 * for SYN-ACK retrans purpose, stay put.
2933 */
2934 if (tp->fastopen_rsk)
2935 return;
2936
2937 if (!tp->packets_out) { //網絡中沒有已發送的數據
2938 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
2939 } else {
2940 u32 rto = inet_csk(sk)->icsk_rto;
2941 /* Offset the time elapsed after installing regular RTO */
2942 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2943 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2944 struct sk_buff *skb = tcp_write_queue_head(sk);
2945 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
2946 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
2947 /* delta may not be positive if the socket is locked
2948 * when the retrans timer fires and is rescheduled.
2949 */
2950 if (delta > 0)
2951 rto = delta;
2952 }
2953 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
2954 TCP_RTO_MAX);
2955 }
2956 }
而調用tcp_rearm_rto並安裝重傳定時器的常見條件有:
1)收到ACK並確認掉數據、且仍然有未確認的數據時:
3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3002 u32 prior_snd_una)
3003 {
...
3095 if (flag & FLAG_ACKED) {
...
3105 tcp_rearm_rto(sk);
2)收到合法ACK並安裝了ER定時器或丟失探測定時器時:
3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3326 {
...
3358 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3359 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3360 tcp_rearm_rto(sk);
...
3)發送了新數據並調用tcp_event_new_data_sent函數時:
72 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
73 {
74 struct inet_connection_sock *icsk = inet_csk(sk);
75 struct tcp_sock *tp = tcp_sk(sk);
76 unsigned int prior_packets = tp->packets_out;
77
78 tcp_advance_send_head(sk, skb);
79 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
80
81 tp->packets_out += tcp_skb_pcount(skb);
82 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
83 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
84 tcp_rearm_rto(sk);
85 }
86 }
(3)發送SYN後:
2925 int tcp_connect(struct sock *sk)
2926 {
...
2947 tcp_connect_queue_skb(sk, buff);
...
2963 /* Timer for repeating the SYN until an answer. */
2964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2965 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
(4)開啓TFO功能的情況下,收到SYN並發送SYN|ACK後,需要設置重傳定時器以便重傳SYN|ACK:
1369 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1370 struct sk_buff *skb,
1371 struct sk_buff *skb_synack,
1372 struct request_sock *req)
1373 {
...
1421 /* Activate the retrans timer so that SYNACK can be retransmitted.
1422 * The request socket is not added to the SYN table of the parent
1423 * because it's been added to the accept queue directly.
1424 */
1425 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1426 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
...
(5)開啓TFO功能的情況下,收到ICMP目的不可達報文時:
326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
327 {
...
399 case ICMP_DEST_UNREACH:
...
440 skb = tcp_write_queue_head(sk);
441 BUG_ON(!skb);
442
443 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444 tcp_time_stamp - TCP_SKB_CB(skb)->when);
445
446 if (remaining) { //從skb發送出去到現在經歷的時間比RTO短
447 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 remaining, TCP_RTO_MAX);
(6)定時器超時的時候可能會設置重傳定時器;這種情況暫不分析
清除重傳定時器的時機爲:
(1)調用tcp_rearm_rto且所有發送數據都已經被收到時;
調用tcp_rearm_rto並清除重傳定時器的常見情況有:
1)開啓TFO的情況下,發送SYN|ACK後收到ACK時:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5682 case TCP_SYN_RECV:
5683 if (acceptable) {
...
5735 tcp_rearm_rto(sk);
...
2)開啓TFO的情況下,在TCP_FIN_WAIT1狀態下收到ACK但TFO socket仍然存在時:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5751 case TCP_FIN_WAIT1:
...
5757 if (req != NULL) {
...
5767 reqsk_fastopen_remove(sk, req, false);
5768 tcp_rearm_rto(sk);
3)收到ACK並確認掉全部數據時
4)收到合法ACK並安裝了ER定時器或丟失探測定時器時
(2)安裝丟失探測定時器、ER定時器、堅持定時器時;由於這3個定時器與重傳定時器使用同一個數據結構,安裝一個就等於拆除了其它類型的定時器。
重傳定時器的超時時間是RTO(Retransmission TimeOut),即從發出數據包到第一次重傳之間的等待時間;它是根據RTT的測量值動態計算出來的,因此會不斷變化。
9.3.3 What
重傳定時器所使用的icsk->icsk_retransmit_timer安裝的超時函數是tcp_write_timer:
478 void tcp_write_timer_handler(struct sock *sk)
479 {
480 struct inet_connection_sock *icsk = inet_csk(sk);
481 int event;
482
483 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) //TCP狀態是CLOSE或未安裝定時器
484 goto out;
485
486 if (time_after(icsk->icsk_timeout, jiffies)) { //尚未超時
487 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
488 goto out;
489 }
490
491 event = icsk->icsk_pending;
492
493 switch (event) {
494 case ICSK_TIME_EARLY_RETRANS: //ER
495 tcp_resume_early_retransmit(sk);
496 break;
497 case ICSK_TIME_LOSS_PROBE://正常重傳&探測報文重傳
498 tcp_send_loss_probe(sk);
499 break;
500 case ICSK_TIME_RETRANS://正常重傳
501 icsk->icsk_pending = 0;
502 tcp_retransmit_timer(sk);
503 break;
504 case ICSK_TIME_PROBE0: //堅持定時器超時
505 icsk->icsk_pending = 0;
506 tcp_probe_timer(sk);
507 break;
508 }
509
510 out:
511 sk_mem_reclaim(sk); //回收緩存
512 }
513
514 static void tcp_write_timer(unsigned long data)
515 {
516 struct sock *sk = (struct sock *)data;
517
518 bh_lock_sock(sk);
519 if (!sock_owned_by_user(sk)) {
520 tcp_write_timer_handler(sk);
521 } else {
522 /* deleguate our work to tcp_release_cb() */
523 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
524 sock_hold(sk);
525 }
526 bh_unlock_sock(sk);
527 sock_put(sk);
528 }
523:如果icsk->icsk_retransmit_timer超時時socket被應用進程鎖定,則設置TCP_WRITE_TIMER_DEFERRED標記,這樣在應用進程釋放socket時會調用tcp_release_cb函數:
741 void tcp_release_cb(struct sock *sk)
742 {
...
757 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
758 tcp_write_timer_handler(sk);
759 __sock_put(sk);
760 }
...
這樣看來重傳定時器真正的超時函數是tcp_retransmit_timer:
340 void tcp_retransmit_timer(struct sock *sk)
341 {
342 struct tcp_sock *tp = tcp_sk(sk);
343 struct inet_connection_sock *icsk = inet_csk(sk);
344
345 if (tp->fastopen_rsk) { //開啓了TFO功能
346 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
347 sk->sk_state != TCP_FIN_WAIT1);
348 tcp_fastopen_synack_timer(sk); //重傳SYN|ACK
349 /* Before we receive ACK to our SYN-ACK don't retransmit
350 * anything else (e.g., data or FIN segments).
351 */
352 return;
353 }
354 if (!tp->packets_out) //包已經被全部確認
355 goto out;
356
357 WARN_ON(tcp_write_queue_empty(sk));
358
359 tp->tlp_high_seq = 0;
360
361 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && //發送窗口關閉且socket並非orphan socket
362 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { //並非連接建立狀態
363 /* Receiver dastardly shrinks window. Our retransmits
364 * become zero probes, but we should not timeout this
365 * connection. If the socket is an orphan, time it out,
366 * we cannot allow such beasts to hang infinitely.
367 */
368 struct inet_sock *inet = inet_sk(sk);
369 if (sk->sk_family == AF_INET) {
370 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
371 &inet->inet_daddr,
372 ntohs(inet->inet_dport), inet->inet_num,
373 tp->snd_una, tp->snd_nxt);
374 }
375 #if IS_ENABLED(CONFIG_IPV6)
376 else if (sk->sk_family == AF_INET6) {
377 struct ipv6_pinfo *np = inet6_sk(sk);
378 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
379 &np->daddr,
380 ntohs(inet->inet_dport), inet->inet_num,
381 tp->snd_una, tp->snd_nxt);
382 }
383 #endif
384 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { //超過TCP_RTO_MAX的時間沒有收到對端的確認
385 tcp_write_err(sk); //報告錯誤並關閉連接
386 goto out;
387 }
388 tcp_enter_loss(sk, 0); //進入擁塞控制的LOSS狀態
389 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); //重傳發送隊列中的首包
390 __sk_dst_reset(sk);
391 goto out_reset_timer;
392 }
393 //發送窗口非0
394 if (tcp_write_timeout(sk)) //重傳等待時間過長或orphan socket消耗資源過多
395 goto out;
396
397 if (icsk->icsk_retransmits == 0) { //第一次重傳
... //更新MIB數據庫信息,用於網絡管理
417 }
418
419 tcp_enter_loss(sk, 0); //進入擁塞控制的LOSS狀態
420
421 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {//重傳發送隊列中的首包失敗
422 /* Retransmission failed because of local congestion,
423 * do not backoff.
424 */
425 if (!icsk->icsk_retransmits)
426 icsk->icsk_retransmits = 1;
427 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
428 min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
429 TCP_RTO_MAX);//重設重傳定時器
430 goto out;
431 }
...
448 icsk->icsk_backoff++;
449 icsk->icsk_retransmits++;
450
451 out_reset_timer:
452 /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
453 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
454 * might be increased if the stream oscillates between thin and thick,
455 * thus the old value might already be too high compared to the value
456 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
457 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
458 * exponential backoff behaviour to avoid continue hammering
459 * linear-timeout retransmissions into a black hole
460 */
461 if (sk->sk_state == TCP_ESTABLISHED &&
462 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
463 tcp_stream_is_thin(tp) &&
464 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
465 icsk->icsk_backoff = 0;
466 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
467 } else {
468 /* Use normal (exponential) backoff */
469 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); //增加超時時間
470 }
471 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); //重設重傳定時器
472 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
473 __sk_dst_reset(sk);
474
475 out:;
476 }
tcp_write_timeout函數判斷是否應該超時:
156 static int tcp_write_timeout(struct sock *sk)
157 {
158 struct inet_connection_sock *icsk = inet_csk(sk);
159 int retry_until;
160 bool do_reset, syn_set = false;
161
162 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
163 if (icsk->icsk_retransmits)
164 dst_negative_advice(sk);
165 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
166 syn_set = true;
167 } else {
168 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { //這裏超時可能是因爲MTU過大
169 /* Black hole detection */
170 tcp_mtu_probing(icsk, sk); //執行路徑MTU探測
171
172 dst_negative_advice(sk);
173 }
174
175 retry_until = sysctl_tcp_retries2;
176 if (sock_flag(sk, SOCK_DEAD)) { //當前socket是orphan socket
177 const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
178
179 retry_until = tcp_orphan_retries(sk, alive);
180 do_reset = alive ||
181 !retransmits_timed_out(sk, retry_until, 0, 0);
182
183 if (tcp_out_of_resources(sk, do_reset))//當前orphan socket數量太多
184 return 1;
185 }
186 }
187
188 if (retransmits_timed_out(sk, retry_until,
189 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { //如果超時
190 /* Has it gone just too far? */
191 tcp_write_err(sk); //關閉連接
192 return 1;
193 }
194 return 0;
195 }
retransmits_timed_out函數判斷是否超時:
127 static bool retransmits_timed_out(struct sock *sk,
128 unsigned int boundary,
129 unsigned int timeout,
130 bool syn_set)
131 {
132 unsigned int linear_backoff_thresh, start_ts;
133 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
134
135 if (!inet_csk(sk)->icsk_retransmits) //沒有重傳
136 return false;
137
138 if (unlikely(!tcp_sk(sk)->retrans_stamp))
139 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; //使用skb發送時間作爲起始時間
140 else
141 start_ts = tcp_sk(sk)->retrans_stamp; //使用重傳時間作爲起始時間
142
143 if (likely(timeout == 0)) {
144 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
145
146 if (boundary <= linear_backoff_thresh)
147 timeout = ((2 << boundary) - 1) * rto_base;
148 else
149 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
150 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
151 }
152 return (tcp_time_stamp - start_ts) >= timeout;
153 }
判斷應該超時時使用tcp_write_err函數關閉本端連接並向應用進程報告錯誤:
35 static void tcp_write_err(struct sock *sk)
36 {
37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
38 sk->sk_error_report(sk);
39
40 tcp_done(sk);
41 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
42 }
tcp_retransmit_skb函數用於重傳skb:
2374 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2375 {
2376 struct tcp_sock *tp = tcp_sk(sk);
2377 int err = __tcp_retransmit_skb(sk, skb);
2378
2379 if (err == 0) {
2380 /* Update global TCP statistics. */
2381 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2382
2383 tp->total_retrans++;
2384
2385 #if FASTRETRANS_DEBUG > 0
2386 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2387 net_dbg_ratelimited("retrans_out leaked\n");
2388 }
2389 #endif
2390 if (!tp->retrans_out)
2391 tp->lost_retrans_low = tp->snd_nxt;
2392 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; //記錄skb被重傳過
2393 tp->retrans_out += tcp_skb_pcount(skb);
2394
2395 /* Save stamp of the first retransmit. */
2396 if (!tp->retrans_stamp)
2397 tp->retrans_stamp = TCP_SKB_CB(skb)->when; //記錄重傳時間
2398
2399 tp->undo_retrans += tcp_skb_pcount(skb);
2400
2401 /* snd_nxt is stored to detect loss of retransmitted segment,
2402 * see tcp_input.c tcp_sacktag_write_queue().
2403 */
2404 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2405 }
2406 return err;
2407 }
綜上,TCP重傳定時器的基本功能是:如果有TFO socket則直接重傳SYN|ACK,然後返回;如果沒有,檢查重傳是否經過了太長的時間,若是則關閉連接並報告錯誤;否則重傳發送隊列中的首包,並將重傳定時器設置爲更長的超時時間。