9.3.1 Why
TCP在发送SYN、FIN以及数据包时,为了保证可靠传输,会先将它们放入发送队列,再发送副本到网络中;一旦发现数据丢失(比如连续收到多个ack_seq号相同的ACK),则重传发送队列中的skb。但如果丢失发现机制失效了呢(比如ACK丢失)?这时就需要重传定时器在超过指定的时间仍未收到确认时重传数据,否则数据传输就可能会阻塞。
9.3.2 When
设置重传定时器的时机有:
(1)调用tcp_check_sack_reneging处理虚假SACK事件时:
1906 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
1907 {
1908 if (flag & FLAG_SACK_RENEGING) {
1909 struct inet_connection_sock *icsk = inet_csk(sk);
1910 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1911
1912 tcp_enter_loss(sk, 1);
1913 icsk->icsk_retransmits++;
1914 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
1915 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1916 icsk->icsk_rto, TCP_RTO_MAX);
1917 return true;
1918 }
1919 return false;
1920 }
虚假SACK是指:最新收到的ACK的ack_seq指向已记录的SACK块,这说明记录的SACK并没有反映接收方的真实状态,也就是说接收方现在已经处于严重拥塞的状态或者在处理上有bug,从而删除了乱序队列中的数据(这些数据之前是在SACK选项中发送过来的)。因为按照正常的逻辑流程,接收的ACK不应该指向已记录的SACK,而应该指向SACK后面未接收的地方(因为被SACK的报文是已经放入接收方的乱序队列中,如果收到缺失的段正常情况下会与乱序报文一起交付接收队列,从而使ack_seq指向被SACK的报文的后面)。所以接下来就按照超时重传的方式去处理。
(2)调用tcp_rearm_rto更新RTO时:
2926 void tcp_rearm_rto(struct sock *sk)
2927 {
2928 const struct inet_connection_sock *icsk = inet_csk(sk);
2929 struct tcp_sock *tp = tcp_sk(sk);
2930
2931 /* If the retrans timer is currently being used by Fast Open
2932 * for SYN-ACK retrans purpose, stay put.
2933 */
2934 if (tp->fastopen_rsk)
2935 return;
2936
2937 if (!tp->packets_out) { //网络中没有已发送的数据
2938 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
2939 } else {
2940 u32 rto = inet_csk(sk)->icsk_rto;
2941 /* Offset the time elapsed after installing regular RTO */
2942 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2943 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2944 struct sk_buff *skb = tcp_write_queue_head(sk);
2945 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
2946 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
2947 /* delta may not be positive if the socket is locked
2948 * when the retrans timer fires and is rescheduled.
2949 */
2950 if (delta > 0)
2951 rto = delta;
2952 }
2953 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
2954 TCP_RTO_MAX);
2955 }
2956 }
而调用tcp_rearm_rto并安装重传定时器的常见条件有:
1)收到ACK确认掉部分数据,且仍然有未确认的数据时:
3001 static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3002 u32 prior_snd_una)
3003 {
...
3095 if (flag & FLAG_ACKED) {
...
3105 tcp_rearm_rto(sk);
2)收到合法ACK并安装了ER定时器或丢失探测定时器时:
3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3326 {
...
3358 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3359 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3360 tcp_rearm_rto(sk);
...
3)发送了新数据并调用tcp_event_new_data_sent函数时:
72 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
73 {
74 struct inet_connection_sock *icsk = inet_csk(sk);
75 struct tcp_sock *tp = tcp_sk(sk);
76 unsigned int prior_packets = tp->packets_out;
77
78 tcp_advance_send_head(sk, skb);
79 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
80
81 tp->packets_out += tcp_skb_pcount(skb);
82 if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
83 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
84 tcp_rearm_rto(sk);
85 }
86 }
(3)发送SYN后:
2925 int tcp_connect(struct sock *sk)
2926 {
...
2947 tcp_connect_queue_skb(sk, buff);
...
2963 /* Timer for repeating the SYN until an answer. */
2964 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
2965 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
(4)开启TFO功能的情况下,收到SYN并发送SYN|ACK后,需要设置重传定时器以便重传SYN|ACK:
1369 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1370 struct sk_buff *skb,
1371 struct sk_buff *skb_synack,
1372 struct request_sock *req)
1373 {
...
1421 /* Activate the retrans timer so that SYNACK can be retransmitted.
1422 * The request socket is not added to the SYN table of the parent
1423 * because it's been added to the accept queue directly.
1424 */
1425 inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1426 TCP_TIMEOUT_INIT, TCP_RTO_MAX);
...
(5)开启TFO功能的情况下,收到ICMP目的不可达报文时:
326 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
327 {
...
399 case ICMP_DEST_UNREACH:
...
440 skb = tcp_write_queue_head(sk);
441 BUG_ON(!skb);
442
443 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
444 tcp_time_stamp - TCP_SKB_CB(skb)->when);
445
446 if (remaining) { //从skb发送出去到现在经历的时间比RTO短
447 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448 remaining, TCP_RTO_MAX);
(6)定时器超时的时候可能会设置重传定时器;这种情况暂不分析
清除重传定时器的时机为:
(1)调用tcp_rearm_rto且所有已发送的数据都已经被确认时;
调用tcp_rearm_rto并清除重传定时器的常见情况有:
1)开启TFO的情况下,发送SYN|ACK后收到ACK时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5682 case TCP_SYN_RECV:
5683 if (acceptable) {
...
5735 tcp_rearm_rto(sk);
...
2)开启TFO的情况下,在TCP_FIN_WAIT1状态下收到ACK但TFO socket仍然存在时:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5751 case TCP_FIN_WAIT1:
...
5757 if (req != NULL) {
...
5767 reqsk_fastopen_remove(sk, req, false);
5768 tcp_rearm_rto(sk);
3)收到ACK并确认掉全部数据时
4)收到合法ACK并安装了ER定时器或丢失探测定时器时
(2)安装丢失探测定时器、ER定时器、坚持定时器时;由于这3个定时器与重传定时器使用同一个数据结构,安装一个就等于拆除了其它类型的定时器。
重传定时器的超时时间是RTO(Retransmission TimeOut),即从发出数据包到第一次重传之间的时间;它是根据RTT的测量值动态计算的,并且超时重传后通常还会退避增大,因此是不断变化的。
9.3.3 What
重传定时器所使用的icsk->icsk_retransmit_timer安装的超时函数是tcp_write_timer:
478 void tcp_write_timer_handler(struct sock *sk)
479 {
480 struct inet_connection_sock *icsk = inet_csk(sk);
481 int event;
482
483 if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) //TCP状态是CLOSE或未安装定时器
484 goto out;
485
486 if (time_after(icsk->icsk_timeout, jiffies)) { //尚未超时
487 sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
488 goto out;
489 }
490
491 event = icsk->icsk_pending;
492
493 switch (event) {
494 case ICSK_TIME_EARLY_RETRANS: //ER
495 tcp_resume_early_retransmit(sk);
496 break;
497 case ICSK_TIME_LOSS_PROBE://丢失探测(TLP)定时器超时,发送探测报文
498 tcp_send_loss_probe(sk);
499 break;
500 case ICSK_TIME_RETRANS://正常重传
501 icsk->icsk_pending = 0;
502 tcp_retransmit_timer(sk);
503 break;
504 case ICSK_TIME_PROBE0: //坚持定时器超时
505 icsk->icsk_pending = 0;
506 tcp_probe_timer(sk);
507 break;
508 }
509
510 out:
511 sk_mem_reclaim(sk); //回收缓存
512 }
513
514 static void tcp_write_timer(unsigned long data)
515 {
516 struct sock *sk = (struct sock *)data;
517
518 bh_lock_sock(sk);
519 if (!sock_owned_by_user(sk)) {
520 tcp_write_timer_handler(sk);
521 } else {
522 /* deleguate our work to tcp_release_cb() */
523 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
524 sock_hold(sk);
525 }
526 bh_unlock_sock(sk);
527 sock_put(sk);
528 }
523:如果icsk->icsk_retransmit_timer超时时socket被应用进程锁定,则设置TCP_WRITE_TIMER_DEFERRED标记,这样在应用进程释放socket时会调用tcp_release_cb函数:
741 void tcp_release_cb(struct sock *sk)
742 {
...
757 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
758 tcp_write_timer_handler(sk);
759 __sock_put(sk);
760 }
...
这样看来重传定时器真正的超时函数是tcp_retransmit_timer:
340 void tcp_retransmit_timer(struct sock *sk)
341 {
342 struct tcp_sock *tp = tcp_sk(sk);
343 struct inet_connection_sock *icsk = inet_csk(sk);
344
345 if (tp->fastopen_rsk) { //开启了TFO功能
346 WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
347 sk->sk_state != TCP_FIN_WAIT1);
348 tcp_fastopen_synack_timer(sk); //重传SYN|ACK
349 /* Before we receive ACK to our SYN-ACK don't retransmit
350 * anything else (e.g., data or FIN segments).
351 */
352 return;
353 }
354 if (!tp->packets_out) //包已经被全部确认
355 goto out;
356
357 WARN_ON(tcp_write_queue_empty(sk));
358
359 tp->tlp_high_seq = 0;
360
361 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && //发送窗口关闭且socket并非orphan socket
362 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { //并非连接建立状态
363 /* Receiver dastardly shrinks window. Our retransmits
364 * become zero probes, but we should not timeout this
365 * connection. If the socket is an orphan, time it out,
366 * we cannot allow such beasts to hang infinitely.
367 */
368 struct inet_sock *inet = inet_sk(sk);
369 if (sk->sk_family == AF_INET) {
370 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
371 &inet->inet_daddr,
372 ntohs(inet->inet_dport), inet->inet_num,
373 tp->snd_una, tp->snd_nxt);
374 }
375 #if IS_ENABLED(CONFIG_IPV6)
376 else if (sk->sk_family == AF_INET6) {
377 struct ipv6_pinfo *np = inet6_sk(sk);
378 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n"),
379 &np->daddr,
380 ntohs(inet->inet_dport), inet->inet_num,
381 tp->snd_una, tp->snd_nxt);
382 }
383 #endif
384 if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { //超过TCP_RTO_MAX的时间没有收到对端的确认
385 tcp_write_err(sk); //报告错误并关闭连接
386 goto out;
387 }
388 tcp_enter_loss(sk, 0); //进入拥塞控制的LOSS状态
389 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); //重传发送队列中的首包
390 __sk_dst_reset(sk);
391 goto out_reset_timer;
392 }
393 //发送窗口非0
394 if (tcp_write_timeout(sk)) //重传等待时间过长或orphan socket消耗资源过多
395 goto out;
396
397 if (icsk->icsk_retransmits == 0) { //第一次重传
... //更新MIB数据库信息,用于网络管理
417 }
418
419 tcp_enter_loss(sk, 0); //进入拥塞控制的LOSS状态
420
421 if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {//重传发送队列中的首包失败
422 /* Retransmission failed because of local congestion,
423 * do not backoff.
424 */
425 if (!icsk->icsk_retransmits)
426 icsk->icsk_retransmits = 1;
427 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
428 min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
429 TCP_RTO_MAX);//重设重传定时器
430 goto out;
431 }
...
448 icsk->icsk_backoff++;
449 icsk->icsk_retransmits++;
450
451 out_reset_timer:
452 /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
453 * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
454 * might be increased if the stream oscillates between thin and thick,
455 * thus the old value might already be too high compared to the value
456 * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
457 * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
458 * exponential backoff behaviour to avoid continue hammering
459 * linear-timeout retransmissions into a black hole
460 */
461 if (sk->sk_state == TCP_ESTABLISHED &&
462 (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
463 tcp_stream_is_thin(tp) &&
464 icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
465 icsk->icsk_backoff = 0;
466 icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
467 } else {
468 /* Use normal (exponential) backoff */
469 icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); //增加超时时间
470 }
471 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); //重设重传定时器
472 if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
473 __sk_dst_reset(sk);
474
475 out:;
476 }
tcp_write_timeout函数判断是否应该超时:
156 static int tcp_write_timeout(struct sock *sk)
157 {
158 struct inet_connection_sock *icsk = inet_csk(sk);
159 int retry_until;
160 bool do_reset, syn_set = false;
161
162 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
163 if (icsk->icsk_retransmits)
164 dst_negative_advice(sk);
165 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
166 syn_set = true;
167 } else {
168 if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { //这里超时可能是因为MTU过大
169 /* Black hole detection */
170 tcp_mtu_probing(icsk, sk); //执行路径MTU探测
171
172 dst_negative_advice(sk);
173 }
174
175 retry_until = sysctl_tcp_retries2;
176 if (sock_flag(sk, SOCK_DEAD)) { //当前socket是orphan socket
177 const int alive = (icsk->icsk_rto < TCP_RTO_MAX);
178
179 retry_until = tcp_orphan_retries(sk, alive);
180 do_reset = alive ||
181 !retransmits_timed_out(sk, retry_until, 0, 0);
182
183 if (tcp_out_of_resources(sk, do_reset))//当前orphan socket数量太多
184 return 1;
185 }
186 }
187
188 if (retransmits_timed_out(sk, retry_until,
189 syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { //如果超时
190 /* Has it gone just too far? */
191 tcp_write_err(sk); //关闭连接
192 return 1;
193 }
194 return 0;
195 }
retransmits_timed_out函数判断是否超时:
127 static bool retransmits_timed_out(struct sock *sk,
128 unsigned int boundary,
129 unsigned int timeout,
130 bool syn_set)
131 {
132 unsigned int linear_backoff_thresh, start_ts;
133 unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
134
135 if (!inet_csk(sk)->icsk_retransmits) //没有重传
136 return false;
137
138 if (unlikely(!tcp_sk(sk)->retrans_stamp))
139 start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; //使用skb发送时间作为起始时间
140 else
141 start_ts = tcp_sk(sk)->retrans_stamp; //使用重传时间作为起始时间
142
143 if (likely(timeout == 0)) {
144 linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
145
146 if (boundary <= linear_backoff_thresh)
147 timeout = ((2 << boundary) - 1) * rto_base;
148 else
149 timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
150 (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
151 }
152 return (tcp_time_stamp - start_ts) >= timeout;
153 }
判断应该超时时使用tcp_write_err函数关闭本端连接并向应用进程报告错误:
35 static void tcp_write_err(struct sock *sk)
36 {
37 sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
38 sk->sk_error_report(sk);
39
40 tcp_done(sk);
41 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
42 }
tcp_retransmit_skb函数用于重传skb:
2374 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
2375 {
2376 struct tcp_sock *tp = tcp_sk(sk);
2377 int err = __tcp_retransmit_skb(sk, skb);
2378
2379 if (err == 0) {
2380 /* Update global TCP statistics. */
2381 TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
2382
2383 tp->total_retrans++;
2384
2385 #if FASTRETRANS_DEBUG > 0
2386 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
2387 net_dbg_ratelimited("retrans_out leaked\n");
2388 }
2389 #endif
2390 if (!tp->retrans_out)
2391 tp->lost_retrans_low = tp->snd_nxt;
2392 TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; //记录skb被重传过
2393 tp->retrans_out += tcp_skb_pcount(skb);
2394
2395 /* Save stamp of the first retransmit. */
2396 if (!tp->retrans_stamp)
2397 tp->retrans_stamp = TCP_SKB_CB(skb)->when; //记录重传时间
2398
2399 tp->undo_retrans += tcp_skb_pcount(skb);
2400
2401 /* snd_nxt is stored to detect loss of retransmitted segment,
2402 * see tcp_input.c tcp_sacktag_write_queue().
2403 */
2404 TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
2405 }
2406 return err;
2407 }
综上,TCP重传定时器的基本功能是:如果有TFO socket则直接重传SYN|ACK,然后返回;如果没有,检查重传是否经过了太长的时间,若是则关闭连接并报告错误;否则重传发送队列中的首包,并将重传定时器设置为更长的超时时间。