9.6.1 Why
數據發送方收到接收方的通告窗口爲0時,就不能再發送數據,必須一直等到對方發送窗口更新爲止。但對端發送的窗口更新報文可能會丟失,如果發送方只是被動等待,數據傳輸就會一直停滯,最後連接會被斷開。這時堅持定時器閃亮登場!數據發送方可以設置堅持定時器,定時發送1個探測報文,對端收到後會對這個報文回覆ACK報文,這樣發送方就能及時得知窗口更新事件了。一旦窗口非0,數據傳輸就可以恢復正常。
9.6.2 When
設置堅持定時器的時機有兩個:
(1)TCP使用__tcp_push_pending_frames發送數據時:
2032 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2033 int nonagle) // push queued data via tcp_write_xmit(); arm the persist timer when it reports a stall
2034 {
2035 /* If we are closed, the bytes will have to remain here.
2036 * In time closedown will finish, we empty the write queue and
2037 * all will be happy.
2038 */
2039 if (unlikely(sk->sk_state == TCP_CLOSE))
2040 return;
2041
2042 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2043 sk_gfp_atomic(sk, GFP_ATOMIC))) // a true result means all data sent so far has been acknowledged while unsent data remains in the send queue, i.e. the window may be too small to send
2044 tcp_check_probe_timer(sk); // arm the persist timer
2045 }
tcp_check_probe_timer函數: 973 static inline void tcp_check_probe_timer(struct sock *sk) // arm the persist timer (ICSK_TIME_PROBE0) with a timeout of icsk_rto when nothing is in flight and no timer is pending
974 {
975 const struct tcp_sock *tp = tcp_sk(sk);
976 const struct inet_connection_sock *icsk = inet_csk(sk);
977
978 if (!tp->packets_out && !icsk->icsk_pending) // all data sent so far has been acknowledged and no persist, retransmit, ER or TLP timer is armed
979 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
980 icsk->icsk_rto, TCP_RTO_MAX);
981 }
(2)收到ACK時: 3168 static void tcp_ack_probe(struct sock *sk) // on ACK: clear the persist timer if the window now fits the next segment, otherwise re-arm it
3169 {
3170 const struct tcp_sock *tp = tcp_sk(sk);
3171 struct inet_connection_sock *icsk = inet_csk(sk);
3172
3173 /* Was it a usable window open? */
3174
3175 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { // the current window can hold the next segment to be sent
3176 icsk->icsk_backoff = 0;
3177 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);// clear the persist timer
3178 /* Socket must be waked up by subsequent tcp_data_snd_check().
3179 * This function is not for random using!
3180 */
3181 } else { // window still too small: re-arm with icsk_rto shifted by icsk_backoff, capped at TCP_RTO_MAX
3182 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3183 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
3184 TCP_RTO_MAX);
3185 }
3186 }
...
3325 static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3326 {
...
3365 prior_fackets = tp->fackets_out;
...
3409 if (!prior_packets)// all data sent so far has been acknowledged
3410 goto no_queue;
...
3443 no_queue:
...
3452 if (tcp_send_head(sk))// the send queue still holds unsent data
3453 tcp_ack_probe(sk);// update the persist timer (tcp_ack_probe either clears or re-arms it)
設置堅持定時器的條件是:所有發送出去的數據都已經被確認且發送隊列中還有數據未發送(這時不會設置重傳定時器、ER定時器和TLP定時器)。數據未發送的原因可能是發送窗口過小。
清除堅持定時器的條件是:
(1)發送窗口增大到能夠允許發送至少一個報文
(2)安裝了重傳定時器、ER定時器或TLP定時器
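堅持定時器的超時時間由RTO決定:首次設置時爲icsk_rto,之後每次重設時按icsk_backoff做指數退避,即min(icsk_rto << icsk_backoff, TCP_RTO_MAX),上限爲TCP_RTO_MAX。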
9.6.3 What
堅持定時器的超時函數爲tcp_probe_timer:
262 static void tcp_probe_timer(struct sock *sk) // persist-timer expiry handler: give up with tcp_write_err() or send another probe
263 {
264 struct inet_connection_sock *icsk = inet_csk(sk);
265 struct tcp_sock *tp = tcp_sk(sk);
266 int max_probes;
267
268 if (tp->packets_out || !tcp_send_head(sk)) { // data is still unacknowledged or the send queue is empty
269 icsk->icsk_probes_out = 0;
270 return;
271 }
...
288 max_probes = sysctl_tcp_retries2; // default probe limit; replaced below for orphan sockets
289
290 if (sock_flag(sk, SOCK_DEAD)) {// the socket is an orphan
291 const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);
292
293 max_probes = tcp_orphan_retries(sk, alive);
294
295 if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) // the orphan socket is using too many resources
296 return;
297 }
298
299 if (icsk->icsk_probes_out > max_probes) { // the number of probes exceeds the limit
300 tcp_write_err(sk);
301 } else {
302 /* Only send another probe if we didn't close things up. */
303 tcp_send_probe0(sk); // send a probe segment
304 }
305 }
tcp_send_probe0會發送探測報文: 3099 int tcp_write_wakeup(struct sock *sk) // send one window probe: new data if the window allows it, otherwise a data-less ACK
3100 {
3101 struct tcp_sock *tp = tcp_sk(sk);
3102 struct sk_buff *skb;
3103
3104 if (sk->sk_state == TCP_CLOSE)
3105 return -1;
3106
3107 if ((skb = tcp_send_head(sk)) != NULL && // there is data that has not been sent yet
3108 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { // the current window allows at least 1 byte of new data
3109 int err;
3110 unsigned int mss = tcp_current_mss(sk);
3111 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3112
3113 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
3114 tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
3115
3116 /* We are probing the opening of a window
3117 * but the window size is != 0
3118 * must have been a result SWS avoidance ( sender )
3119 */
3120 if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
3121 skb->len > mss) { // the segment is too large (for the window or the MSS)
3122 seg_size = min(seg_size, mss);
3123 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3124 if (tcp_fragment(sk, skb, seg_size, mss))
3125 return -1;
3126 } else if (!tcp_skb_pcount(skb))
3127 tcp_set_skb_tso_segs(sk, skb, mss);
3128
3129 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
3130 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3131 err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); // send new data as the probe segment
3132 if (!err) // transmitted successfully
3133 tcp_event_new_data_sent(sk, skb); // handle the "new data was sent" event
3134 return err;
3135 } else { // nothing to send, or the current send window does not allow new data
3136 if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) // urgent data is unacknowledged and within the window, so there must be data to send
3137 tcp_xmit_probe_skb(sk, 1); // additionally send an ACK whose sequence number is snd_una
3138 return tcp_xmit_probe_skb(sk, 0); // send an ACK with the old sequence number snd_una - 1
3139 }
3140 }
3141
3142 /* A window probe timeout has occurred. If window is not closed send
3143 * a partial packet else a zero probe.
3144 */
3145 void tcp_send_probe0(struct sock *sk)
3146 {
3147 struct inet_connection_sock *icsk = inet_csk(sk);
3148 struct tcp_sock *tp = tcp_sk(sk);
3149 int err;
3150
3151 err = tcp_write_wakeup(sk); // send the probe segment
3152
3153 if (tp->packets_out || !tcp_send_head(sk)) { // data is still unacknowledged or the send queue is empty
3154 /* Cancel probe timer, if it is not required. */
3155 icsk->icsk_probes_out = 0;
3156 icsk->icsk_backoff = 0;
3157 return;
3158 }
3159
3160 if (err <= 0) { // not a local-congestion failure: increase the backoff (bounded by sysctl_tcp_retries2), count the probe and re-arm
3161 if (icsk->icsk_backoff < sysctl_tcp_retries2)
3162 icsk->icsk_backoff++;
3163 icsk->icsk_probes_out++;
3164 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3165 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
3166 TCP_RTO_MAX);
3167 } else { // the packet was not sent because of queue congestion in a lower layer
3168 /* If packet was not sent due to local congestion,
3169 * do not backoff and do not remember icsk_probes_out.
3170 * Let local senders to fight for local resources.
3171 *
3172 * Use accumulated backoff yet.
3173 */
3174 if (!icsk->icsk_probes_out)
3175 icsk->icsk_probes_out = 1;
3176 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3177 min(icsk->icsk_rto << icsk->icsk_backoff,
3178 TCP_RESOURCE_PROBE_INTERVAL),
3179 TCP_RTO_MAX);
3180 }
3181 }
tcp_xmit_probe_skb函數用於發送一個無數據的報文:3068 static int tcp_xmit_probe_skb(struct sock *sk, int urgent) // send a data-less ACK; its sequence number is snd_una when urgent is non-zero, snd_una - 1 otherwise
3069 {
3070 struct tcp_sock *tp = tcp_sk(sk);
3071 struct sk_buff *skb;
3072
3073 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3074 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3075 if (skb == NULL) // allocation failed: report an error without sending
3076 return -1;
3077
3078 /* Reserve space for headers and set control bits. */
3079 skb_reserve(skb, MAX_TCP_HEADER);
3080 /* Use a previous sequence. This should cause the other
3081 * end to send an ack. Don't queue or clone SKB, just
3082 * send it.
3083 */
3084 tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
3085 TCP_SKB_CB(skb)->when = tcp_time_stamp;
3086 return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
3087 }
如果探測次數超出限制或內存緊張,堅持定時器會斷開連接;否則,發送探測報文,然後重設堅持定時器。發送探測報文時如果發送窗口允許發送至少一字節,則發送一個新的報文段;否則發送一個seq比較舊(snd_una - 1)的非法ACK,這樣對端收到後會丟棄之並發送ACK報文(如果有緊急數據未確認,則在此之前還會先額外發送一個seq爲snd_una、即seq最舊但合法的ACK,why?)。
總之,堅持定時器發送探測報文並期望對端能對探測報文發送ACK,這樣TCP就能得到最新的窗口信息。一旦窗口增加到可以發送數據,則正常的數據交互就可以儘快恢復。