9.8.1 Why
TCP在收到數據後必須發送ACK給對端,但如果每收到一個包就給一個ACK的話會使得網絡中被注入過多報文。TCP的做法是在收到數據時不立即發送ACK,而是設置一個定時器,如果在定時器超時之前有數據要發送給對端,則ACK會隨數據一起捎帶過去;如果直到定時器超時仍沒有數據要發送,則由定時器超時函數單獨發送ACK。這樣就減少了報文的發送,提高了協議的效率。
9.8.2 When
設置延遲ACK的時機主要有以下幾個:
(1)發送SYN後收到SYN|ACK時:
5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5374 const struct tcphdr *th, unsigned int len)
5375 {
...
5385 if (th->ack) {
...
5486 if (sk->sk_write_pending ||
5487 icsk->icsk_accept_queue.rskq_defer_accept ||
5488 icsk->icsk_ack.pingpong) {
5489 /* Save one ACK. Data will be ready after
5490 * several ticks, if write_pending is set.
5491 *
5492 * It may be deleted, but with this feature tcpdumps
5493 * look so _wonderfully_ clever, that I was not able
5494 * to stand against the temptation 8) --ANK
5495 */
5496 inet_csk_schedule_ack(sk);
5497 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5498 tcp_enter_quickack_mode(sk);
5499 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5500 TCP_DELACK_MAX, TCP_RTO_MAX); //設置延遲ACK定時器,超時時間200ms
(2)發送ACK時無法申請skb: net/ipv4/tcp_output.c
3027 void tcp_send_ack(struct sock *sk)
3028 {
3029 struct sk_buff *buff;
3030
3031 /* If we have been reset, we may not send again. */
3032 if (sk->sk_state == TCP_CLOSE)
3033 return;
3034
3035 /* We are not putting this on the write queue, so
3036 * tcp_transmit_skb() will set the ownership to this
3037 * sock.
3038 */
3039 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3040 if (buff == NULL) {
3041 inet_csk_schedule_ack(sk);
3042 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3043 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3044 TCP_DELACK_MAX, TCP_RTO_MAX); //超時時間200ms
3045 return;
3046 }
...
(3)有數據放入prequeue隊列中時:
1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920 {
...
1933 if (tp->ucopy.memory > sk->sk_rcvbuf) {
...
1945 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1946 wake_up_interruptible_sync_poll(sk_sleep(sk),
1947 POLLIN | POLLRDNORM | POLLRDBAND);
1948 if (!inet_csk_ack_scheduled(sk))
1949 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1950 (3 * tcp_rto_min(sk)) / 4,
1951 TCP_RTO_MAX);
...
(4)調用__tcp_ack_snd_check函數發送ACK時(發送ACK的具體情況見5.4 ACK發送與接收):
4758 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4759 {
4760 struct tcp_sock *tp = tcp_sk(sk);
4761
4762 /* More than one full frame received... */
4763 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
4764 /* ... and right edge of window advances far enough.
4765 * (tcp_recvmsg() will send ACK otherwise). Or...
4766 */
4767 __tcp_select_window(sk) >= tp->rcv_wnd) ||
4768 /* We ACK each frame or... */
4769 tcp_in_quickack_mode(sk) ||
4770 /* We have out of order data. */
4771 (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
4772 /* Then ack it now */
4773 tcp_send_ack(sk);
4774 } else {
4775 /* Else, send delayed ack. */
4776 tcp_send_delayed_ack(sk);
4777 }
4778 }
根據上述代碼我們總結一下(4)中使用延遲ACK定時器的條件:
(1)收到的數據(tp->rcv_nxt - tp->rcv_wup)不超過一個MSS,或者__tcp_select_window()算出的窗口小於當前的tp->rcv_wnd
(2)沒有處於快速ACK模式
(3)無亂序數據
上述條件都滿足時會調用tcp_send_delayed_ack來設置延遲ACK定時器:
2974 void tcp_send_delayed_ack(struct sock *sk)
2975 {
2976 struct inet_connection_sock *icsk = inet_csk(sk);
2977 int ato = icsk->icsk_ack.ato;
2978 unsigned long timeout;
2979
2980 if (ato > TCP_DELACK_MIN) {
2981 const struct tcp_sock *tp = tcp_sk(sk);
2982 int max_ato = HZ / 2;
2983
2984 if (icsk->icsk_ack.pingpong || //交互模式
2985 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) //需要儘快發送ACK
2986 max_ato = TCP_DELACK_MAX; //TCP_DELACK_MAX(200ms)小於默認的HZ/2(500ms),這裏是把延遲上限調小,而不是允許更大的延遲
2987
2988 /* Slow path, intersegment interval is "high". */
2989
2990 /* If some rtt estimate is known, use it to bound delayed ack.
2991 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
2992 * directly.
2993 */
2994 if (tp->srtt) {
2995 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
2996
2997 if (rtt < max_ato)
2998 max_ato = rtt;
2999 }
3000
3001 ato = min(ato, max_ato);
3002 }
3003
3004 /* Stay within the limit we were given */
3005 timeout = jiffies + ato; //ato不超過TCP_DELACK_MIN(1/25s,即40ms)時直接使用;否則已被限制在max_ato以內(不小於TCP_DELACK_MIN,最大爲HZ/2即500ms)
3006
3007 /* Use new timeout only if there wasn't a older one earlier. */
3008 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { //已經設置了延遲ACK定時器
3009 /* If delack timer was blocked or is about to expire,
3010 * send ACK now.
3011 */
3012 if (icsk->icsk_ack.blocked || //延遲ACK定時器被阻塞,可能是在延遲ACK定時器超時時socket被應用進程鎖定導致ACK無法發送
3013 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { //應該超時
3014 tcp_send_ack(sk); //立即發送ACK
3015 return;
3016 }
3017
3018 if (!time_before(timeout, icsk->icsk_ack.timeout))
3019 timeout = icsk->icsk_ack.timeout;
3020 }
3021 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3022 icsk->icsk_ack.timeout = timeout;
3023 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); //安裝延遲ACK定時器
3024 }
發送ACK時清除延遲ACK定時器:
178 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
179 {
180 tcp_dec_quickack_mode(sk, pkts);
181 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
182 }
...
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
940 if (likely(tcb->tcp_flags & TCPHDR_ACK))
941 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
9.8.3 What
延遲ACK定時器的超時函數爲tcp_delack_timer,實際的處理工作由它調用的tcp_delack_timer_handler完成:
197 void tcp_delack_timer_handler(struct sock *sk)
198 {
199 struct tcp_sock *tp = tcp_sk(sk);
200 struct inet_connection_sock *icsk = inet_csk(sk);
201
202 sk_mem_reclaim_partial(sk);
203
204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
205 goto out;
206
207 if (time_after(icsk->icsk_ack.timeout, jiffies)) { //未到超時時間
208 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
209 goto out;
210 }
211 icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
212
213 if (!skb_queue_empty(&tp->ucopy.prequeue)) { //處理prequeue隊列
214 struct sk_buff *skb;
215
216 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
217
218 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
219 sk_backlog_rcv(sk, skb);
220
221 tp->ucopy.memory = 0;
222 }
223
224 if (inet_csk_ack_scheduled(sk)) { //需要發送ACK
225 if (!icsk->icsk_ack.pingpong) { //非交互模式:延遲ACK已超時,將ATO加倍(上限爲RTO)
226 /* Delayed ACK missed: inflate ATO. */
227 icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
228 } else { //交互模式:退出交互模式,並將ATO降到最小值TCP_ATO_MIN
229 /* Delayed ACK missed: leave pingpong mode and
230 * deflate ATO.
231 */
232 icsk->icsk_ack.pingpong = 0;
233 icsk->icsk_ack.ato = TCP_ATO_MIN;
234 }
235 tcp_send_ack(sk); //發送ACK
236 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
237 }
238
239 out:
240 if (sk_under_memory_pressure(sk))
241 sk_mem_reclaim(sk);
242 }
243
244 static void tcp_delack_timer(unsigned long data)
245 {
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1; //標識延遲ACK被鎖定,以後安裝延遲ACK定時器時要立即發送ACK
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254 /* deleguate our work to tcp_release_cb() */
255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
257 }
258 bh_unlock_sock(sk);
259 sock_put(sk);
260 }
255:如果延遲ACK定時器超時時socket被應用進程鎖定,則設置TCP_DELACK_TIMER_DEFERRED標記,這樣在應用進程釋放socket時會調用tcp_release_cb函數:
741 void tcp_release_cb(struct sock *sk)
742 {
...
748 flags = tp->tsq_flags;
...
761 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
762 tcp_delack_timer_handler(sk);
763 __sock_put(sk);
764 }
...
tcp_delack_timer_handler函數最終也會獲得運行機會。