9.8.1 Why
TCP在收到数据后必须发送ACK给对端,但如果每收到一个包就给一个ACK的话会使得网络中被注入过多报文。TCP的做法是在收到数据时不立即发送ACK,而是设置一个定时器,如果在定时器超时之前有数据发送给对端,则ACK会被携带在数据中捎带过去;超时则由定时器发送ACK。这样就减少了报文的发送,提高了协议的效率。
9.8.2 When
设置延迟ACK的时机主要有以下几个:
(1)发送SYN后收到SYN|ACK时:
5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5374 const struct tcphdr *th, unsigned int len)
5375 {
...
5385 if (th->ack) {
...
5486 if (sk->sk_write_pending ||
5487 icsk->icsk_accept_queue.rskq_defer_accept ||
5488 icsk->icsk_ack.pingpong) {
5489 /* Save one ACK. Data will be ready after
5490 * several ticks, if write_pending is set.
5491 *
5492 * It may be deleted, but with this feature tcpdumps
5493 * look so _wonderfully_ clever, that I was not able
5494 * to stand against the temptation 8) --ANK
5495 */
5496 inet_csk_schedule_ack(sk);
5497 icsk->icsk_ack.lrcvtime = tcp_time_stamp;
5498 tcp_enter_quickack_mode(sk);
5499 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
5500 TCP_DELACK_MAX, TCP_RTO_MAX); //设置延迟ACK定时器,超时时间200ms
(2)发送ACK时无法申请skb: net/ipv4/tcp_output.c
3027 void tcp_send_ack(struct sock *sk)
3028 {
3029 struct sk_buff *buff;
3030
3031 /* If we have been reset, we may not send again. */
3032 if (sk->sk_state == TCP_CLOSE)
3033 return;
3034
3035 /* We are not putting this on the write queue, so
3036 * tcp_transmit_skb() will set the ownership to this
3037 * sock.
3038 */
3039 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3040 if (buff == NULL) {
3041 inet_csk_schedule_ack(sk);
3042 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
3043 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
3044 TCP_DELACK_MAX, TCP_RTO_MAX); //超时时间200ms
3045 return;
3046 }
...
(3)有数据放入prequeue队列中时:
1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920 {
...
1933 if (tp->ucopy.memory > sk->sk_rcvbuf) {
...
1945 } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1946 wake_up_interruptible_sync_poll(sk_sleep(sk),
1947 POLLIN | POLLRDNORM | POLLRDBAND);
1948 if (!inet_csk_ack_scheduled(sk))
1949 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1950 (3 * tcp_rto_min(sk)) / 4,
1951 TCP_RTO_MAX);
...
(4)调用__tcp_ack_snd_check函数发送ACK时(发送ACK的具体情况见5.4 ACK发送与接收):
4758 static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
4759 {
4760 struct tcp_sock *tp = tcp_sk(sk);
4761
4762 /* More than one full frame received... */
4763 if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
4764 /* ... and right edge of window advances far enough.
4765 * (tcp_recvmsg() will send ACK otherwise). Or...
4766 */
4767 __tcp_select_window(sk) >= tp->rcv_wnd) ||
4768 /* We ACK each frame or... */
4769 tcp_in_quickack_mode(sk) ||
4770 /* We have out of order data. */
4771 (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
4772 /* Then ack it now */
4773 tcp_send_ack(sk);
4774 } else {
4775 /* Else, send delayed ack. */
4776 tcp_send_delayed_ack(sk);
4777 }
4778 }
根据上述代码我们总结一下(4)中使用延迟ACK定时器的条件:
(1)收到的数据不超过一个MSS(tp->rcv_nxt - tp->rcv_wup <= rcv_mss),或者新计算出的可通告窗口小于当前接收窗口(__tcp_select_window(sk) < tp->rcv_wnd)
(2)没有处于快速ACK模式
(3)ofo_possible为0,或者乱序队列(out_of_order_queue)中没有数据
上述条件都满足时则会调用tcp_send_delayed_ack来设置延迟ACK定时器:
2974 void tcp_send_delayed_ack(struct sock *sk)
2975 {
2976 struct inet_connection_sock *icsk = inet_csk(sk);
2977 int ato = icsk->icsk_ack.ato;
2978 unsigned long timeout;
2979
2980 if (ato > TCP_DELACK_MIN) {
2981 const struct tcp_sock *tp = tcp_sk(sk);
2982 int max_ato = HZ / 2;
2983
2984 if (icsk->icsk_ack.pingpong || //交互模式
2985 (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) //需要尽快发送ACK,这时为什么要用最大的延迟?
2986 max_ato = TCP_DELACK_MAX; //允许更大的延迟
2987
2988 /* Slow path, intersegment interval is "high". */
2989
2990 /* If some rtt estimate is known, use it to bound delayed ack.
2991 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
2992 * directly.
2993 */
2994 if (tp->srtt) {
2995 int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
2996
2997 if (rtt < max_ato)
2998 max_ato = rtt;
2999 }
3000
3001 ato = min(ato, max_ato);
3002 }
3003
3004 /* Stay within the limit we were given */
3005 timeout = jiffies + ato; //超时时间为ato;若ato大于TCP_DELACK_MIN(1/25s,即40ms),此时已被上面的代码限制为不超过max_ato
3006
3007 /* Use new timeout only if there wasn't a older one earlier. */
3008 if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { //已经设置了延迟ACK定时器
3009 /* If delack timer was blocked or is about to expire,
3010 * send ACK now.
3011 */
3012 if (icsk->icsk_ack.blocked || //延迟ACK定时器被阻塞,可能是在延迟ACK定时器超时时socket被应用进程锁定导致ACK无法发送
3013 time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { //已设置的定时器已经超时,或将在ato/4时间内超时
3014 tcp_send_ack(sk); //立即发送ACK
3015 return;
3016 }
3017
3018 if (!time_before(timeout, icsk->icsk_ack.timeout))
3019 timeout = icsk->icsk_ack.timeout;
3020 }
3021 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
3022 icsk->icsk_ack.timeout = timeout;
3023 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); //安装延迟ACK定时器
3024 }
发送ACK时清除延迟ACK定时器:
178 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
179 {
180 tcp_dec_quickack_mode(sk, pkts);
181 inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
182 }
...
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
940 if (likely(tcb->tcp_flags & TCPHDR_ACK))
941 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
9.8.3 What
延迟ACK定时器的超时函数为tcp_delack_timer,实际的处理工作由它调用的tcp_delack_timer_handler完成:
197 void tcp_delack_timer_handler(struct sock *sk)
198 {
199 struct tcp_sock *tp = tcp_sk(sk);
200 struct inet_connection_sock *icsk = inet_csk(sk);
201
202 sk_mem_reclaim_partial(sk);
203
204 if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
205 goto out;
206
207 if (time_after(icsk->icsk_ack.timeout, jiffies)) { //未到超时时间
208 sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
209 goto out;
210 }
211 icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
212
213 if (!skb_queue_empty(&tp->ucopy.prequeue)) { //处理prequeue队列
214 struct sk_buff *skb;
215
216 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
217
218 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
219 sk_backlog_rcv(sk, skb);
220
221 tp->ucopy.memory = 0;
222 }
223
224 if (inet_csk_ack_scheduled(sk)) { //需要发送ACK
225 if (!icsk->icsk_ack.pingpong) { //非交互模式:延迟ACK超时,将ato加倍(但不超过icsk_rto)
226 /* Delayed ACK missed: inflate ATO. */
227 icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
228 } else { //交互模式:延迟ACK超时,退出交互模式并将ato重置为最小值TCP_ATO_MIN
229 /* Delayed ACK missed: leave pingpong mode and
230 * deflate ATO.
231 */
232 icsk->icsk_ack.pingpong = 0;
233 icsk->icsk_ack.ato = TCP_ATO_MIN;
234 }
235 tcp_send_ack(sk); //发送ACK
236 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
237 }
238
239 out:
240 if (sk_under_memory_pressure(sk))
241 sk_mem_reclaim(sk);
242 }
243
244 static void tcp_delack_timer(unsigned long data)
245 {
246 struct sock *sk = (struct sock *)data;
247
248 bh_lock_sock(sk);
249 if (!sock_owned_by_user(sk)) {
250 tcp_delack_timer_handler(sk);
251 } else {
252 inet_csk(sk)->icsk_ack.blocked = 1; //标识延迟ACK被锁定,以后安装延迟ACK定时器时要立即发送ACK
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254 /* deleguate our work to tcp_release_cb() */
255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
257 }
258 bh_unlock_sock(sk);
259 sock_put(sk);
260 }
255:如果延迟ACK定时器超时时socket被应用进程锁定,则设置TCP_DELACK_TIMER_DEFERRED标记,这样在应用进程释放socket时会调用tcp_release_cb函数:
741 void tcp_release_cb(struct sock *sk)
742 {
...
748 flags = tp->tsq_flags;
...
761 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
762 tcp_delack_timer_handler(sk);
763 __sock_put(sk);
764 }
...
tcp_delack_timer_handler函数最终也会获得运行机会。