9.7.1 Why
当单工模式下TCP数据发送方发送了一些数据后就不再发数据,数据接收方也不会发送报文,这时TCP连接处于静止状态(比如Telnet应用)。保活功能可以使用保活定时器向对端发送探测报文来确定对端的连接是否正常,如果对端有回应则继续维持连接,否则关闭连接,释放资源。开启保活功能需要使用SO_KEEPALIVE socket选项。
9.7.2 When
设置保活定时器的时机主要有三个:
(1)客户端发送SYN后收到SYN|ACK,调用tcp_finish_connect函数完成连接时:
5291 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5292 {
5293 struct tcp_sock *tp = tcp_sk(sk);
5294 struct inet_connection_sock *icsk = inet_csk(sk);
5295
5296 tcp_set_state(sk, TCP_ESTABLISHED);
...
5317 if (sock_flag(sk, SOCK_KEEPOPEN)) //应用进程开启keepalive服务
5318 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
...
5373 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5374 const struct tcphdr *th, unsigned int len)
5375 {
...
5480 tcp_finish_connect(sk, skb);
...
(2)服务器端发送SYN|ACK后收到合法的ACK,调用tcp_create_openreq_child创建子socket时:
381 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
382 {
383 struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
384
385 if (newsk != NULL) {
...
441 if (sock_flag(newsk, SOCK_KEEPOPEN)) //应用进程开启keepalive服务
442 inet_csk_reset_keepalive_timer(newsk,
443 keepalive_time_when(newtp));
...
(3)使用SO_KEEPALIVE socket选项开启保活功能时:
621 int sock_setsockopt(struct socket *sock, int level, int optname,
622 char __user *optval, unsigned int optlen)
623 {
...
727 case SO_KEEPALIVE:
728 #ifdef CONFIG_INET
729 if (sk->sk_protocol == IPPROTO_TCP &&
730 sk->sk_type == SOCK_STREAM)
731 tcp_set_keepalive(sk, valbool);
732 #endif
733 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
734 break;
tcp_set_keepalive函数用于开启或关闭keepalive服务:
546 void tcp_set_keepalive(struct sock *sk, int val)
547 {
548 if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
549 return;
550
551 if (val && !sock_flag(sk, SOCK_KEEPOPEN))
552 inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
553 else if (!val)
554 inet_csk_delete_keepalive_timer(sk); //拆除keepalive定时器
555 }
拆除keepalive定时器只能使用SO_KEEPALIVE socket选项。Keepalive定时器的超时时间由keepalive_time_when函数决定:
1114 static inline int keepalive_time_when(const struct tcp_sock *tp)
1115 {
1116 return tp->keepalive_time ? : sysctl_tcp_keepalive_time;
1117 }
其中sysctl_tcp_keepalive_time由net.ipv4.tcp_keepalive_time内核参数设定,tp->keepalive_time由TCP_KEEPIDLE socket选项设置:
static int do_tcp_setsockopt(struct sock *sk, int level,
		int optname, char __user *optval, unsigned int optlen)
{
...
2525 case TCP_KEEPIDLE:
2526 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2527 err = -EINVAL;
2528 else {
2529 tp->keepalive_time = val * HZ;
2530 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2531 !((1 << sk->sk_state) &
2532 (TCPF_CLOSE | TCPF_LISTEN))) {
2533 u32 elapsed = keepalive_time_elapsed(tp);
2534 if (tp->keepalive_time > elapsed)
2535 elapsed = tp->keepalive_time - elapsed;
2536 else
2537 elapsed = 0;
2538 inet_csk_reset_keepalive_timer(sk, elapsed);
2539 }
2540 }
2541 break;
Keepalive定时器默认超时时间为TCP_KEEPALIVE_TIME(2小时)。
9.7.3 What
保活定时器的超时处理函数为tcp_keepalive_timer:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
560 struct sock *sk = (struct sock *) data;
561 struct inet_connection_sock *icsk = inet_csk(sk);
562 struct tcp_sock *tp = tcp_sk(sk);
563 u32 elapsed;
564
565 /* Only process if socket is not in use. */
566 bh_lock_sock(sk);
567 if (sock_owned_by_user(sk)) {
568 /* Try again later. */
569 inet_csk_reset_keepalive_timer (sk, HZ/20);
570 goto out;
571 }
572
573 if (sk->sk_state == TCP_LISTEN) { //SYN-ACK定时器超时处理
574 tcp_synack_timer(sk);
575 goto out;
576 }
577
578 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {//FIN_WAIT2定时器超时处理
...
589 }
590
591 if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) //keepalive功能未开启或socket已经关闭
592 goto out;
593
594 elapsed = keepalive_time_when(tp);
595
596 /* It is alive without keepalive 8) */
597 if (tp->packets_out || tcp_send_head(sk)) //有包在网络中或有数据未发
598 goto resched; //连接是活动的,无需keepalive定时器操心
599
600 elapsed = keepalive_time_elapsed(tp); //计算自收到最后一个包到现在经历了多长时间
601
602 if (elapsed >= keepalive_time_when(tp)) { //自收到最后一个包到现在经历的时间已达到保活定时器的超时门限
603 /* If the TCP_USER_TIMEOUT option is enabled, use that
604 * to determine when to timeout instead.
605 */
606 if ((icsk->icsk_user_timeout != 0 && //应用进程使用TCP_USER_TIMEOUT socket选项设置了超时时间
607 elapsed >= icsk->icsk_user_timeout && //未活动时间超过应用进程的限制
608 icsk->icsk_probes_out > 0) || //发送过探测报文
609 (icsk->icsk_user_timeout == 0 && //应用进程未设置超时时间
610 icsk->icsk_probes_out >= keepalive_probes(tp))) { //探测次数超过用户设定的门限
611 tcp_send_active_reset(sk, GFP_ATOMIC); //发送RST复位连接
612 tcp_write_err(sk); //发送出错报告,关闭本端连接
613 goto out;
614 }
615 if (tcp_write_wakeup(sk) <= 0) { //发送探测包
616 icsk->icsk_probes_out++; //此计数在收到ACK时清零
617 elapsed = keepalive_intvl_when(tp); //设置超时时间为探测间隔时间
618 } else { //由于底层队列拥塞包没有发送出去
619 /* If keepalive was lost due to local congestion,
620 * try harder.
621 */
622 elapsed = TCP_RESOURCE_PROBE_INTERVAL; //缩短时间间隔为0.5s
623 }
624 } else {
625 /* It is tp->rcv_tstamp + keepalive_time_when(tp) */
626 elapsed = keepalive_time_when(tp) - elapsed; //设置keepalive在从收到最后一个包开始到一个超时周期时再超时
627 }
628
629 sk_mem_reclaim(sk); //回收内存资源
630
631 resched:
632 inet_csk_reset_keepalive_timer (sk, elapsed); //重新设置保活定时器
633 goto out;
...
638 out:
639 bh_unlock_sock(sk);
640 sock_put(sk);
641 }
keepalive_probes函数返回最大探测次数:
1119 static inline int keepalive_probes(const struct tcp_sock *tp)
1120 {
1121 return tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
1122 }
其中,tp->keepalive_probes由TCP_KEEPCNT socket选项设定,sysctl_tcp_keepalive_probes(默认为TCP_KEEPALIVE_PROBES,即9)由net.ipv4.tcp_keepalive_probes内核参数设定。 keepalive_intvl_when函数返回探测间隔时间:
1109 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
1110 {
1111 return tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl;
1112 }
其中,tp->keepalive_intvl由TCP_KEEPINTVL socket选项设定,sysctl_tcp_keepalive_intvl(默认是TCP_KEEPALIVE_INTVL,即75s)由net.ipv4.tcp_keepalive_intvl内核参数设定。
现总结一下keepalive定时器的特性。在连接建立完成伊始就设置keepalive定时器,应用进程也可以使用socket选项设置或禁用它;每次定时器超时的时候,在自收到最后一个包到现在经历的时间超过保活定时器的超时门限的情况下,如果超过了应用进程设定的超时上限或探测次数则发送RST报文给对端并关闭连接,否则发送探测报文(发送探测报文的过程见9.6 坚持(Persist)定时器),增加探测计数,并将超时时间设置为keepalive_intvl,等待下次超时。当收到ACK时探测计数清零,收包时间也会刷新,整个探测过程重新开始。