9.10.1 Why
當socket進入TIME_WAIT狀態後,TIME_WAIT定時器啓動。在超時之前,替代socket的tw sock會處理舊連接中的包,阻止其危害新連接。定時器超時後,tw sock被刪除,並釋放其佔用的端口號。
9.10.2 When
TIME_WAIT定時器的安裝由tcp_time_wait函數完成,調用tcp_time_wait函數的時機有:
(1)在TCP_FIN_WAIT2狀態下socket關閉,沒有用TCP_LINGER2選項將tp->linger2設置爲小於0且tcp_fin_time的大小小於等於TCP_TIMEWAIT_LEN:
2059 void tcp_close(struct sock *sk, long timeout)
2060 {
...
2183 if (sk->sk_state == TCP_FIN_WAIT2) {
2184 struct tcp_sock *tp = tcp_sk(sk);
2185 if (tp->linger2 < 0) {
...
2190 } else {
2191 const int tmo = tcp_fin_time(sk);
2192
2193 if (tmo > TCP_TIMEWAIT_LEN) {
2194 inet_csk_reset_keepalive_timer(sk,
2195 tmo - TCP_TIMEWAIT_LEN);
2196 } else {
2197 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2198 goto out;
2199 }
2200 }
...
(2)TCP_FIN_WAIT2狀態下收到FIN併發送ACK後:
3783 static void tcp_fin(struct sock *sk)
3784 {
...
3818 case TCP_FIN_WAIT2:
3819 /* Received a FIN -- send ACK and enter TIME_WAIT. */
3820 tcp_send_ack(sk);
3821 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
(3)孤兒socket在TCP_FIN_WAIT1狀態下收到ACK時,滿足:
1)沒有用TCP_LINGER2選項將tp->linger2設置爲小於0
2)tcp_fin_time的大小小於等於TCP_TIMEWAIT_LEN:
3)ACK中沒有數據或數據全是舊的
4)ACK中沒有FIN標記並且socket沒有被應用進程鎖定
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5751 case TCP_FIN_WAIT1:
...
5780 if (!sock_flag(sk, SOCK_DEAD))
5781 /* Wake up lingering close() */
5782 sk->sk_state_change(sk);
5783 else {
5784 int tmo;
5785
5786 if (tp->linger2 < 0 ||
5787 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5788 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5789 tcp_done(sk);
5790 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5791 return 1;
5792 }
5793
5794 tmo = tcp_fin_time(sk);
5795 if (tmo > TCP_TIMEWAIT_LEN) {
5796 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5797 } else if (th->fin || sock_owned_by_user(sk)) {
5798 /* Bad case. We could lose such FIN otherwise.
5799 * It is not a big problem, but it looks confusing
5800 * and not so rare event. We still can lose it now,
5801 * if it spins in bh_lock_sock(), but it is really
5802 * marginal case.
5803 */
5804 inet_csk_reset_keepalive_timer(sk, tmo);
5805 } else {
5806 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5807 goto discard;
5808 }
...
(4)TCP在TCP_CLOSING狀態下收到ACK時:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5813 case TCP_CLOSING:
5814 if (tp->snd_una == tp->write_seq) {
5815 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
5816 goto discard;
5817 }
...
(5)FIN_WAIT2定時器超時時,沒有用TCP_LINGER2選項將tp->linger2設置爲小於0且tcp_fin_time的大小大於TCP_TIMEWAIT_LEN:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
...
578 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579 if (tp->linger2 >= 0) {
580 const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
581
582 if (tmo > 0) {
583 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
584 goto out;
585 }
...
tcp_time_wait函數會調用inet_twsk_schedule函數安裝TIME_WAIT定時器:
266 void tcp_time_wait(struct sock *sk, int state, int timeo)
267 {
...
327 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); //將tw sock放入ESTABLISHED hash表和bind hash表中,將sk從ESTABLISHED hash表中移除
328
329 /* Get the TIME_WAIT timeout firing. */
330 if (timeo < rto)
331 timeo = rto;
332
333 if (recycle_ok) {
334 tw->tw_timeout = rto;
335 } else {
336 tw->tw_timeout = TCP_TIMEWAIT_LEN;
337 if (state == TCP_TIME_WAIT)
338 timeo = TCP_TIMEWAIT_LEN;
339 }
340
341 inet_twsk_schedule(tw, &tcp_death_row, timeo,
342 TCP_TIMEWAIT_LEN);
343 inet_twsk_put(tw);
...
__inet_twsk_hashdance函數將tw_sock加入到bind hash表和ESTABLISHED表中,這樣在tw_sock被刪除之前相應IP|端口不允許bind,也不允許建立:
126 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
127 struct inet_hashinfo *hashinfo)
128 {
129 const struct inet_sock *inet = inet_sk(sk);
130 const struct inet_connection_sock *icsk = inet_csk(sk);
131 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
132 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
133 struct inet_bind_hashbucket *bhead;
134 /* Step 1: Put TW into bind hash. Original socket stays there too.
135 Note, that any socket with inet->num != 0 MUST be bound in
136 binding cache, even if it is closed.
137 */
138 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
139 hashinfo->bhash_size)];
140 spin_lock(&bhead->lock);
141 tw->tw_tb = icsk->icsk_bind_hash;
142 WARN_ON(!icsk->icsk_bind_hash);
143 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); //加入到bind hash表中
144 spin_unlock(&bhead->lock);
145
146 spin_lock(lock);
...
153 inet_twsk_add_node_rcu(tw, &ehead->twchain); //加入到ESTABLISHED hash表中
154
155 /* Step 3: Remove SK from established hash. */
156 if (__sk_nulls_del_node_init_rcu(sk))
157 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
...
167 atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168
169 spin_unlock(lock);
170 }
這樣,在應用進程使用bind系統調用綁定與tw_sock相同的IP|端口對時內核會用到inet_csk_bind_conflict函數,但由於成功匹配到bind hash表中的tw_sock,會導致衝突,無法bind(詳見2.2 Bind系統調用)。而在建立連接時,inet_hash_connect函數會調用__inet_check_established檢查即將建立的連接是否與已建立的連接衝突:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312 struct sock *sk, __u16 lport,
313 struct inet_timewait_sock **twp)
314 {
...
335 sk_nulls_for_each(sk2, node, &head->twchain) {
336 if (sk2->sk_hash != hash)
337 continue;
338
339 if (likely(INET_TW_MATCH(sk2, net, acookie,
340 saddr, daddr, ports, dif))) { //地址|端口匹配
341 tw = inet_twsk(sk2);
342 if (twsk_unique(sk, sk2, twp)) //調用tcp_twsk_unique判斷是否衝突
343 goto unique; //不衝突
344 else
345 goto not_unique; //衝突
346 }
347 }
348 tw = NULL;
...
359 unique:
...
376 if (twp) {
377 *twp = tw; //交給調用者處理
378 } else if (tw) {
379 /* Silly. Should hash-dance instead... */
380 inet_twsk_deschedule(tw, death_row);
381
382 inet_twsk_put(tw);
383 }
384 return 0;
385
386 not_unique:
387 spin_unlock(lock);
388 return -EADDRNOTAVAIL;
389 }
tcp_twsk_unique函數:
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
...
125 if (tcptw->tw_ts_recent_stamp && //開啓時間戳選項且在TIME_WAIT狀態下收到過包
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
可見,當:
(1)__inet_check_established函數的調用者不需要返回tw_sock的時候(即twp == NULL爲真),或
(2)應用進程設置了net.ipv4.tcp_tw_reuse內核選項允許tw_sock重用時,
tcp_twsk_unique函數會返回1,即不衝突。不衝突時如果是(1),則__inet_check_established函數會釋放tw_sock;否則會將tw_sock返回給調用者inet_hash_connect函數處理。在不衝突時,情況(1)發生時到底意味着什麼?情況(1)沒有發生時inet_hash_connect函數用tw_sock幹什麼?來看代碼:
589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590 struct sock *sk)
591 {
592 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593 __inet_check_established, __inet_hash_nolisten);
594 }
看來__inet_check_established函數的使用者是__inet_hash_connect函數:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478 struct sock *sk, u32 port_offset,
479 int (*check_established)(struct inet_timewait_death_row *,
480 struct sock *, __u16, struct inet_timewait_sock **),
481 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
...
491 if (!snum) {
...
520 if (!check_established(death_row, sk,
521 port, &tw))
522 goto ok;
...
544 ok:
545 hint += i;
546
547 /* Head lock still held and bh's disabled */
548 inet_bind_hash(sk, tb, port);
549 if (sk_unhashed(sk)) {
550 inet_sk(sk)->inet_sport = htons(port);
551 twrefcnt += hash(sk, tw); //將sk加入到ESTABLISHED hash表中,將tw_sock從這個表中摘出
552 }
553 if (tw)
554 twrefcnt += inet_twsk_bind_unhash(tw, hinfo); //將tw_sock從bind hash表中摘出
555 spin_unlock(&head->lock);
556
557 if (tw) {
558 inet_twsk_deschedule(tw, death_row); //釋放tw_sock
559 while (twrefcnt) {
560 twrefcnt--;
561 inet_twsk_put(tw);
562 }
563 }
564
565 ret = 0;
566 goto out;
567 }
568
569 head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570 tb = inet_csk(sk)->icsk_bind_hash;
571 spin_lock_bh(&head->lock);
572 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { //綁定到這個IP|port對的只有這一個socket
573 hash(sk, NULL);
574 spin_unlock_bh(&head->lock);
575 return 0;
576 } else {
577 spin_unlock(&head->lock);
578 /* No definite answer... Walk to established hash table */
579 ret = check_established(death_row, sk, snum, NULL);
580 out:
581 local_bh_enable();
582 return ret;
583 }
584 }
要綁定的端口非0時,情況(1)纔會發生,這時意味着應用進程在調用connect系統調用之前已經成功地使用了bind系統調用,既然bind時不衝突,那麼在connect時直接將tw_sock釋放即可。而情況(1)沒有發生時,tw_sock也會被釋放並從hash表中摘出。
tcp_death_row的定義爲:
35 struct inet_timewait_death_row tcp_death_row = {
36 .sysctl_max_tw_buckets = NR_FILE * 2,
37 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
38 .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
39 .hashinfo = &tcp_hashinfo,
40 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
41 (unsigned long)&tcp_death_row),
42 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
43 inet_twdr_twkill_work),
44 /* Short-time timewait calendar */
45
46 .twcal_hand = -1,
47 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
48 (unsigned long)&tcp_death_row),
49 };
inet_twsk_schedule函數:
340 void inet_twsk_schedule(struct inet_timewait_sock *tw,
341 struct inet_timewait_death_row *twdr,
342 const int timeo, const int timewait_len)
343 {
344 struct hlist_head *list;
345 int slot;
346
... //計算tw sock加入到time_wait定時器鏈表中的位置,slot越大則超時時間越長
371 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
372
373 spin_lock(&twdr->death_lock);
374
375 /* Unlink it, if it was scheduled */
376 if (inet_twsk_del_dead_node(tw))//已經在time_wait定時器鏈表中了,則摘除
377 twdr->tw_count--;
378 else
379 atomic_inc(&tw->tw_refcnt);
380
381 if (slot >= INET_TWDR_RECYCLE_SLOTS) { //超時時間過長,使用慢速定時器
382 /* Schedule to slow timer */
383 if (timeo >= timewait_len) {
384 slot = INET_TWDR_TWKILL_SLOTS - 1;
385 } else {
386 slot = DIV_ROUND_UP(timeo, twdr->period);
387 if (slot >= INET_TWDR_TWKILL_SLOTS)
388 slot = INET_TWDR_TWKILL_SLOTS - 1;
389 }
390 tw->tw_ttd = jiffies + timeo;
391 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
392 list = &twdr->cells[slot]; //添加tw_sock到twdr->cells中
393 } else { //超時時間短的都放入再生定時器中
394 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
395
396 if (twdr->twcal_hand < 0) { //再生定時器未設置或已經超時
397 twdr->twcal_hand = 0;
398 twdr->twcal_jiffie = jiffies; //記錄初次設置定時器的時間
399 twdr->twcal_timer.expires = twdr->twcal_jiffie +
400 (slot << INET_TWDR_RECYCLE_TICK);
401 add_timer(&twdr->twcal_timer);//設置再生定時器
402 } else {
403 if (time_after(twdr->twcal_timer.expires,
404 jiffies + (slot << INET_TWDR_RECYCLE_TICK))) //再生定時器未超時
405 mod_timer(&twdr->twcal_timer,
406 jiffies + (slot << INET_TWDR_RECYCLE_TICK));//設置再生超時定時器
407 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
408 }
409 list = &twdr->twcal_row[slot]; //添加tw_sock到twdr->twcal_row中
410 }
411
412 hlist_add_head(&tw->tw_death_node, list);//加入到time_wait定時器鏈表中
413
414 if (twdr->tw_count++ == 0)//加入之前time_wait定時器鏈表中沒有成員
415 mod_timer(&twdr->tw_timer, jiffies + twdr->period); //設置慢速定時器
416 spin_unlock(&twdr->death_lock);
417 }
371:按照超時時間長短劃分slot:0 jiffies爲slot 0,1-2^INET_TWDR_RECYCLE_TICK jiffies爲slot 1,2^INET_TWDR_RECYCLE_TICK + 1 -2^(INET_TWDR_RECYCLE_TICK + 1)爲slot 2...每個slot 的時間長度是2^INET_TWDR_RECYCLE_TICK個jiffies。
386:按照超時時間長短劃分slot,每個slot的時間長度是twdr->period。
可見TIME_WAIT定時器包含2個定時器結構:twcal_timer和tw_timer。其中twcal_timer的超時時間較短,被稱爲“再生定時器”。
tw_timer的超時時間是TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS(即7.5s),刪除的條件有:
(1)在應用進程使用connect系統調用綁定IP|端口時匹配到tw_sock,但判定不衝突時(詳見__inet_twsk_hashdance函數相關分析);
(2)調用inet_twsk_deschedule刪除一個tw_sock,如果tw隊列中沒有成員,則禁用tw_timer:
326 void inet_twsk_deschedule(struct inet_timewait_sock *tw,
327 struct inet_timewait_death_row *twdr)
328 {
329 spin_lock(&twdr->death_lock);
330 if (inet_twsk_del_dead_node(tw)) {
331 inet_twsk_put(tw);
332 if (--twdr->tw_count == 0) //tw隊列爲空
333 del_timer(&twdr->tw_timer); //刪除tw_timer
334 }
335 spin_unlock(&twdr->death_lock);
336 __inet_twsk_kill(tw, twdr->hashinfo);
337 }
__inet_twsk_kill會將tw_sock從bind hash表和ESTABLISHED hash表中刪除:
70 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
71 struct inet_hashinfo *hashinfo)
72 {
73 struct inet_bind_hashbucket *bhead;
74 int refcnt;
75 /* Unlink from established hashes. */
76 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
77
78 spin_lock(lock);
79 refcnt = inet_twsk_unhash(tw); //從ESTABLISHED hash表中刪除
80 spin_unlock(lock);
81
82 /* Disassociate with bind bucket. */
83 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
84 hashinfo->bhash_size)];
85
86 spin_lock(&bhead->lock);
87 refcnt += inet_twsk_bind_unhash(tw, hashinfo); //從bind hash表中刪除
88 spin_unlock(&bhead->lock);
89
90 #ifdef SOCK_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) {
92 pr_debug("%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 }
95 #endif
96 while (refcnt) {
97 inet_twsk_put(tw);
98 refcnt--;
99 }
100 }
(3)twcal_timer超時時調用inet_twdr_twcal_tick刪除tw_sock,如果tw隊列中沒有成員,則禁用tw_timer.
再生定時器不會被刪除,其超時時間爲slot * 2^INET_TWDR_RECYCLE_TICK。INET_TWDR_RECYCLE_TICK的定義如下:
41 #if HZ <= 16 || HZ > 4096
42 # error Unsupported: HZ <= 16 or HZ > 4096
43 #elif HZ <= 32
44 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
45 #elif HZ <= 64
46 # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
47 #elif HZ <= 128
48 # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
49 #elif HZ <= 256
50 # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
51 #elif HZ <= 512
52 # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
53 #elif HZ <= 1024
54 # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
55 #elif HZ <= 2048
56 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
57 #else
58 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
59 #endif
如果jiffies每1ms加1,則INET_TWDR_RECYCLE_TICK的值爲7;如果timeo的值爲60s(通常是最大值),則slot的值爲469,那麼再生定時器的最大超時時間爲60s.如果1ms <= timeo <= 128ms,則slot = 1,再生定時器的最小超時時間爲127ms.
9.10.3 What
twcal_timer對應的超時函數是inet_twdr_twcal_tick:
420 void inet_twdr_twcal_tick(unsigned long data)
421 {
422 struct inet_timewait_death_row *twdr;
423 int n, slot;
424 unsigned long j;
425 unsigned long now = jiffies;
426 int killed = 0;
427 int adv = 0;
428
429 twdr = (struct inet_timewait_death_row *)data;
430
431 spin_lock(&twdr->death_lock);
432 if (twdr->twcal_hand < 0)//再生超時定時器未設置或已經超時
433 goto out;
434
435 slot = twdr->twcal_hand;
436 j = twdr->twcal_jiffie; //獲取初次設置定時器的時間
437
438 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { //遍歷所有時隙
439 if (time_before_eq(j, now)) { //已經超時
440 struct hlist_node *safe;
441 struct inet_timewait_sock *tw;
442
443 inet_twsk_for_each_inmate_safe(tw, safe,
444 &twdr->twcal_row[slot]) { //遍歷一個時隙中的所有節點
445 __inet_twsk_del_dead_node(tw); //刪除定時節點
446 __inet_twsk_kill(tw, twdr->hashinfo);//將tw sock移出TCP ESTABLISH hash表
...
450 inet_twsk_put(tw);
451 killed++; //記錄已刪除的節點的數量
452 }
453 } else {//尚未超時
454 if (!adv) {
455 adv = 1;
456 twdr->twcal_jiffie = j; //更新尚未超時的時間起點
457 twdr->twcal_hand = slot; //更新尚未超時的時隙起點
458 }
459
460 if (!hlist_empty(&twdr->twcal_row[slot])) {
461 mod_timer(&twdr->twcal_timer, j);
462 goto out;
463 }
464 }
465 j += 1 << INET_TWDR_RECYCLE_TICK;
466 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); //進入下一個時隙
467 }
468 twdr->twcal_hand = -1; //標記再生定時器已經超時
469
470 out:
471 if ((twdr->tw_count -= killed) == 0)
472 del_timer(&twdr->tw_timer);
473 #ifndef CONFIG_NET_NS
474 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
475 #endif
476 spin_unlock(&twdr->death_lock);
477 }
439-451:再生定時器會將所有落入相同時隙(slot)的節點做同樣的對待,它的基本動作是超時則刪除,否則再次設置再生定時器
慢速定時器tw_timer對應的超時函數是inet_twdr_hangman:
262 void inet_twdr_hangman(unsigned long data)
263 {
264 struct inet_timewait_death_row *twdr;
265 unsigned int need_timer;
266
267 twdr = (struct inet_timewait_death_row *)data;
268 spin_lock(&twdr->death_lock);
269
270 if (twdr->tw_count == 0) //沒有tw_sock
271 goto out;
272
273 need_timer = 0;
274 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { //刪除慢速定時器鏈表中的節點及其對應的tw_sock
275 twdr->thread_slots |= (1 << twdr->slot); //將當前slot的值標記下來
276 schedule_work(&twdr->twkill_work); //若殺死了過多的tw_sock,則將沒有刪除完畢則將任務放入工作者隊列中由工作者進程完成
277 need_timer = 1;
278 } else { //沒有殺死過多的tw_sock
279 /* We purged the entire slot, anything left? */
280 if (twdr->tw_count) //還有tw_sock
281 need_timer = 1; //還要繼續設置tw_timer
282 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); //進入下一個slot
283 }
284 if (need_timer)
285 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
286 out:
287 spin_unlock(&twdr->death_lock);
288
inet_twdr_hangman每次超時只處理一個slot,然後再設置tw_timer在經過twdr->period的時間後再超時處理下一個slot。由於相鄰slot的超時時間差正好是一個twdr->period,故所有slot都能得到及時的處理。
inet_twdr_do_twkill_work函數刪除慢速定時器鏈表中的節點及其對應的tw_sock:
215 static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
216 const int slot)
217 {
218 struct inet_timewait_sock *tw;
219 unsigned int killed;
220 int ret;
221
222 /* NOTE: compare this to previous version where lock
223 * was released after detaching chain. It was racy,
224 * because tw buckets are scheduled in not serialized context
225 * in 2.3 (with netfilter), and with softnet it is common, because
226 * soft irqs are not sequenced.
227 */
228 killed = 0;
229 ret = 0;
230 rescan:
231 inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { //遍歷慢速超時隊列
232 __inet_twsk_del_dead_node(tw);
233 spin_unlock(&twdr->death_lock);
234 __inet_twsk_kill(tw, twdr->hashinfo);
235 #ifdef CONFIG_NET_NS
236 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
237 #endif
238 inet_twsk_put(tw);
239 killed++;
240 spin_lock(&twdr->death_lock);
241 if (killed > INET_TWDR_TWKILL_QUOTA) { //殺戮過重
242 ret = 1;
243 break;
244 }
245
246 /* While we dropped twdr->death_lock, another cpu may have
247 * killed off the next TW bucket in the list, therefore
248 * do a fresh re-read of the hlist head node with the
249 * lock reacquired. We still use the hlist traversal
250 * macro in order to get the prefetches.
251 */
252 goto rescan;
253 }
254
255 twdr->tw_count -= killed;
256 #ifndef CONFIG_NET_NS
257 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
258 #endif
259 return ret;
260 }
inet_twdr_twkill_work函數是twdr->twkill_work對應的工作者線程處理函數,用於將inet_twdr_do_twkill_work函數未完成的屠殺進行到底:
291 void inet_twdr_twkill_work(struct work_struct *work)
292 {
293 struct inet_timewait_death_row *twdr =
294 container_of(work, struct inet_timewait_death_row, twkill_work);
295 int i;
296
297 BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
298 (sizeof(twdr->thread_slots) * 8));
299
300 while (twdr->thread_slots) {
301 spin_lock_bh(&twdr->death_lock);
302 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
303 if (!(twdr->thread_slots & (1 << i))) //slot i不需要處理
304 continue;
305
306 while (inet_twdr_do_twkill_work(twdr, i) != 0) { //循環一直到殺光爲止
307 if (need_resched()) {
308 spin_unlock_bh(&twdr->death_lock);
309 schedule();
310 spin_lock_bh(&twdr->death_lock);
311 }
312 }
313
314 twdr->thread_slots &= ~(1 << i); //已經殺光此slot了
315 }
316 spin_unlock_bh(&twdr->death_lock);
317 }
318 }
問題:慢速定時器超時時如果釋放的tw_sock超出限制爲什麼要將任務轉移到工作者線程中完成呢?
答案(個人理解):Linux定時器是在軟中斷上下文執行,如果運行時間過長會導致當前CPU的其它任務無法執行,有違公平性。而工作者線程的優先級較低,運行的時間長一點也沒關係。