9.10.1 Why
當socket進入TIME_WAIT狀態後,TIME_WAIT定時器啓動。在超時之前,替代socket的tw sock會處理舊連接中的包,阻止其危害新連接。定時器超時後,tw sock被刪除,並釋放其佔用的端口號。
9.10.2 When
TIME_WAIT定時器的安裝由tcp_time_wait函數完成,調用tcp_time_wait函數的時機有:
(1)在TCP_FIN_WAIT2狀態下socket關閉,沒有用TCP_LINGER2選項將tp->linger2設置爲小於0且tcp_fin_time的大小小於等於TCP_TIMEWAIT_LEN:
2059 void tcp_close(struct sock *sk, long timeout)
2060 {
...
2183 if (sk->sk_state == TCP_FIN_WAIT2) {
2184 struct tcp_sock *tp = tcp_sk(sk);
2185 if (tp->linger2 < 0) {
...
2190 } else {
2191 const int tmo = tcp_fin_time(sk);
2192
2193 if (tmo > TCP_TIMEWAIT_LEN) {
2194 inet_csk_reset_keepalive_timer(sk,
2195 tmo - TCP_TIMEWAIT_LEN);
2196 } else {
2197 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2198 goto out;
2199 }
2200 }
...
(2)TCP_FIN_WAIT2狀態下收到FIN併發送ACK後:
3783 static void tcp_fin(struct sock *sk)
3784 {
...
3818 case TCP_FIN_WAIT2:
3819 /* Received a FIN -- send ACK and enter TIME_WAIT. */
3820 tcp_send_ack(sk);
3821 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
(3)孤兒socket在TCP_FIN_WAIT1狀態下收到ACK時,滿足:
1)沒有用TCP_LINGER2選項將tp->linger2設置爲小於0
2)tcp_fin_time的大小小於等於TCP_TIMEWAIT_LEN:
3)ACK中沒有數據或數據全是舊的
4)ACK中沒有FIN標記並且socket沒有被應用進程鎖定
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5751 case TCP_FIN_WAIT1:
...
5780 if (!sock_flag(sk, SOCK_DEAD))
5781 /* Wake up lingering close() */
5782 sk->sk_state_change(sk);
5783 else {
5784 int tmo;
5785
5786 if (tp->linger2 < 0 ||
5787 (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
5788 after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
5789 tcp_done(sk);
5790 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
5791 return 1;
5792 }
5793
5794 tmo = tcp_fin_time(sk);
5795 if (tmo > TCP_TIMEWAIT_LEN) {
5796 inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
5797 } else if (th->fin || sock_owned_by_user(sk)) {
5798 /* Bad case. We could lose such FIN otherwise.
5799 * It is not a big problem, but it looks confusing
5800 * and not so rare event. We still can lose it now,
5801 * if it spins in bh_lock_sock(), but it is really
5802 * marginal case.
5803 */
5804 inet_csk_reset_keepalive_timer(sk, tmo);
5805 } else {
5806 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
5807 goto discard;
5808 }
...
(4)TCP在TCP_CLOSING狀態下收到ACK時:
5600 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5601 const struct tcphdr *th, unsigned int len)
5602 {
...
5813 case TCP_CLOSING:
5814 if (tp->snd_una == tp->write_seq) {
5815 tcp_time_wait(sk, TCP_TIME_WAIT, 0);
5816 goto discard;
5817 }
...
(5)FIN_WAIT2定時器超時時,沒有用TCP_LINGER2選項將tp->linger2設置爲小於0且tcp_fin_time的大小大於TCP_TIMEWAIT_LEN:
558 static void tcp_keepalive_timer (unsigned long data)
559 {
...
578 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
579 if (tp->linger2 >= 0) {
580 const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
581
582 if (tmo > 0) {
583 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
584 goto out;
585 }
...
tcp_time_wait函數會調用inet_twsk_schedule函數安裝TIME_WAIT定時器:
266 void tcp_time_wait(struct sock *sk, int state, int timeo)
267 {
...
327 __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); //將tw sock放入ESTABLISHED hash表和bind hash表中,將sk從ESTABLISHED hash表中移除
328
329 /* Get the TIME_WAIT timeout firing. */
330 if (timeo < rto)
331 timeo = rto;
332
333 if (recycle_ok) {
334 tw->tw_timeout = rto;
335 } else {
336 tw->tw_timeout = TCP_TIMEWAIT_LEN;
337 if (state == TCP_TIME_WAIT)
338 timeo = TCP_TIMEWAIT_LEN;
339 }
340
341 inet_twsk_schedule(tw, &tcp_death_row, timeo,
342 TCP_TIMEWAIT_LEN);
343 inet_twsk_put(tw);
...
__inet_twsk_hashdance函數將tw_sock加入到bind hash表和ESTABLISHED表中,這樣在tw_sock被刪除之前相應IP|端口不允許bind,也不允許建立:
126 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
127 struct inet_hashinfo *hashinfo)
128 {
129 const struct inet_sock *inet = inet_sk(sk);
130 const struct inet_connection_sock *icsk = inet_csk(sk);
131 struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
132 spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
133 struct inet_bind_hashbucket *bhead;
134 /* Step 1: Put TW into bind hash. Original socket stays there too.
135 Note, that any socket with inet->num != 0 MUST be bound in
136 binding cache, even if it is closed.
137 */
138 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
139 hashinfo->bhash_size)];
140 spin_lock(&bhead->lock);
141 tw->tw_tb = icsk->icsk_bind_hash;
142 WARN_ON(!icsk->icsk_bind_hash);
143 inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); //加入到bind hash表中
144 spin_unlock(&bhead->lock);
145
146 spin_lock(lock);
...
153 inet_twsk_add_node_rcu(tw, &ehead->twchain); //加入到ESTABLISHED hash表中
154
155 /* Step 3: Remove SK from established hash. */
156 if (__sk_nulls_del_node_init_rcu(sk))
157 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
...
167 atomic_add(1 + 1 + 1, &tw->tw_refcnt);
168
169 spin_unlock(lock);
170 }
這樣,在應用進程使用bind系統調用綁定與tw_sock相同的IP|端口對時內核會用到inet_csk_bind_conflict函數,但由於成功匹配到bind hash表中的tw_sock,會導致衝突,無法bind(詳見2.2 Bind系統調用)。而在建立連接時,inet_hash_connect函數會調用__inet_check_established檢查即將建立的連接是否與已建立的連接衝突:
311 static int __inet_check_established(struct inet_timewait_death_row *death_row,
312 struct sock *sk, __u16 lport,
313 struct inet_timewait_sock **twp)
314 {
...
335 sk_nulls_for_each(sk2, node, &head->twchain) {
336 if (sk2->sk_hash != hash)
337 continue;
338
339 if (likely(INET_TW_MATCH(sk2, net, acookie,
340 saddr, daddr, ports, dif))) { //地址|端口匹配
341 tw = inet_twsk(sk2);
342 if (twsk_unique(sk, sk2, twp)) //調用tcp_twsk_unique判斷是否衝突
343 goto unique; //不衝突
344 else
345 goto not_unique; //衝突
346 }
347 }
348 tw = NULL;
...
359 unique:
...
376 if (twp) {
377 *twp = tw; //交給調用者處理
378 } else if (tw) {
379 /* Silly. Should hash-dance instead... */
380 inet_twsk_deschedule(tw, death_row);
381
382 inet_twsk_put(tw);
383 }
384 return 0;
385
386 not_unique:
387 spin_unlock(lock);
388 return -EADDRNOTAVAIL;
389 }
tcp_twsk_unique函數:
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 struct tcp_sock *tp = tcp_sk(sk);
...
125 if (tcptw->tw_ts_recent_stamp && //開啓時間戳選項且在TIME_WAIT狀態下收到過包
126 (twp == NULL || (sysctl_tcp_tw_reuse &&
127 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 if (tp->write_seq == 0)
130 tp->write_seq = 1;
131 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 sock_hold(sktw);
134 return 1;
135 }
136
137 return 0;
138 }
可見,當:
(1)__inet_check_established函數的調用者不需要返回tw_sock的時候(即twp == NULL爲真),或
(2)應用進程設置了net.ipv4.tcp_tw_reuse內核選項允許tw_sock重用時,
tcp_twsk_unique函數會返回1,即不衝突。不衝突時如果是(1),則__inet_check_established函數會釋放tw_sock;否則會將tw_sock返回給調用者inet_hash_connect函數處理。在不衝突時,情況(1)發生時到底意味着什麼?情況(1)沒有發生時inet_hash_connect函數用tw_sock幹什麼?來看代碼:
589 int inet_hash_connect(struct inet_timewait_death_row *death_row,
590 struct sock *sk)
591 {
592 return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
593 __inet_check_established, __inet_hash_nolisten);
594 }
看來__inet_check_established函數的使用者是__inet_hash_connect函數:
477 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
478 struct sock *sk, u32 port_offset,
479 int (*check_established)(struct inet_timewait_death_row *,
480 struct sock *, __u16, struct inet_timewait_sock **),
481 int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
482 {
...
491 if (!snum) {
...
520 if (!check_established(death_row, sk,
521 port, &tw))
522 goto ok;
...
544 ok:
545 hint += i;
546
547 /* Head lock still held and bh's disabled */
548 inet_bind_hash(sk, tb, port);
549 if (sk_unhashed(sk)) {
550 inet_sk(sk)->inet_sport = htons(port);
551 twrefcnt += hash(sk, tw); //將sk加入到ESTABLISHED hash表中,將tw_sock從這個表中摘出
552 }
553 if (tw)
554 twrefcnt += inet_twsk_bind_unhash(tw, hinfo); //將tw_sock從bind hash表中摘出
555 spin_unlock(&head->lock);
556
557 if (tw) {
558 inet_twsk_deschedule(tw, death_row); //釋放tw_sock
559 while (twrefcnt) {
560 twrefcnt--;
561 inet_twsk_put(tw);
562 }
563 }
564
565 ret = 0;
566 goto out;
567 }
568
569 head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
570 tb = inet_csk(sk)->icsk_bind_hash;
571 spin_lock_bh(&head->lock);
572 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { //綁定到這個IP|port對的只有這一個socket
573 hash(sk, NULL);
574 spin_unlock_bh(&head->lock);
575 return 0;
576 } else {
577 spin_unlock(&head->lock);
578 /* No definite answer... Walk to established hash table */
579 ret = check_established(death_row, sk, snum, NULL);
580 out:
581 local_bh_enable();
582 return ret;
583 }
584 }
要綁定的端口非0時,情況(1)纔會發生,這時意味着應用進程在調用connect系統調用之前已經成功地使用了bind系統調用,既然bind時不衝突,那麼在connect時直接將tw_sock釋放即可。而情況(1)沒有發生時,tw_sock也會被釋放並從hash表中摘出。
tcp_death_row的定義爲:
35 struct inet_timewait_death_row tcp_death_row = {
36 .sysctl_max_tw_buckets = NR_FILE * 2,
37 .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
38 .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
39 .hashinfo = &tcp_hashinfo,
40 .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
41 (unsigned long)&tcp_death_row),
42 .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
43 inet_twdr_twkill_work),
44 /* Short-time timewait calendar */
45
46 .twcal_hand = -1,
47 .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
48 (unsigned long)&tcp_death_row),
49 };
inet_twsk_schedule函數:
340 void inet_twsk_schedule(struct inet_timewait_sock *tw,
341 struct inet_timewait_death_row *twdr,
342 const int timeo, const int timewait_len)
343 {
344 struct hlist_head *list;
345 int slot;
346
... //計算tw sock加入到time_wait定時器鏈表中的位置,slot越大則超時時間越長
371 slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
372
373 spin_lock(&twdr->death_lock);
374
375 /* Unlink it, if it was scheduled */
376 if (inet_twsk_del_dead_node(tw))//已經在time_wait定時器鏈表中了,則摘除
377 twdr->tw_count--;
378 else
379 atomic_inc(&tw->tw_refcnt);
380
381 if (slot >= INET_TWDR_RECYCLE_SLOTS) { //超時時間過長,使用慢速定時器
382 /* Schedule to slow timer */
383 if (timeo >= timewait_len) {
384 slot = INET_TWDR_TWKILL_SLOTS - 1;
385 } else {
386 slot = DIV_ROUND_UP(timeo, twdr->period);
387 if (slot >= INET_TWDR_TWKILL_SLOTS)
388 slot = INET_TWDR_TWKILL_SLOTS - 1;
389 }
390 tw->tw_ttd = jiffies + timeo;
391 slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
392 list = &twdr->cells[slot]; //添加tw_sock到twdr->cells中
393 } else { //超時時間短的都放入再生定時器中
394 tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK);
395
396 if (twdr->twcal_hand < 0) { //再生定時器未設置或已經超時
397 twdr->twcal_hand = 0;
398 twdr->twcal_jiffie = jiffies; //記錄初次設置定時器的時間
399 twdr->twcal_timer.expires = twdr->twcal_jiffie +
400 (slot << INET_TWDR_RECYCLE_TICK);
401 add_timer(&twdr->twcal_timer);//設置再生定時器
402 } else {
403 if (time_after(twdr->twcal_timer.expires,
404 jiffies + (slot << INET_TWDR_RECYCLE_TICK))) //再生定時器未超時
405 mod_timer(&twdr->twcal_timer,
406 jiffies + (slot << INET_TWDR_RECYCLE_TICK));//設置再生超時定時器
407 slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
408 }
409 list = &twdr->twcal_row[slot]; //添加tw_sock到twdr->twcal_row中
410 }
411
412 hlist_add_head(&tw->tw_death_node, list);//加入到time_wait定時器鏈表中
413
414 if (twdr->tw_count++ == 0)//加入之前time_wait定時器鏈表中沒有成員
415 mod_timer(&twdr->tw_timer, jiffies + twdr->period); //設置慢速定時器
416 spin_unlock(&twdr->death_lock);
417 }
371:按照超時時間長短劃分slot:0 jiffies爲slot 0,1-2^INET_TWDR_RECYCLE_TICK jiffies爲slot 1,2^INET_TWDR_RECYCLE_TICK + 1 -2^(INET_TWDR_RECYCLE_TICK + 1)爲slot 2...每個slot 的時間長度是2^INET_TWDR_RECYCLE_TICK個jiffies。
386:按照超時時間長短劃分slot,每個slot的時間長度是twdr->period。
可見TIME_WAIT定時器包含2個定時器結構:twcal_timer和tw_timer。其中twcal_timer的超時時間較短,被稱爲“再生定時器”。
tw_timer的超時時間是TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS(即7.5s),刪除的條件有:
(1)在應用進程使用connect系統調用綁定IP|端口時匹配到tw_sock,但判定不衝突時(詳見__inet_twsk_hashdance函數相關分析);
(2)調用inet_twsk_deschedule刪除一個tw_sock,如果tw隊列中沒有成員,則禁用tw_timer:
326 void inet_twsk_deschedule(struct inet_timewait_sock *tw,
327 struct inet_timewait_death_row *twdr)
328 {
329 spin_lock(&twdr->death_lock);
330 if (inet_twsk_del_dead_node(tw)) {
331 inet_twsk_put(tw);
332 if (--twdr->tw_count == 0) //tw隊列爲空
333 del_timer(&twdr->tw_timer); //刪除tw_timer
334 }
335 spin_unlock(&twdr->death_lock);
336 __inet_twsk_kill(tw, twdr->hashinfo);
337 }
__inet_twsk_kill會將tw_sock從bind hash表和ESTABLISHED hash表中刪除:
70 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
71 struct inet_hashinfo *hashinfo)
72 {
73 struct inet_bind_hashbucket *bhead;
74 int refcnt;
75 /* Unlink from established hashes. */
76 spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
77
78 spin_lock(lock);
79 refcnt = inet_twsk_unhash(tw); //從ESTABLISHED hash表中刪除
80 spin_unlock(lock);
81
82 /* Disassociate with bind bucket. */
83 bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
84 hashinfo->bhash_size)];
85
86 spin_lock(&bhead->lock);
87 refcnt += inet_twsk_bind_unhash(tw, hashinfo); //從bind hash表中刪除
88 spin_unlock(&bhead->lock);
89
90 #ifdef SOCK_REFCNT_DEBUG
91 if (atomic_read(&tw->tw_refcnt) != 1) {
92 pr_debug("%s timewait_sock %p refcnt=%d\n",
93 tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
94 }
95 #endif
96 while (refcnt) {
97 inet_twsk_put(tw);
98 refcnt--;
99 }
100 }
(3)twcal_timer超時時調用inet_twdr_twcal_tick刪除tw_sock,如果tw隊列中沒有成員,則禁用tw_timer.
再生定時器不會被刪除,其超時時間爲slot * 2^INET_TWDR_RECYCLE_TICK。INET_TWDR_RECYCLE_TICK的定義如下:
41 #if HZ <= 16 || HZ > 4096
42 # error Unsupported: HZ <= 16 or HZ > 4096
43 #elif HZ <= 32
44 # define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
45 #elif HZ <= 64
46 # define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
47 #elif HZ <= 128
48 # define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
49 #elif HZ <= 256
50 # define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
51 #elif HZ <= 512
52 # define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
53 #elif HZ <= 1024
54 # define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
55 #elif HZ <= 2048
56 # define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
57 #else
58 # define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
59 #endif
如果jiffies每1ms加1,則INET_TWDR_RECYCLE_TICK的值爲7;如果timeo的值爲60s(通常是最大值),則slot的值爲469,那麼再生定時器的最大超時時間爲60s.如果1ms <= timeo <= 128ms,則slot = 1,再生定時器的最小超時時間爲127ms.
9.10.3 What
twcal_timer對應的超時函數是inet_twdr_twcal_tick:
420 void inet_twdr_twcal_tick(unsigned long data)
421 {
422 struct inet_timewait_death_row *twdr;
423 int n, slot;
424 unsigned long j;
425 unsigned long now = jiffies;
426 int killed = 0;
427 int adv = 0;
428
429 twdr = (struct inet_timewait_death_row *)data;
430
431 spin_lock(&twdr->death_lock);
432 if (twdr->twcal_hand < 0)//再生超時定時器未設置或已經超時
433 goto out;
434
435 slot = twdr->twcal_hand;
436 j = twdr->twcal_jiffie; //獲取初次設置定時器的時間
437
438 for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { //遍歷所有時隙
439 if (time_before_eq(j, now)) { //已經超時
440 struct hlist_node *safe;
441 struct inet_timewait_sock *tw;
442
443 inet_twsk_for_each_inmate_safe(tw, safe,
444 &twdr->twcal_row[slot]) { //遍歷一個時隙中的所有節點
445 __inet_twsk_del_dead_node(tw); //刪除定時節點
446 __inet_twsk_kill(tw, twdr->hashinfo);//將tw sock移出TCP ESTABLISH hash表
...
450 inet_twsk_put(tw);
451 killed++; //記錄已刪除的節點的數量
452 }
453 } else {//尚未超時
454 if (!adv) {
455 adv = 1;
456 twdr->twcal_jiffie = j; //更新尚未超時的時間起點
457 twdr->twcal_hand = slot; //更新尚未超時的時隙起點
458 }
459
460 if (!hlist_empty(&twdr->twcal_row[slot])) {
461 mod_timer(&twdr->twcal_timer, j);
462 goto out;
463 }
464 }
465 j += 1 << INET_TWDR_RECYCLE_TICK;
466 slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); //進入下一個時隙
467 }
468 twdr->twcal_hand = -1; //標記再生定時器已經超時
469
470 out:
471 if ((twdr->tw_count -= killed) == 0)
472 del_timer(&twdr->tw_timer);
473 #ifndef CONFIG_NET_NS
474 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
475 #endif
476 spin_unlock(&twdr->death_lock);
477 }
439-451:再生定時器會將所有落入相同時隙(slot)的節點做同樣的對待,它的基本動作是超時則刪除,否則再次設置再生定時器
慢速定時器tw_timer對應的超時函數是inet_twdr_hangman:
262 void inet_twdr_hangman(unsigned long data)
263 {
264 struct inet_timewait_death_row *twdr;
265 unsigned int need_timer;
266
267 twdr = (struct inet_timewait_death_row *)data;
268 spin_lock(&twdr->death_lock);
269
270 if (twdr->tw_count == 0) //沒有tw_sock
271 goto out;
272
273 need_timer = 0;
274 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { //刪除慢速定時器鏈表中的節點及其對應的tw_sock
275 twdr->thread_slots |= (1 << twdr->slot); //將當前slot的值標記下來
276 schedule_work(&twdr->twkill_work); //若殺死了過多的tw_sock,則將沒有刪除完畢則將任務放入工作者隊列中由工作者進程完成
277 need_timer = 1;
278 } else { //沒有殺死過多的tw_sock
279 /* We purged the entire slot, anything left? */
280 if (twdr->tw_count) //還有tw_sock
281 need_timer = 1; //還要繼續設置tw_timer
282 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); //進入下一個slot
283 }
284 if (need_timer)
285 mod_timer(&twdr->tw_timer, jiffies + twdr->period);
286 out:
287 spin_unlock(&twdr->death_lock);
288
inet_twdr_hangman每次超時只處理一個slot,然後再設置tw_timer在經過twdr->period的時間後再超時處理下一個slot。由於相鄰slot的超時時間差正好是一個twdr->period,故所有slot都能得到及時的處理。
inet_twdr_do_twkill_work函數刪除慢速定時器鏈表中的節點及其對應的tw_sock:
215 static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
216 const int slot)
217 {
218 struct inet_timewait_sock *tw;
219 unsigned int killed;
220 int ret;
221
222 /* NOTE: compare this to previous version where lock
223 * was released after detaching chain. It was racy,
224 * because tw buckets are scheduled in not serialized context
225 * in 2.3 (with netfilter), and with softnet it is common, because
226 * soft irqs are not sequenced.
227 */
228 killed = 0;
229 ret = 0;
230 rescan:
231 inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) { //遍歷慢速超時隊列
232 __inet_twsk_del_dead_node(tw);
233 spin_unlock(&twdr->death_lock);
234 __inet_twsk_kill(tw, twdr->hashinfo);
235 #ifdef CONFIG_NET_NS
236 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
237 #endif
238 inet_twsk_put(tw);
239 killed++;
240 spin_lock(&twdr->death_lock);
241 if (killed > INET_TWDR_TWKILL_QUOTA) { //殺戮過重
242 ret = 1;
243 break;
244 }
245
246 /* While we dropped twdr->death_lock, another cpu may have
247 * killed off the next TW bucket in the list, therefore
248 * do a fresh re-read of the hlist head node with the
249 * lock reacquired. We still use the hlist traversal
250 * macro in order to get the prefetches.
251 */
252 goto rescan;
253 }
254
255 twdr->tw_count -= killed;
256 #ifndef CONFIG_NET_NS
257 NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
258 #endif
259 return ret;
260 }
inet_twdr_twkill_work函數是twdr->twkill_work對應的工作者線程處理函數,用於將inet_twdr_do_twkill_work函數未完成的屠殺進行到底:
291 void inet_twdr_twkill_work(struct work_struct *work)
292 {
293 struct inet_timewait_death_row *twdr =
294 container_of(work, struct inet_timewait_death_row, twkill_work);
295 int i;
296
297 BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
298 (sizeof(twdr->thread_slots) * 8));
299
300 while (twdr->thread_slots) {
301 spin_lock_bh(&twdr->death_lock);
302 for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
303 if (!(twdr->thread_slots & (1 << i))) //slot i不需要處理
304 continue;
305
306 while (inet_twdr_do_twkill_work(twdr, i) != 0) { //循環一直到殺光爲止
307 if (need_resched()) {
308 spin_unlock_bh(&twdr->death_lock);
309 schedule();
310 spin_lock_bh(&twdr->death_lock);
311 }
312 }
313
314 twdr->thread_slots &= ~(1 << i); //已經殺光此slot了
315 }
316 spin_unlock_bh(&twdr->death_lock);
317 }
318 }
問題:慢速定時器超時時如果釋放的tw_sock超出限制爲什麼要將任務轉移到工作者線程中完成呢?
答案(個人理解):Linux定時器是在軟中斷上下文執行,如果運行時間過長會導致當前CPU的其它任務無法執行,有違公平性。而工作者線程的優先級較低,運行的時間長一點也沒關係。