10.3 Receive Buffer Management

After TCP receives data from the peer, it usually cannot deliver it to the application immediately. Until the application reads it, the data must be held in the receive buffer. If the application consumes data more slowly than TCP receives it from the peer, the amount of data in the receive buffer keeps growing. Before an skb is placed into the receive buffer, TCP must therefore check how much memory the buffer can still accommodate, and drop the skb if the limit would be exceeded.
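The limit enforced here is the socket's receive buffer size, sk->sk_rcvbuf. As a quick aside before diving into the kernel code, an application can inspect or pin this limit from user space with SO_RCVBUF. The sketch below is a minimal illustration (the helper name and the 256 KB figure are arbitrary), not part of the kernel paths discussed in this section:

#include <stdio.h>
#include <sys/socket.h>

/* Minimal sketch: inspect and pin a TCP socket's receive buffer limit. */
static void tune_rcvbuf(int fd)
{
    int val = 0;
    socklen_t len = sizeof(val);

    /* Read the current limit; this corresponds to sk->sk_rcvbuf. */
    if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len) == 0)
        printf("current sk_rcvbuf: %d bytes\n", val);

    /* Request 256 KB. The kernel stores twice the requested value (to
     * cover skb->truesize overhead) and sets the SOCK_RCVBUF_LOCK flag,
     * which disables the auto-tuning discussed later in this section. */
    val = 256 * 1024;
    if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)) != 0)
        perror("setsockopt(SO_RCVBUF)");
}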

10.3.1 Buffer Occupancy

tcp_rcv_established checks the usage of the receive buffer:

 5076 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5077             const struct tcphdr *th, unsigned int len)
5078 {           
5079     struct tcp_sock *tp = tcp_sk(sk);
...
5201             if (!eaten) {
...
5205                 if ((int)skb->truesize > sk->sk_forward_alloc) // the remaining pre-allocated memory cannot hold the skb
5206                     goto step5; // fall back to the slow path
...
5221                 /* Bulk data transfer: receiver */
5222                 eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
5223                               &fragstolen);
...
5265 step5:
...
5275     tcp_data_queue(sk, skb);
...
The tcp_queue_rcv function:

4244 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
4245           bool *fragstolen)
4246 {
4247     int eaten;
4248     struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
4249
4250     __skb_pull(skb, hdrlen);
4251     eaten = (tail &&
4252          tcp_try_coalesce(sk, tail, skb, fragstolen)) ? 1 : 0; // try to merge the payload into the tail skb
4253     tcp_sk(sk)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4254     if (!eaten) {
4255         __skb_queue_tail(&sk->sk_receive_queue, skb);
4256         skb_set_owner_r(skb, sk);      
4257     }
4258     return eaten;
4259 }
When coalescing fails, the skb is queued and charged to the socket by skb_set_owner_r:
1995 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
1996 {         
1997     skb_orphan(skb);
1998     skb->sk = sk;
1999     skb->destructor = sock_rfree;
2000     atomic_add(skb->truesize, &sk->sk_rmem_alloc);  // charge the skb's truesize to the receive-memory counter
2001     sk_mem_charge(sk, skb->truesize);    // sk->sk_forward_alloc -= size
2002 } 
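Together with sock_rfree (shown in section 10.3.2), skb_set_owner_r maintains two per-socket counters: sk_rmem_alloc, the memory currently held by queued skbs, and sk_forward_alloc, memory already granted by the global accounting but not yet consumed. The following simplified model (the struct and function names are hypothetical; only the arithmetic mirrors the kernel) shows how the two operations pair up:

/* Simplified model of receive-side memory accounting; "rx_acct" and
 * its fields are hypothetical stand-ins for the sock fields. */
struct rx_acct {
    int rmem_alloc;     /* models sk->sk_rmem_alloc    */
    int forward_alloc;  /* models sk->sk_forward_alloc */
};

/* Mirrors skb_set_owner_r: charge the skb to the socket. */
static void charge(struct rx_acct *a, int truesize)
{
    a->rmem_alloc += truesize;    /* held by the queued skb      */
    a->forward_alloc -= truesize; /* consumed from the pre-grant */
}

/* Mirrors sock_rfree: the skb was consumed and freed. */
static void uncharge(struct rx_acct *a, int truesize)
{
    a->rmem_alloc -= truesize;
    a->forward_alloc += truesize; /* returned to the pre-grant   */
}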
In the tcp_data_queue function:
4300 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4301 {
4302     const struct tcphdr *th = tcp_hdr(skb);
4303     struct tcp_sock *tp = tcp_sk(sk);
4304     int eaten = -1;
4305     bool fragstolen = false;
...
4344         if (eaten <= 0) {
4345 queue_and_out:
4346             if (eaten < 0 &&
4347                 tcp_try_rmem_schedule(sk, skb, skb->truesize)) // check whether skb->truesize bytes can be charged to the receive buffer
4348                 goto drop;
4349
4350             eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
4351         }
...
4415     tcp_data_queue_ofo(sk, skb);
4416 }
tcp_try_rmem_schedule tries to make room in the receive buffer for the incoming data:
4061 static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4062                  unsigned int size)             
4063 {
4064     if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||  // allocated receive memory already exceeds the limit
4065         !sk_rmem_schedule(sk, skb, size)) { // the receive buffer cannot accommodate size bytes
4066
4067         if (tcp_prune_queue(sk) < 0)   // compact the receive queue
4068             return -1;
4069
4070         if (!sk_rmem_schedule(sk, skb, size)) {  // re-check whether the buffer now suffices
4071             if (!tcp_prune_ofo_queue(sk))  // purge the out-of-order queue to free memory
4072                 return -1;
4073
4074             if (!sk_rmem_schedule(sk, skb, size))  // final check of the buffer space
4075                 return -1;
4076         }
4077     }
4078     return 0;
4079 }
sk_rmem_schedule checks whether enough buffer space is available:
1375 static inline bool
1376 sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
1377 {                
1378     if (!sk_has_account(sk))
1379         return true;
1380     return size <= sk->sk_forward_alloc ||  // the remaining pre-allocated memory suffices
1381         __sk_mem_schedule(sk, size, SK_MEM_RECV) || // grow the pre-allocated and globally allocated memory
1382         skb_pfmemalloc(skb);  // the skb was allocated from PFMEMALLOC emergency reserves
1383 }         
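sk_forward_alloc is replenished in whole units of SK_MEM_QUANTUM (one page, 4096 bytes on most architectures). A worked example: with sk_forward_alloc = 1024 and an incoming skb of truesize 2304, the first condition fails, so __sk_mem_schedule is asked for 2304 bytes; it rounds up to one quantum, adds one page to the protocol's global allocated-memory counter, and raises sk_forward_alloc to 1024 + 4096 = 5120. The subsequent charge of 2304 bytes then leaves 2816 bytes of pre-allocation for later segments.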
tcp_prune_queue and tcp_prune_ofo_queue compact the receive queue and the out-of-order queue, respectively:
4594 static bool tcp_prune_ofo_queue(struct sock *sk)
4595 {
4596     struct tcp_sock *tp = tcp_sk(sk);
4597     bool res = false;
4598 
4599     if (!skb_queue_empty(&tp->out_of_order_queue)) {
4600         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
4601         __skb_queue_purge(&tp->out_of_order_queue);  // free every skb in the out-of-order queue
4602 
4603         /* Reset SACK state.  A conforming SACK implementation will
4604          * do the same at a timeout based retransmit.  When a connection
4605          * is in a sad state like this, we care only about integrity
4606          * of the connection not performance.
4607          */
4608         if (tp->rx_opt.sack_ok)
4609             tcp_sack_reset(&tp->rx_opt);
4610         sk_mem_reclaim(sk);  // update the buffer accounting
4611         res = true;
4612     }
4613     return res;
4614 }
...
4623 static int tcp_prune_queue(struct sock *sk)
4624 {       
4625     struct tcp_sock *tp = tcp_sk(sk);
4626
4627     SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
4628
4629     NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
4630     
4631     if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
4632         tcp_clamp_window(sk);  // try to shrink the receive buffer and the maximum advertised window
4633     else if (sk_under_memory_pressure(sk))
4634         tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); // shrink the maximum advertised window
4635
4636     tcp_collapse_ofo_queue(sk);  // merge contiguous blocks in the out-of-order queue to save space
4637     if (!skb_queue_empty(&sk->sk_receive_queue))
4638         tcp_collapse(sk, &sk->sk_receive_queue,
4639                  skb_peek(&sk->sk_receive_queue),
4640                  NULL,
4641                  tp->copied_seq, tp->rcv_nxt); // merge the unread data in the receive queue
4642     sk_mem_reclaim(sk);  // update the buffer accounting
4643
4644     if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) // the receive buffer is sufficient again
4645         return 0;
4646
4647     /* Collapsing did not help, destructive actions follow.
4648      * This must not ever occur. */
4649     
4650     tcp_prune_ofo_queue(sk); // purge the out-of-order queue
...
4662     tp->pred_flags = 0;  // memory is tight: disable the fast path
4663     return -1;
4664 }
tcp_clamp_window updates the maximum advertised window:
 410 static void tcp_clamp_window(struct sock *sk)
 411 {   
 412     struct tcp_sock *tp = tcp_sk(sk);
 413     struct inet_connection_sock *icsk = inet_csk(sk);
 414     
 415     icsk->icsk_ack.quick = 0;
 416
 417     if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&  // the receive buffer is below the global maximum
 418         !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&  // the application has not pinned the buffer size via SO_RCVBUF
 419         !sk_under_memory_pressure(sk) &&  // not under memory pressure
 420         sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { // global TCP memory is below the low threshold
 421         sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 422                     sysctl_tcp_rmem[2]);
 423     }
 424     if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 425         tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); // still over the limit: clamp the advertised window hard
 426 }
When enough unused pre-allocated memory remains, sk_mem_reclaim returns part of it to the global pool:
1385 static inline void sk_mem_reclaim(struct sock *sk)
1386 {   
1387     if (!sk_has_account(sk))
1388         return;
1389     if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
1390         __sk_mem_reclaim(sk);
1391 }       
        __sk_mem_reclaim:
2005 void __sk_mem_reclaim(struct sock *sk)
2006 {       
2007     sk_memory_allocated_sub(sk,
2008                 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);  // return whole quanta to the global counter
2009     sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;  // keep only the sub-quantum remainder
2010     
2011     if (sk_under_memory_pressure(sk) &&
2012         (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2013         sk_leave_memory_pressure(sk);
2014 }
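A worked example of the reclaim arithmetic, assuming 4 KB pages (SK_MEM_QUANTUM_SHIFT = 12): with sk_forward_alloc = 9000, the function returns 9000 >> 12 = 2 pages to the global counter and keeps 9000 & 4095 = 808 bytes locally, so after a reclaim the socket never hoards more than one quantum of idle pre-allocation.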
When TCP calls tcp_data_queue_ofo to place an skb on the out-of-order queue, it likewise uses skb_set_owner_r to update the buffer accounting:
4121 static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4122 {       
4123     struct tcp_sock *tp = tcp_sk(sk);
4124     struct sk_buff *skb1;
4125     u32 seq, end_seq;
...
4239 end:
4240     if (skb)
4241         skb_set_owner_r(skb, sk);
4242 }

10.3.2 Buffer Release

After the application has read the data in tcp_recvmsg, the corresponding skbs in the receive buffer are freed. Freeing such an skb invokes sock_rfree, the destructor installed by skb_set_owner_r:

1560 void sock_rfree(struct sk_buff *skb)
1561 {            
1562     struct sock *sk = skb->sk;
1563     unsigned int len = skb->truesize;
1564 
1565     atomic_sub(len, &sk->sk_rmem_alloc);
1566     sk_mem_uncharge(sk, len);  // return the memory to the pre-allocation: sk->sk_forward_alloc += len
1567 }
After the application copies data out of the receive queue, tcp_rcv_space_adjust re-estimates how much data the application consumes per RTT and grows the receive buffer accordingly:
 522 void tcp_rcv_space_adjust(struct sock *sk)
 523 {
 524     struct tcp_sock *tp = tcp_sk(sk);
 525     int time;
 526     int space;
 527
 528     if (tp->rcvq_space.time == 0)
 529         goto new_measure;
 530
 531     time = tcp_time_stamp - tp->rcvq_space.time;
 532     if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
 533         return;
 534
 535     space = 2 * (tp->copied_seq - tp->rcvq_space.seq);  // twice the data consumed during the last RTT
 536
 537     space = max(tp->rcvq_space.space, space);
 538
 539     if (tp->rcvq_space.space != space) {  // new data has been copied out by the application
 540         int rcvmem;
 541
 542         tp->rcvq_space.space = space;  
 543
 544         if (sysctl_tcp_moderate_rcvbuf &&
 545             !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
 546             int new_clamp = space;     
 547
 548             /* Receive space grows, normalize in order to
 549              * take into account packet headers and sk_buff
 550              * structure overhead.         
 551              */
 552             space /= tp->advmss;
 553             if (!space)
 554                 space = 1;
 555             rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
 556             while (tcp_win_from_space(rcvmem) < tp->advmss)
 557                 rcvmem += 128;
 558             space *= rcvmem;
 559             space = min(space, sysctl_tcp_rmem[2]);
 560             if (space > sk->sk_rcvbuf) {
 561                 sk->sk_rcvbuf = space;
 562
 563                 /* Make the window clamp follow along.  */
 564                 tp->window_clamp = new_clamp;
 565             }
 566         }
 567     }
 568
 569 new_measure:
 570     tp->rcvq_space.seq = tp->copied_seq;
 571     tp->rcvq_space.time = tcp_time_stamp;
 572 }
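A hedged numeric walk-through of this auto-tuning, assuming advmss = 1448 and a truesize of roughly 2304 bytes per full segment: if the application copied 256 KB during the last receive RTT, then space = 2 * 262144 = 524288 bytes; normalized, that is 524288 / 1448 = 362 segments, and with rcvmem ≈ 2304 the candidate buffer becomes 362 * 2304 ≈ 834 KB. sk_rcvbuf (and window_clamp) grows toward that value as long as it stays below sysctl_tcp_rmem[2], tcp_moderate_rcvbuf is enabled, and the application has not pinned the buffer with SO_RCVBUF. The factor of two exists because a sender in slow start can double its rate within one RTT, so the receiver provisions ahead of demand.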
Having covered the receive queue and the out-of-order queue, let us now look at buffer management for the other receive queues: the prequeue, the asynchronous wait queue, and the backlog queue.

10.3.3 Other Queues

(1) The prequeue:

1919 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1920 {            
1921     struct tcp_sock *tp = tcp_sk(sk);
1922     
1923     if (sysctl_tcp_low_latency || !tp->ucopy.task)
1924         return false;
1925     
1926     if (skb->len <= tcp_hdrlen(skb) &&
1927         skb_queue_len(&tp->ucopy.prequeue) == 0)
1928         return false;
1929 
1930     skb_dst_force(skb);
1931     __skb_queue_tail(&tp->ucopy.prequeue, skb);
1932     tp->ucopy.memory += skb->truesize;
1933     if (tp->ucopy.memory > sk->sk_rcvbuf) {  // the prequeue is over budget: flush it
1934         struct sk_buff *skb1;
1935         
1936         BUG_ON(sock_owned_by_user(sk));
1937 
1938         while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1939             sk_backlog_rcv(sk, skb1);
1940             NET_INC_STATS_BH(sock_net(sk),
1941                      LINUX_MIB_TCPPREQUEUEDROPPED);
1942         }
1943 
1944         tp->ucopy.memory = 0;
...
As the code shows, prequeue buffer management is very simple: once the accumulated memory exceeds sk_rcvbuf, every queued skb is immediately processed through sk_backlog_rcv (counted under LINUX_MIB_TCPPREQUEUEDROPPED, although the data is consumed rather than discarded) and the memory count is reset.

(2) The asynchronous wait queue: skbs placed on this queue are not subject to buffer accounting.

(3) The backlog queue:

1961 int tcp_v4_rcv(struct sk_buff *skb)
1962 {
...
2039     } else if (unlikely(sk_add_backlog(sk, skb,
2040                        sk->sk_rcvbuf + sk->sk_sndbuf))) {
...
sk_add_backlog checks the queued memory against the sum of the send and receive buffer limits:

 768 static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb,
 769                      unsigned int limit)
 770 {   
 771     unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
 772         
 773     return qsize > limit;
 774 }       
 775     
 776 /* The per-socket spinlock must be held here. */
 777 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
 778                           unsigned int limit)
 779 {
 780     if (sk_rcvqueues_full(sk, skb, limit))
 781         return -ENOBUFS;
 782 
 783     __sk_add_backlog(sk, skb);
 784     sk->sk_backlog.len += skb->truesize;
 785     return 0;
 786 }   
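As a rough worked example with common defaults (tcp_rmem[1] = 87380 and tcp_wmem[1] = 16384, before any auto-tuning), the backlog limit is about 87380 + 16384 ≈ 101 KB: once sk_backlog.len plus sk_rmem_alloc exceeds that sum, sk_add_backlog returns -ENOBUFS and tcp_v4_rcv drops the segment, leaving the sender's retransmission to recover it. Using the sum of both buffer limits gives the lock holder extra headroom, since backlog skbs are processed as soon as the socket lock is released.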

