應用進程使用TCP發送的數據會先放入發送緩存中,TCP的發送緩存是一個skb隊列。這個隊列存在的意義是:保證應用進程交付給TCP的數據能夠可靠地送達目的端。在收到對端對相應數據的ACK之前,發送緩存中的數據不能刪除。
10.2.1 使用緩存
對發送緩存的使用是從tcp_sendmsg函數開始的:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
...
1106 if (copy <= 0) {
1107 new_segment:
1108 /* Allocate new segment. If the interface is SG,
1109 * allocate skb fitting to single page.
1110 */
1111 if (!sk_stream_memory_free(sk)) //檢查已佔用內存是否達到限制
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation); //申請內存
1117 if (!skb)
1118 goto wait_for_memory;
...
1133 skb_entail(sk, skb);
...
1149 } else { //使用skb的非線性區
1150 bool merge = true;
1151 int i = skb_shinfo(skb)->nr_frags;
1152 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154 if (!sk_page_frag_refill(sk, pfrag))
1155 goto wait_for_memory;
...
1168 if (!sk_wmem_schedule(sk, copy))
1169 goto wait_for_memory;
1170
1171 err = skb_copy_to_page_nocache(sk, from, skb,
1172 pfrag->page,
1173 pfrag->offset,
1174 copy);
sk_stream_memory_free函數:
743 static inline bool sk_stream_memory_free(const struct sock *sk)
744 {
745 return sk->sk_wmem_queued < sk->sk_sndbuf; /* true while the bytes queued in the send queue are below the send-buffer limit */
746 }
當發送隊列中佔用緩存總數小於發送緩存大小時,則發送緩存尚有剩餘空間。
sk_stream_alloc_skb函數用於申請skb:
754 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
755 {
756 struct sk_buff *skb;
757
758 /* The TCP header must be at least 32-bit aligned. */
759 size = ALIGN(size, 4);
760
761 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
762 if (skb) {
763 if (sk_wmem_schedule(sk, skb->truesize)) { //check whether skb->truesize bytes of memory may be charged to this socket
764 skb_reserve(skb, sk->sk_prot->max_header); //leave headroom for the largest possible protocol headers
765 /*
766 * Make sure that we have exactly size bytes
767 * available to the caller, no more, no less.
768 */
769 skb->reserved_tailroom = skb->end - skb->tail - size;
770 return skb;
771 }
772 __kfree_skb(skb); //memory accounting refused the charge: free the skb and fail
773 } else { //allocation failed: system memory is tight
774 sk->sk_prot->enter_memory_pressure(sk);//tcp_enter_memory_pressure: update the Linux MIB and set tcp_memory_pressure = 1
775 sk_stream_moderate_sndbuf(sk); //shrink the upper bound of the send buffer size
776 }
777 return NULL;
778 }
sk_wmem_schedule:1361 static inline bool sk_has_account(struct sock *sk)
1362 {
1363 /* return true if protocol supports memory accounting */
1364 return !!sk->sk_prot->memory_allocated;//for TCP this points to tcp_memory_allocated, the total memory used by all of TCP
1365 }
1366
1367 static inline bool sk_wmem_schedule(struct sock *sk, int size)
1368 {
1369 if (!sk_has_account(sk))//never true for TCP, which does support memory accounting
1370 return true;
1371 return size <= sk->sk_forward_alloc || //the request fits into the already pre-allocated (forward-alloc) quota
1372 __sk_mem_schedule(sk, size, SK_MEM_SEND); //otherwise try to charge more memory to the protocol
1373 }
__sk_mem_schedule用於增加預分配內存和已分配內存:1923 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924 {
1925 struct proto *prot = sk->sk_prot;
1926 int amt = sk_mem_pages(size); //number of whole SK_MEM_QUANTUM units needed to hold size bytes
1927 long allocated;
1928 int parent_status = UNDER_LIMIT;
1929
1930 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; //optimistically grow this socket's pre-allocated quota
1931
1932 allocated = sk_memory_allocated_add(sk, amt, &parent_status); //tcp_memory_allocated += amt
1933
1934 /* Under limit. */
1935 if (parent_status == UNDER_LIMIT &&
1936 allocated <= sk_prot_mem_limits(sk, 0)) { //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] kernel parameter
1937 sk_leave_memory_pressure(sk);
1938 return 1;
1939 }
1940
1941 /* Under pressure. (we or our parents) */
1942 if ((parent_status > SOFT_LIMIT) ||
1943 allocated > sk_prot_mem_limits(sk, 1)) //sk_prot_mem_limits(sk, 1) equals the net.ipv4.tcp_mem[1] kernel parameter
1944 sk_enter_memory_pressure(sk); //set tcp_memory_pressure to 1
1945
1946 /* Over hard limit (we or our parents) */
1947 if ((parent_status == OVER_LIMIT) ||
1948 (allocated > sk_prot_mem_limits(sk, 2)))//sk_prot_mem_limits(sk, 2) equals the net.ipv4.tcp_mem[2] kernel parameter
1949 goto suppress_allocation; //memory pressure is severe
1950
1951 /* guarantee minimum buffer size under pressure */
1952 if (kind == SK_MEM_RECV) {
1953 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])//prot->sysctl_rmem[0] equals the net.ipv4.tcp_rmem[0] kernel parameter
1954 return 1;
1955
1956 } else { /* SK_MEM_SEND */
1957 if (sk->sk_type == SOCK_STREAM) {
1958 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])//prot->sysctl_wmem[0] equals the net.ipv4.tcp_wmem[0] kernel parameter
1959 return 1;
1960 } else if (atomic_read(&sk->sk_wmem_alloc) <
1961 prot->sysctl_wmem[0])
1962 return 1;
1963 }
1964
1965 if (sk_has_memory_pressure(sk)) {
1966 int alloc;
1967
1968 if (!sk_under_memory_pressure(sk)) //tcp_memory_pressure == 0
1969 return 1;
1970 alloc = sk_sockets_allocated_read_positive(sk); //number of TCP sockets currently allocated
1971 if (sk_prot_mem_limits(sk, 2) > alloc *
1972 sk_mem_pages(sk->sk_wmem_queued +
1973 atomic_read(&sk->sk_rmem_alloc) +
1974 sk->sk_forward_alloc)) //allow if net.ipv4.tcp_mem[2] > socket count * pages used by this socket (send queue + receive queue + remaining pre-allocated quota), i.e. this socket is not using more than its fair share
1975 return 1;
1976 }
1977
1978 suppress_allocation:
1979
1980 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981 sk_stream_moderate_sndbuf(sk); //shrink sk_sndbuf
1982
1983 /* Fail only if socket is _under_ its sndbuf.
1984 * In this case we cannot block, so that we have to fail.
1985 */
1986 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) //allow: sndbuf was just reduced above, so the send path will hit the sndbuf limit and block, stopping further memory consumption anyway
1987 return 1;
1988 }
1989
1990 trace_sock_exceed_buf_limit(sk, prot, allocated);
1991 //allocation refused: roll the memory counters back to their values before this call
1992 /* Alas. Undo changes. */
1993 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995 sk_memory_allocated_sub(sk, amt);
1996
1997 return 0;
1998 }
skb_entail函數會將skb放入發送隊列,並更新緩存信息: 596 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
597 {
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
600
601 skb->csum = 0;
602 tcb->seq = tcb->end_seq = tp->write_seq; //seq == end_seq: no payload bytes accounted to the skb yet
603 tcb->tcp_flags = TCPHDR_ACK;
604 tcb->sacked = 0;
605 skb_header_release(skb);
606 tcp_add_write_queue_tail(sk, skb); //append the skb to the tail of the send queue
607 sk->sk_wmem_queued += skb->truesize; //update sk_wmem_queued
608 sk_mem_charge(sk, skb->truesize); //consume the same amount from the pre-allocated quota (sk_forward_alloc)
609 if (tp->nonagle & TCP_NAGLE_PUSH)
610 tp->nonagle &= ~TCP_NAGLE_PUSH; //clear the pending one-shot PUSH flag
611 }
sk_mem_charge函數會更新預分配緩存的值:1401 static inline void sk_mem_charge(struct sock *sk, int size)
1402 {
1403 if (!sk_has_account(sk)) //protocol does no memory accounting: nothing to update
1404 return;
1405 sk->sk_forward_alloc -= size; //consume size bytes of the pre-allocated quota
1406 }
成功申請非線性區的空間後要使用sk_page_frag_refill函數更新內存信息:1796 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797 {
1798 int order;
1799
1800 if (pfrag->page) { //a cached page already exists: try to reuse it
1801 if (atomic_read(&pfrag->page->_count) == 1) { //we hold the only reference, so the whole page can be reused from offset 0
1802 pfrag->offset = 0;
1803 return true;
1804 }
1805 if (pfrag->offset < pfrag->size) //the page still has free space
1806 return true;
1807 put_page(pfrag->page); //page is full and still referenced elsewhere: drop our reference before allocating a new one
1808 }
1809
1810 /* We restrict high order allocations to users that can afford to wait */
1811 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813 do {
1814 gfp_t gfp = sk->sk_allocation;
1815
1816 if (order)
1817 gfp |= __GFP_COMP | __GFP_NOWARN; //high-order allocation: request a compound page and suppress failure warnings (we retry with a smaller order)
1818 pfrag->page = alloc_pages(gfp, order);
1819 if (likely(pfrag->page)) {
1820 pfrag->offset = 0;
1821 pfrag->size = PAGE_SIZE << order;
1822 return true;
1823 }
1824 } while (--order >= 0); //fall back to progressively smaller orders, down to a single page
1825 //page allocation failed
1826 sk_enter_memory_pressure(sk); //set tcp_memory_pressure
1827 sk_stream_moderate_sndbuf(sk); //shrink the send buffer limit
1828 return false;
1829 }
向非線性區填充數據後要使用skb_copy_to_page_nocache更新緩存信息:1832 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1833 struct sk_buff *skb,
1834 struct page *page,
1835 int off, int copy)
1836 {
1837 int err;
1838
1839 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1840 copy, skb->len); //copy copy bytes of user data into the page at offset off
1841 if (err)
1842 return err; //copy failed: report the error without touching any counters
1843
1844 skb->len += copy;
1845 skb->data_len += copy; //data_len counts bytes held in the non-linear (paged) area
1846 skb->truesize += copy;
1847 sk->sk_wmem_queued += copy; //account the new bytes in the send queue
1848 sk_mem_charge(sk, copy); //consume the same amount from the pre-allocated quota
1849 return 0;
1850 }
tcp_transmit_skb函數在發送skb時會將skb佔用的內存計入sk->sk_wmem_alloc:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree; //skb釋放時調用tcp_wfree或sock_wfree
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
...
10.2.2 釋放緩存
tcp_transmit_skb發送出去的skb被釋放時(網卡驅動在發送完畢數據後釋放skb,或IP發送隊列滿導致丟包時)會調用tcp_wfree或sock_wfree函數,並更新sk->sk_wmem_alloc的數值:
791 void tcp_wfree(struct sk_buff *skb)
792 {
793 struct sock *sk = skb->sk;
794 struct tcp_sock *tp = tcp_sk(sk);
795
796 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
797 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { //the socket was throttled (TSQ) and is not yet queued on the tasklet list
798 unsigned long flags;
799 struct tsq_tasklet *tsq;
800
801 /* Keep a ref on socket.
802 * This last ref will be released in tcp_tasklet_func()
803 */
804 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); //release the skb's charge minus 1, keeping that 1 as a reference on the socket
805
806 /* queue this socket to tasklet queue */
807 local_irq_save(flags);
808 tsq = &__get_cpu_var(tsq_tasklet); //per-CPU tasklet list
809 list_add(&tp->tsq_node, &tsq->head);
810 tasklet_schedule(&tsq->tasklet);
811 local_irq_restore(flags);
812 } else {
813 sock_wfree(skb); //normal path: fully release the skb's send-buffer charge
814 }
815 }
sock_wfree函數:1534 void sock_wfree(struct sk_buff *skb)
1535 {
1536 struct sock *sk = skb->sk;
1537 unsigned int len = skb->truesize;
1538
1539 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540 /*
1541 * Keep a reference on sk_wmem_alloc, this will be released
1542 * after sk_write_space() call
1543 */
1544 atomic_sub(len - 1, &sk->sk_wmem_alloc); //subtract all but 1 so the socket cannot be freed while sk_write_space runs
1545 sk->sk_write_space(sk); //wake any writer waiting for send-buffer space
1546 len = 1; //the remaining single unit is released below
1547 }
1548 /*
1549 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550 * could not do because of in-flight packets
1551 */
1552 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553 __sk_free(sk);
1554 }
收到對端的ACK後,tcp_ack函數會調用tcp_clean_rtx_queue釋放發送緩存中的skb,tcp_clean_rtx_queue函數會調用sk_wmem_free_skb釋放skb並更新內存信息:
1408 static inline void sk_mem_uncharge(struct sock *sk, int size)
1409 {
1410 if (!sk_has_account(sk)) //protocol does no memory accounting: nothing to update
1411 return;
1412 sk->sk_forward_alloc += size; //return size bytes to the pre-allocated quota
1413 }
1414
1415 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1416 {
1417 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); //record that the send queue shrank; tcp_check_space tests this flag later
1418 sk->sk_wmem_queued -= skb->truesize;
1419 sk_mem_uncharge(sk, skb->truesize);
1420 __kfree_skb(skb);
1421 }
收到ACK後TCP會調用tcp_data_snd_check函數嘗試擴大發送緩存:
4688 static bool tcp_should_expand_sndbuf(const struct sock *sk)
4689 {
4690 const struct tcp_sock *tp = tcp_sk(sk);
4691
4692 /* If the user specified a specific send buffer setting, do
4693 * not modify it.
4694 */
4695 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) //set when the application configured the buffer via the SO_SNDBUF socket option
4696 return false;
4697
4698 /* If we are under global TCP memory pressure, do not expand. */
4699 if (sk_under_memory_pressure(sk)) //tcp_memory_pressure != 0
4700 return false;
4701
4702 /* If we are under soft global TCP memory pressure, do not expand. */
4703 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) //total TCP memory already at or above net.ipv4.tcp_mem[0]
4704 return false;
4705
4706 /* If we filled the congestion window, do not expand. */
4707 if (tp->packets_out >= tp->snd_cwnd) //the congestion window, not the buffer, is the bottleneck
4708 return false;
4709
4710 return true;
4711 }
...
4719 static void tcp_new_space(struct sock *sk)
4720 {
4721 struct tcp_sock *tp = tcp_sk(sk);
4722
4723 if (tcp_should_expand_sndbuf(sk)) {
4724 int sndmem = SKB_TRUESIZE(max_t(u32,
4725 tp->rx_opt.mss_clamp,
4726 tp->mss_cache) +
4727 MAX_TCP_HEADER); //true size of one maximum-sized segment, headers included
4728 int demanded = max_t(unsigned int, tp->snd_cwnd,
4729 tp->reordering + 1); //number of segments the connection may need in flight
4730 sndmem *= 2 * demanded; //twice the demanded number of maximum-sized segments
4731 if (sndmem > sk->sk_sndbuf)
4732 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); //never exceed net.ipv4.tcp_wmem[2]
4733 tp->snd_cwnd_stamp = tcp_time_stamp;
4734 }
4735
4736 sk->sk_write_space(sk); //points to sk_stream_write_space: notify the application that send-buffer space is available so it can send data
4737 }
4738
4739 static void tcp_check_space(struct sock *sk)
4740 {
4741 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { //the send queue shrank (skbs were freed, e.g. by sk_wmem_free_skb after an ACK)
4742 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4743 if (sk->sk_socket &&
4744 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) //some process is waiting for send-buffer space
4745 tcp_new_space(sk); //try to enlarge the send buffer
4746 }
4747 }
4748
4749 static inline void tcp_data_snd_check(struct sock *sk)
4750 {
4751 tcp_push_pending_frames(sk); //first try to transmit any data still queued
4752 tcp_check_space(sk); //then check whether the send buffer can be enlarged
4753 }
擴大發送緩存的條件:
(1)由於ACK確認了數據並刪除了skb使得發送隊列減小
(2)應用進程在向內核寫入數據時由於內存不足而等待
(3)應用進程沒有使用SO_SNDBUF socket選項設置snd_buf大小
(4)全局TCP緩存沒有處於極度緊張狀態
(5)全局TCP緩存沒有處於相對緊張狀態(全局已分配TCP內存小於net.ipv4.tcp_mem[0])
(6)TCP已經發送並在網絡中的包數小於擁塞窗口