應用進程使用TCP發送的數據會先放入發送緩存中,TCP的發送緩存是一個skb隊列。這個隊列存在的意義是:保證應用進程交付給TCP的數據能夠可靠地送達目的端。在收到對端對相應數據的ACK之前,發送緩存中的數據不能刪除。
10.2.1 使用緩存
對發送緩存的使用是從tcp_sendmsg函數開始的:
1016 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1017 size_t size)
1018 {
1019 struct iovec *iov;
1020 struct tcp_sock *tp = tcp_sk(sk);
1021 struct sk_buff *skb;
1022 int iovlen, flags, err, copied = 0;
1023 int mss_now = 0, size_goal, copied_syn = 0, offset = 0;
1024 bool sg;
1025 long timeo;
...
1106 if (copy <= 0) {
1107 new_segment:
1108 /* Allocate new segment. If the interface is SG,
1109 * allocate skb fitting to single page.
1110 */
1111 if (!sk_stream_memory_free(sk)) //檢查已佔用內存是否達到限制
1112 goto wait_for_sndbuf;
1113
1114 skb = sk_stream_alloc_skb(sk,
1115 select_size(sk, sg),
1116 sk->sk_allocation); //申請內存
1117 if (!skb)
1118 goto wait_for_memory;
...
1133 skb_entail(sk, skb);
...
1149 } else { //使用skb的非線性區
1150 bool merge = true;
1151 int i = skb_shinfo(skb)->nr_frags;
1152 struct page_frag *pfrag = sk_page_frag(sk);
1153
1154 if (!sk_page_frag_refill(sk, pfrag))
1155 goto wait_for_memory;
...
1168 if (!sk_wmem_schedule(sk, copy))
1169 goto wait_for_memory;
1170
1171 err = skb_copy_to_page_nocache(sk, from, skb,
1172 pfrag->page,
1173 pfrag->offset,
1174 copy);
sk_stream_memory_free函數:
743 static inline bool sk_stream_memory_free(const struct sock *sk)
744 {
745 return sk->sk_wmem_queued < sk->sk_sndbuf; /* true while the bytes queued in the send queue are below the send-buffer limit */
746 }
當發送隊列中佔用緩存總數小於發送緩存大小時,則發送緩存尚有剩餘空間。
sk_stream_alloc_skb函數用於申請skb:
754 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
755 {
756 struct sk_buff *skb;
757
758 /* The TCP header must be at least 32-bit aligned. */
759 size = ALIGN(size, 4);
760
761 skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
762 if (skb) {
763 if (sk_wmem_schedule(sk, skb->truesize)) { //check whether skb->truesize bytes of memory may be charged to this socket
764 skb_reserve(skb, sk->sk_prot->max_header); //leave headroom for the largest possible protocol headers
765 /*
766 * Make sure that we have exactly size bytes
767 * available to the caller, no more, no less.
768 */
769 skb->reserved_tailroom = skb->end - skb->tail - size;
770 return skb;
771 }
772 __kfree_skb(skb); //memory accounting refused the charge: free the skb and fail
773 } else { //allocation failed: system memory is tight
774 sk->sk_prot->enter_memory_pressure(sk);//tcp_enter_memory_pressure: update the Linux MIB and set tcp_memory_pressure = 1
775 sk_stream_moderate_sndbuf(sk); //shrink the upper bound of the send buffer size
776 }
777 return NULL;
778 }
sk_wmem_schedule:1361 static inline bool sk_has_account(struct sock *sk)
1362 {
1363 /* return true if protocol supports memory accounting */
1364 return !!sk->sk_prot->memory_allocated;//for TCP this points to tcp_memory_allocated, the total memory used by all of TCP
1365 }
1366
1367 static inline bool sk_wmem_schedule(struct sock *sk, int size)
1368 {
1369 if (!sk_has_account(sk))//never true for TCP, which does support memory accounting
1370 return true;
1371 return size <= sk->sk_forward_alloc || //the request fits into the already pre-allocated (forward-alloc) quota
1372 __sk_mem_schedule(sk, size, SK_MEM_SEND); //otherwise try to charge more memory to the protocol
1373 }
__sk_mem_schedule用於增加預分配內存和已分配內存:1923 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1924 {
1925 struct proto *prot = sk->sk_prot;
1926 int amt = sk_mem_pages(size); //number of whole SK_MEM_QUANTUM units needed to hold size bytes
1927 long allocated;
1928 int parent_status = UNDER_LIMIT;
1929
1930 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; //optimistically grow this socket's pre-allocated quota
1931
1932 allocated = sk_memory_allocated_add(sk, amt, &parent_status); //tcp_memory_allocated += amt
1933
1934 /* Under limit. */
1935 if (parent_status == UNDER_LIMIT &&
1936 allocated <= sk_prot_mem_limits(sk, 0)) { //sk_prot_mem_limits(sk, 0) equals the net.ipv4.tcp_mem[0] kernel parameter
1937 sk_leave_memory_pressure(sk);
1938 return 1;
1939 }
1940
1941 /* Under pressure. (we or our parents) */
1942 if ((parent_status > SOFT_LIMIT) ||
1943 allocated > sk_prot_mem_limits(sk, 1)) //sk_prot_mem_limits(sk, 1) equals the net.ipv4.tcp_mem[1] kernel parameter
1944 sk_enter_memory_pressure(sk); //set tcp_memory_pressure to 1
1945
1946 /* Over hard limit (we or our parents) */
1947 if ((parent_status == OVER_LIMIT) ||
1948 (allocated > sk_prot_mem_limits(sk, 2)))//sk_prot_mem_limits(sk, 2) equals the net.ipv4.tcp_mem[2] kernel parameter
1949 goto suppress_allocation; //memory pressure is severe
1950
1951 /* guarantee minimum buffer size under pressure */
1952 if (kind == SK_MEM_RECV) {
1953 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])//prot->sysctl_rmem[0] equals the net.ipv4.tcp_rmem[0] kernel parameter
1954 return 1;
1955
1956 } else { /* SK_MEM_SEND */
1957 if (sk->sk_type == SOCK_STREAM) {
1958 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])//prot->sysctl_wmem[0] equals the net.ipv4.tcp_wmem[0] kernel parameter
1959 return 1;
1960 } else if (atomic_read(&sk->sk_wmem_alloc) <
1961 prot->sysctl_wmem[0])
1962 return 1;
1963 }
1964
1965 if (sk_has_memory_pressure(sk)) {
1966 int alloc;
1967
1968 if (!sk_under_memory_pressure(sk)) //tcp_memory_pressure == 0
1969 return 1;
1970 alloc = sk_sockets_allocated_read_positive(sk); //number of TCP sockets currently allocated
1971 if (sk_prot_mem_limits(sk, 2) > alloc *
1972 sk_mem_pages(sk->sk_wmem_queued +
1973 atomic_read(&sk->sk_rmem_alloc) +
1974 sk->sk_forward_alloc)) //allow if net.ipv4.tcp_mem[2] > socket count * pages used by this socket (send queue + receive queue + remaining pre-allocated quota), i.e. this socket is not using more than its fair share
1975 return 1;
1976 }
1977
1978 suppress_allocation:
1979
1980 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1981 sk_stream_moderate_sndbuf(sk); //shrink sk_sndbuf
1982
1983 /* Fail only if socket is _under_ its sndbuf.
1984 * In this case we cannot block, so that we have to fail.
1985 */
1986 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) //allow: sndbuf was just reduced above, so the send path will hit the sndbuf limit and block, stopping further memory consumption anyway
1987 return 1;
1988 }
1989
1990 trace_sock_exceed_buf_limit(sk, prot, allocated);
1991 //allocation refused: roll the memory counters back to their values before this call
1992 /* Alas. Undo changes. */
1993 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1994
1995 sk_memory_allocated_sub(sk, amt);
1996
1997 return 0;
1998 }
skb_entail函數會將skb放入發送隊列,並更新緩存信息: 596 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
597 {
598 struct tcp_sock *tp = tcp_sk(sk);
599 struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
600
601 skb->csum = 0;
602 tcb->seq = tcb->end_seq = tp->write_seq; //seq == end_seq: no payload bytes accounted to the skb yet
603 tcb->tcp_flags = TCPHDR_ACK;
604 tcb->sacked = 0;
605 skb_header_release(skb);
606 tcp_add_write_queue_tail(sk, skb); //append the skb to the tail of the send queue
607 sk->sk_wmem_queued += skb->truesize; //update sk_wmem_queued
608 sk_mem_charge(sk, skb->truesize); //consume the same amount from the pre-allocated quota (sk_forward_alloc)
609 if (tp->nonagle & TCP_NAGLE_PUSH)
610 tp->nonagle &= ~TCP_NAGLE_PUSH; //clear the pending one-shot PUSH flag
611 }
sk_mem_charge函數會更新預分配緩存的值:1401 static inline void sk_mem_charge(struct sock *sk, int size)
1402 {
1403 if (!sk_has_account(sk)) //protocol does no memory accounting: nothing to update
1404 return;
1405 sk->sk_forward_alloc -= size; //consume size bytes of the pre-allocated quota
1406 }
成功申請非線性區的空間後要使用sk_page_frag_refill函數更新內存信息:1796 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1797 {
1798 int order;
1799
1800 if (pfrag->page) { //a cached page already exists: try to reuse it
1801 if (atomic_read(&pfrag->page->_count) == 1) { //we hold the only reference, so the whole page can be reused from offset 0
1802 pfrag->offset = 0;
1803 return true;
1804 }
1805 if (pfrag->offset < pfrag->size) //the page still has free space
1806 return true;
1807 put_page(pfrag->page); //page is full and still referenced elsewhere: drop our reference before allocating a new one
1808 }
1809
1810 /* We restrict high order allocations to users that can afford to wait */
1811 order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1812
1813 do {
1814 gfp_t gfp = sk->sk_allocation;
1815
1816 if (order)
1817 gfp |= __GFP_COMP | __GFP_NOWARN; //high-order allocation: request a compound page and suppress failure warnings (we retry with a smaller order)
1818 pfrag->page = alloc_pages(gfp, order);
1819 if (likely(pfrag->page)) {
1820 pfrag->offset = 0;
1821 pfrag->size = PAGE_SIZE << order;
1822 return true;
1823 }
1824 } while (--order >= 0); //fall back to progressively smaller orders, down to a single page
1825 //page allocation failed
1826 sk_enter_memory_pressure(sk); //set tcp_memory_pressure
1827 sk_stream_moderate_sndbuf(sk); //shrink the send buffer limit
1828 return false;
1829 }
向非線性區填充數據後要使用skb_copy_to_page_nocache更新緩存信息:1832 static inline int skb_copy_to_page_nocache(struct sock *sk, char __user *from,
1833 struct sk_buff *skb,
1834 struct page *page,
1835 int off, int copy)
1836 {
1837 int err;
1838
1839 err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
1840 copy, skb->len); //copy copy bytes of user data into the page at offset off
1841 if (err)
1842 return err; //copy failed: report the error without touching any counters
1843
1844 skb->len += copy;
1845 skb->data_len += copy; //data_len counts bytes held in the non-linear (paged) area
1846 skb->truesize += copy;
1847 sk->sk_wmem_queued += copy; //account the new bytes in the send queue
1848 sk_mem_charge(sk, copy); //consume the same amount from the pre-allocated quota
1849 return 0;
1850 }
tcp_transmit_skb函數在發送skb時會將skb佔用的內存計入sk->sk_wmem_alloc:
828 static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
829 gfp_t gfp_mask)
830 {
...
890 skb->destructor = (sysctl_tcp_limit_output_bytes > 0) ?
891 tcp_wfree : sock_wfree; //skb釋放時調用tcp_wfree或sock_wfree
892 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
...
10.2.2 釋放緩存
tcp_transmit_skb發送出去的skb被釋放時(網卡驅動在發送完畢數據後釋放skb,或IP發送隊列滿導致丟包時)會調用tcp_wfree或sock_wfree函數,並更新sk->sk_wmem_alloc的數值:
791 void tcp_wfree(struct sk_buff *skb)
792 {
793 struct sock *sk = skb->sk;
794 struct tcp_sock *tp = tcp_sk(sk);
795
796 if (test_and_clear_bit(TSQ_THROTTLED, &tp->tsq_flags) &&
797 !test_and_set_bit(TSQ_QUEUED, &tp->tsq_flags)) { //the socket was throttled (TSQ) and is not yet queued on the tasklet list
798 unsigned long flags;
799 struct tsq_tasklet *tsq;
800
801 /* Keep a ref on socket.
802 * This last ref will be released in tcp_tasklet_func()
803 */
804 atomic_sub(skb->truesize - 1, &sk->sk_wmem_alloc); //release the skb's charge minus 1, keeping that 1 as a reference on the socket
805
806 /* queue this socket to tasklet queue */
807 local_irq_save(flags);
808 tsq = &__get_cpu_var(tsq_tasklet); //per-CPU tasklet list
809 list_add(&tp->tsq_node, &tsq->head);
810 tasklet_schedule(&tsq->tasklet);
811 local_irq_restore(flags);
812 } else {
813 sock_wfree(skb); //normal path: fully release the skb's send-buffer charge
814 }
815 }
sock_wfree函數:1534 void sock_wfree(struct sk_buff *skb)
1535 {
1536 struct sock *sk = skb->sk;
1537 unsigned int len = skb->truesize;
1538
1539 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1540 /*
1541 * Keep a reference on sk_wmem_alloc, this will be released
1542 * after sk_write_space() call
1543 */
1544 atomic_sub(len - 1, &sk->sk_wmem_alloc); //subtract all but 1 so the socket cannot be freed while sk_write_space runs
1545 sk->sk_write_space(sk); //wake any writer waiting for send-buffer space
1546 len = 1; //the remaining single unit is released below
1547 }
1548 /*
1549 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1550 * could not do because of in-flight packets
1551 */
1552 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1553 __sk_free(sk);
1554 }
收到對端的ACK後,tcp_ack函數會調用tcp_clean_rtx_queue釋放發送緩存中的skb,tcp_clean_rtx_queue函數會調用sk_wmem_free_skb釋放skb並更新內存信息:
1408 static inline void sk_mem_uncharge(struct sock *sk, int size)
1409 {
1410 if (!sk_has_account(sk)) //protocol does no memory accounting: nothing to update
1411 return;
1412 sk->sk_forward_alloc += size; //return size bytes to the pre-allocated quota
1413 }
1414
1415 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
1416 {
1417 sock_set_flag(sk, SOCK_QUEUE_SHRUNK); //record that the send queue shrank; tcp_check_space tests this flag later
1418 sk->sk_wmem_queued -= skb->truesize;
1419 sk_mem_uncharge(sk, skb->truesize);
1420 __kfree_skb(skb);
1421 }
收到ACK後TCP會調用tcp_data_snd_check函數嘗試擴大發送緩存:
4688 static bool tcp_should_expand_sndbuf(const struct sock *sk)
4689 {
4690 const struct tcp_sock *tp = tcp_sk(sk);
4691
4692 /* If the user specified a specific send buffer setting, do
4693 * not modify it.
4694 */
4695 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) //set when the application configured the buffer via the SO_SNDBUF socket option
4696 return false;
4697
4698 /* If we are under global TCP memory pressure, do not expand. */
4699 if (sk_under_memory_pressure(sk)) //tcp_memory_pressure != 0
4700 return false;
4701
4702 /* If we are under soft global TCP memory pressure, do not expand. */
4703 if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) //total TCP memory already at or above net.ipv4.tcp_mem[0]
4704 return false;
4705
4706 /* If we filled the congestion window, do not expand. */
4707 if (tp->packets_out >= tp->snd_cwnd) //the congestion window, not the buffer, is the bottleneck
4708 return false;
4709
4710 return true;
4711 }
...
4719 static void tcp_new_space(struct sock *sk)
4720 {
4721 struct tcp_sock *tp = tcp_sk(sk);
4722
4723 if (tcp_should_expand_sndbuf(sk)) {
4724 int sndmem = SKB_TRUESIZE(max_t(u32,
4725 tp->rx_opt.mss_clamp,
4726 tp->mss_cache) +
4727 MAX_TCP_HEADER); //true size of one maximum-sized segment, headers included
4728 int demanded = max_t(unsigned int, tp->snd_cwnd,
4729 tp->reordering + 1); //number of segments the connection may need in flight
4730 sndmem *= 2 * demanded; //twice the demanded number of maximum-sized segments
4731 if (sndmem > sk->sk_sndbuf)
4732 sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); //never exceed net.ipv4.tcp_wmem[2]
4733 tp->snd_cwnd_stamp = tcp_time_stamp;
4734 }
4735
4736 sk->sk_write_space(sk); //points to sk_stream_write_space: notify the application that send-buffer space is available so it can send data
4737 }
4738
4739 static void tcp_check_space(struct sock *sk)
4740 {
4741 if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { //the send queue shrank (skbs were freed, e.g. by sk_wmem_free_skb after an ACK)
4742 sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
4743 if (sk->sk_socket &&
4744 test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) //some process is waiting for send-buffer space
4745 tcp_new_space(sk); //try to enlarge the send buffer
4746 }
4747 }
4748
4749 static inline void tcp_data_snd_check(struct sock *sk)
4750 {
4751 tcp_push_pending_frames(sk); //first try to transmit any data still queued
4752 tcp_check_space(sk); //then check whether the send buffer can be enlarged
4753 }
擴大發送緩存的條件:
(1)由於ACK確認了數據並刪除了skb使得發送隊列減小
(2)應用進程在向內核寫入數據時由於內存不足而等待
(3)應用進程沒有使用SO_SNDBUF socket選項設置snd_buf大小
(4)全局TCP緩存沒有處於極度緊張狀態
(5)全局TCP緩存沒有處於相對緊張狀態(全局已分配TCP內存小於net.ipv4.tcp_mem[0])
(6)TCP已經發送並在網絡中的包數小於擁塞窗口