學習Linux-4.12內核網路協議棧(1.2)——協議棧的初始化(sk_buff)

sk_buff 是網絡數據包的承載,是最關鍵的結構體之一


/**
 *      struct sk_buff - socket buffer
 *      @next: Next buffer in list
 *      @prev: Previous buffer in list
 *      @tstamp: Time we arrived/left
 *      @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *      @sk: Socket we are owned by
 *      @dev: Device we arrived on/are leaving by
 *      @cb: Control buffer. Free for use by every layer. Put private vars here
 *      @_skb_refdst: destination entry (with norefcount bit)
 *      @sp: the security path, used for xfrm
 *      @len: Length of actual data
 *      @data_len: Data length
 *      @mac_len: Length of link layer header
 *      @hdr_len: writable header length of cloned skb
 *      @csum: Checksum (must include start/offset pair)
 *      @csum_start: Offset from skb->head where checksumming should start
 *      @csum_offset: Offset from csum_start where checksum should be stored
 *      @priority: Packet queueing priority
 *      @ignore_df: allow local fragmentation
 *      @cloned: Head may be cloned (check refcnt to be sure)
 *      @ip_summed: Driver fed us an IP checksum
 *      @nohdr: Payload reference only, must not modify header
 *      @pkt_type: Packet class
 *      @fclone: skbuff clone status
 *      @ipvs_property: skbuff is owned by ipvs
 *      @tc_skip_classify: do not classify packet. set by IFB device
 *      @tc_at_ingress: used within tc_classify to distinguish in/egress
 *      @tc_redirected: packet was redirected by a tc action
 *      @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
 *      @peeked: this packet has been seen already, so stats have been
 *              done for it, don't do them again
 *      @nf_trace: netfilter packet trace flag
 *      @protocol: Packet protocol from driver
 *      @destructor: Destruct function
 *      @_nfct: Associated connection, if any (with nfctinfo bits)
 *      @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *      @skb_iif: ifindex of device we arrived on
 *      @tc_index: Traffic control index
 *      @hash: the packet hash
 *      @queue_mapping: Queue mapping for multiqueue devices
 *      @xmit_more: More SKBs are pending for this queue
 *      @ndisc_nodetype: router type (from link layer)
 *      @ooo_okay: allow the mapping of a socket to a queue to be changed
 *      @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *              ports.
 *      @sw_hash: indicates hash was computed in software stack
 *      @wifi_acked_valid: wifi_acked was set
 *      @wifi_acked: whether frame was acked on wifi or not
 *      @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *      @dst_pending_confirm: need to confirm neighbour
 *      @napi_id: id of the NAPI struct this skb came from
 *      @secmark: security marking
 *      @mark: Generic packet mark
 *      @vlan_proto: vlan encapsulation protocol
 *      @vlan_tci: vlan tag control information
 *      @inner_protocol: Protocol (encapsulation)
 *      @inner_transport_header: Inner transport layer header (encapsulation)
 *      @inner_network_header: Network layer header (encapsulation)
 *      @inner_mac_header: Link layer header (encapsulation)
 *      @transport_header: Transport layer header
 *      @network_header: Network layer header
 *      @mac_header: Link layer header
 *      @tail: Tail pointer
 *      @end: End pointer
 *      @head: Head of buffer
 *      @data: Data head pointer
 *      @truesize: Buffer size
 *      @users: User count - see {datagram,tcp}.c
 */

struct sk_buff {
        union {
                struct {
                        /* These two members must be first. */
                        struct sk_buff          *next;  /* next buffer in the list */
                        struct sk_buff          *prev;  /* previous buffer in the list */

                        union {
                                /* Packet timestamp; set when the packet
                                 * arrives or leaves.
                                 */
                                ktime_t         tstamp;
                                struct skb_mstamp skb_mstamp;
                        };
                };
                struct rb_node  rbnode; /* used in netem & tcp stack */
        };

        /*
         * Socket that owns this buffer. On the receive path it is only
         * set once the packet reaches the socket (inet) layer; packets
         * that are forwarded or handled purely at the IP layer leave it
         * unset.
         */
        struct sock             *sk;

        union {
                /*
                 * Device the packet arrived on (receive) or is leaving by
                 * (transmit). It may be changed as the packet moves through
                 * the stack; the ifindex of the device the packet arrived
                 * on is kept separately in skb_iif.
                 */
                struct net_device       *dev;

                /* Some protocols might use this space to store information,
                 * while device pointer would be NULL.
                 * UDP receive path is one user.
                 */
                unsigned long           dev_scratch;
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                    cb[48] __aligned(8);

        unsigned long           _skb_refdst;    /* destination entry (with norefcount bit) */
        /*
         * Destructor: invoked when this skb is freed so the owner can do
         * its final clean-up.
         */
        void                    (*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
        struct  sec_path        *sp;            /* security path, used for xfrm */
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long            _nfct;         /* netfilter connection tracking entry, if any */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        struct nf_bridge_info   *nf_bridge;     /* saved data about a bridged frame */
#endif
        unsigned int            len,            /* length of the actual data */
                                data_len;       /* length of the data held in scatter-gather storage */
        __u16                   mac_len,        /* length of the link layer header */
                                hdr_len;        /* writable header length of a cloned skb */

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        kmemcheck_bitfield_begin(flags1);
        __u16                   queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK     (1 << 7)
#else
#define CLONED_MASK     1
#endif
#define CLONED_OFFSET()         offsetof(struct sk_buff, __cloned_offset)

        __u8                    __cloned_offset[0];
        __u8                    cloned:1,       /* head may be cloned (check refcnt to be sure) */
                                nohdr:1,        /* payload reference only, must not modify header */
                                fclone:2,       /* clone status: original (parent) skb or its clone (child) */
                                peeked:1,
                                head_frag:1,
                                xmit_more:1,
                                __unused:1; /* one bit hole */
        kmemcheck_bitfield_end(flags1);

        /* fields enclosed in headers_start/headers_end are copied
         * using a single memcpy() in __copy_skb_header()
         */
        /* private: */
        __u32                   headers_start[0];
        /* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX    (7 << 5)
#else
#define PKT_TYPE_MAX    7
#endif
#define PKT_TYPE_OFFSET()       offsetof(struct sk_buff, __pkt_type_offset)

        __u8                    __pkt_type_offset[0];
        /*
         * Frame class, derived from the destination MAC address, e.g.
         * PACKET_HOST (addressed to this host) or PACKET_BROADCAST.
         */
        __u8                    pkt_type:3;
        __u8                    pfmemalloc:1;
        __u8                    ignore_df:1;    /* allow local fragmentation (ignore the DF flag) */

        __u8                    nf_trace:1;
        /*
         * Checksum status, e.g. already verified, or left for the
         * hardware to complete.
         */
        __u8                    ip_summed:2;
        __u8                    ooo_okay:1;
        __u8                    l4_hash:1;
        __u8                    sw_hash:1;
        __u8                    wifi_acked_valid:1;
        __u8                    wifi_acked:1;

        __u8                    no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                    encapsulation:1;
        __u8                    encap_hdr_csum:1;
        __u8                    csum_valid:1;
        __u8                    csum_complete_sw:1;
        __u8                    csum_level:2;
        __u8                    csum_bad:1;

        __u8                    dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                    ndisc_nodetype:2;
#endif
        __u8                    ipvs_property:1;
        __u8                    inner_protocol_type:1;
        __u8                    remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
        __u8                    offload_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
        __u8                    tc_skip_classify:1;
        __u8                    tc_at_ingress:1;
        __u8                    tc_redirected:1;
        __u8                    tc_from_ingress:1;
#endif

#ifdef CONFIG_NET_SCHED
        __u16                   tc_index;       /* traffic control index */
#endif

        union {
                __wsum          csum;
                struct {
                        __u16   csum_start;
                        __u16   csum_offset;
                };
        };
        __u32                   priority;
        int                     skb_iif;
        __u32                   hash;
        __be16                  vlan_proto;
        __u16                   vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        union {
                unsigned int    napi_id;
                unsigned int    sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32           secmark;
#endif

        union {
                __u32           mark;
                __u32           reserved_tailroom;
        };

        union {
                __be16          inner_protocol;
                __u8            inner_ipproto;
        };

        __u16                   inner_transport_header;
        __u16                   inner_network_header;
        __u16                   inner_mac_header;

        /*
         * Packet protocol from the driver: the upper-layer protocol as
         * seen from the MAC layer (ARP, IP, IPv6, PPP, ...). It is
         * initialised on the receive path before any protocol handler
         * runs and selects which handler the packet is passed to.
         */
        __be16                  protocol;
        __u16                   transport_header;
        __u16                   network_header;
        __u16                   mac_header;

        /* private: */
        __u32                   headers_end[0];
        /* public: */

        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t          tail;   /* tail/end/head/data locate the regions of the data buffer */
        sk_buff_data_t          end;
        unsigned char           *head,
                                *data;
        unsigned int            truesize;       /* buffer size; __alloc_skb() sets it via SKB_TRUESIZE() */
        /*
         * Reference count: how many users hold this skb. It determines
         * when the skb may actually be freed.
         */
        atomic_t                users;
};

sk_buff結構體其實內容不少,但是掌握其中各個字段的含義對後面的分析會很重要。

這裏需要明確的是:sk_buff結構體本身存放在一塊內存裏面,而真正的數據存放在另外一塊內存裏面,sk_buff通過指針指向這塊數據區。緊接著數據區的尾部(即skb->end所指的位置)有一個skb_shared_info結構體,用於管理該數據的分片信息,例如聚合分散I/O使用的frags[]、frag_list鏈表以及GSO相關字段。


/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 *
 * Field order matters: __alloc_skb() zeroes everything that precedes
 * @dataref with a single memset() and then sets @dataref to 1, and
 * @frags must stay the last member.
 */
struct skb_shared_info {
    unsigned short  _unused;
    unsigned char   nr_frags;   /* presumably the number of frags[] entries in use - confirm */
    __u8        tx_flags;
    unsigned short  gso_size;
    /* Warning: this field is not always filled in (UFO)! */
    unsigned short  gso_segs;
    struct sk_buff  *frag_list; /* NOTE(review): looks like a list of further skbs belonging to this one - confirm */
    struct skb_shared_hwtstamps hwtstamps;
    unsigned int    gso_type;
    u32     tskey;
    __be32          ip6_frag_id;

    /*
     * Warning : all fields before dataref are cleared in __alloc_skb()
     */
    atomic_t    dataref;    /* initialised to 1 by __alloc_skb() */

    /* Intermediate layers must ensure that destructor_arg
     * remains valid until skb destructor */
    void *      destructor_arg;

    /* must be last field, see pskb_expand_head() */
    skb_frag_t  frags[MAX_SKB_FRAGS];
};

下圖是sock,skb,skb_shinfo和數據緩存之間的關係,一個sock對應的是一個應用層的socket,它的收包隊列由skb構成,skb通過alloc_skb分配在高速緩存裏面。

skb通過指針指向真實數據,這些數據存放在內存裏面,head指向頭部,end指向尾部,同時end指向的位置也是skb_shinfo的起始位置




一個sock可能收到很多個包,所以每個包都串聯在同一個鏈表裏面



如果有開啓聚合分散I/O,分片的報文共享內存,則指針情況可能是這樣的:



下面是操作SKB的一些函數,它們像“瑞士軍刀”一樣實現對skb的快捷操作:




調用alloc_skb後,得到的skb結構如下圖:



 186 /**
 187  *  __alloc_skb -   allocate a network buffer
 188  *  @size: size to allocate
 189  *  @gfp_mask: allocation mask
 190  *  @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 191  *      instead of head cache and allocate a cloned (child) skb.
 192  *      If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 193  *      allocations in case the data is required for writeback
 194  *  @node: numa node to allocate memory on
 195  *
 196  *  Allocate a new &sk_buff. The returned buffer has no headroom and a
 197  *  tail room of at least size bytes. The object has a reference count
 198  *  of one. The return is the buffer. On a failure the return is %NULL.
 199  *
 200  *  Buffers may only be allocated from interrupts using a @gfp_mask of
 201  *  %GFP_ATOMIC.
 202  */
 203 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 204                 int flags, int node)
 205 {
 206     struct kmem_cache *cache;
 207     struct skb_shared_info *shinfo;
 208     struct sk_buff *skb;
 209     u8 *data;
 210     bool pfmemalloc;
 211
 212     cache = (flags & SKB_ALLOC_FCLONE)
 213         ? skbuff_fclone_cache : skbuff_head_cache;  // choose which kmem cache the sk_buff comes from
 214
 215     if (sk_memalloc_socks() && (flags & 
SKB_ALLOC_RX))
 216         gfp_mask |= __GFP_MEMALLOC;
 217
 218     /* Get the HEAD */
 219     skb = kmem_cache_alloc_node(cache, gfp_mask & 
~__GFP_DMA, node);  // allocate the sk_buff itself from the chosen cache
 220     if (!skb)
 221         goto out;
 222     prefetchw(skb);
 224     /* We do our best to align skb_shared_info on a separate cache
 225      * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 226      * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 227      * Both skb->head and skb_shared_info are cache line aligned.
 228      */
 229     size = SKB_DATA_ALIGN(size);  // size of the data area after alignment
 230     size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));  
// total size once the aligned skb_shared_info is added
 231     data = kmalloc_reserve(size, gfp_mask, node, 
&pfmemalloc);  // allocate the data area (separate from the sk_buff itself)
 232     if (!data)
 233         goto nodata;
 234     /* kmalloc(size) might give us more room than requested.
 235      * Put skb_shared_info exactly at the end of allocated zone,
 236      * to allow max possible filling before reallocation.
 237      */
 238     size = SKB_WITH_OVERHEAD(ksize(data));
 239     prefetchw(data + size);
 240
 241     /*
 242      * Only clear those fields we need to clear, not those that we will
 243      * actually initialise below. Hence, don't put any more fields after
 244      * the tail pointer in struct sk_buff!
 245      */
 246     memset(skb, 0, offsetof(struct sk_buff, tail));
 247     /* Account for allocated memory : skb + skb->head */
 248     skb->truesize = SKB_TRUESIZE(size);
 249     skb->pfmemalloc = pfmemalloc;
 250     atomic_set(&skb->users, 1);
 251     skb->head = data;
 252     skb->data = data;
 253     skb_reset_tail_pointer(skb);
 254     skb->end = skb->tail + size;  // skb_shared_info is placed at skb->end
 255     skb->mac_header = (typeof(skb->mac_header))~0U;  // all-ones value; presumably marks the header as "not set" - confirm
 256     skb->transport_header = 
(typeof(skb->transport_header))~0U;
258     /* make sure we initialize shinfo sequentially */
 259     shinfo = skb_shinfo(skb);
 260     memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 261     atomic_set(&shinfo->dataref, 1);
 262     kmemcheck_annotate_variable(shinfo->destructor_arg);
 263
 264     if (flags & SKB_ALLOC_FCLONE) {
 265         struct sk_buff_fclones *fclones;
 266
 267         fclones = container_of(skb, struct sk_buff_fclones, skb1);  // skb is the skb1 member of an sk_buff_fclones
 268
 269         kmemcheck_annotate_bitfield(&fclones->skb2, 
flags1);
 270         skb->fclone = SKB_FCLONE_ORIG;
 271         atomic_set(&fclones->fclone_ref, 1);
 272
 273         fclones->skb2.fclone = SKB_FCLONE_CLONE;  // the companion skb2 is marked as the clone
 274     }
 275 out:
 276     return skb;
 277 nodata:
 278     kmem_cache_free(cache, skb);  // data allocation failed: give the sk_buff back to its cache
 279     skb = NULL;
 280     goto out;
 281 }
 282 EXPORT_SYMBOL(__alloc_skb);




發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章