Open vSwitch study: datapath

struct vport is OVS's per-port device structure; personally I find it very similar to the kernel's struct net_device.

/**
 * struct vport - one port within a datapath
 * @rcu: RCU callback head for deferred destruction.
 * @port_no: Index into @dp's @ports array.
 * @dp: Datapath to which this port belongs.
 * @kobj: Represents /sys/class/net/<devname>/brport.
 * @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
 * &struct vport.  (We keep this around so that we can delete it if the
 * device gets renamed.)  Set to the null string when no link exists.
 * @node: Element in @dp's @port_list.
 * @upcall_pid: The Netlink port to use for packets received on this port that
 * miss the flow table.
 * @hash_node: Element in @dev_table hash table in vport.c.
 * @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
 * @ops: Class structure.
 * @percpu_stats: Points to per-CPU statistics used and maintained by vport
 * @stats_lock: Protects @err_stats and @offset_stats.
 * @err_stats: Points to error statistics used and maintained by vport
 * @offset_stats: Added to actual statistics as a sop to compatibility with
 * XAPI for Citrix XenServer.  Deprecated.
 */ 
struct vport {
    struct rcu_head rcu;
    u16 port_no;
    struct datapath *dp;
    struct kobject kobj;
    char linkname[IFNAMSIZ];
    struct list_head node;
    u32 upcall_pid;

    struct hlist_node hash_node;
    struct hlist_node dp_hash_node;
    const struct vport_ops *ops;

    struct vport_percpu_stats __percpu *percpu_stats;

    spinlock_t stats_lock;
    struct vport_err_stats err_stats;
    struct ovs_vport_stats offset_stats;
};


Closely related to vport is struct datapath:

/**
 * struct datapath - datapath for flow-based packet switching
 * @rcu: RCU callback head for deferred destruction.
 * @list_node: Element in global 'dps' list.
 * @ifobj: Represents /sys/class/net/<devname>/brif.  Protected by RTNL.
 * @n_flows: Number of flows currently in flow table.
 * @table: Current flow table.  Protected by genl_lock and RCU.
 * @ports: Hash table for ports.  %OVSP_LOCAL port always exists.  Protected by
 * RTNL and RCU.
 * @stats_percpu: Per-CPU datapath statistics.
 * @net: Reference to net namespace.
 *
 * Context: See the comment on locking at the top of datapath.c for additional
 * locking information.
 */
struct datapath {
    struct rcu_head rcu;
    struct list_head list_node;
    struct kobject ifobj;

    /* Flow table. */
    struct flow_table __rcu *table;

    /* Switch ports. */
    struct hlist_head *ports;

    /* Stats. */
    struct dp_stats_percpu __percpu *stats_percpu;

#ifdef CONFIG_NET_NS
    /* Network namespace ref. */
    struct net *net;
#endif
};


My understanding is that both vport and datapath are virtual devices used by OVS. A datapath contains multiple vports; they are linked together through the datapath->ports hash table and each vport's dp_hash_node, and vport->dp points back to the datapath the vport belongs to.

A datapath also holds a flow_table.
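
As a concrete illustration, looking up a vport by port number over dp->ports looks roughly like the sketch below. It mirrors the lookup helper in datapath.c; the bucket mask macro DP_VPORT_HASH_BUCKETS and the exact hlist_for_each_entry_rcu signature depend on the OVS/kernel version, so treat this as a sketch rather than the exact source.

/* Sketch of a port_no -> vport lookup over dp->ports.
 * Must be called with rcu_read_lock or RTNL held. */
static struct vport *lookup_vport_sketch(const struct datapath *dp, u16 port_no)
{
    struct hlist_head *head = &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)];
    struct vport *vport;
    struct hlist_node *n;

    hlist_for_each_entry_rcu(vport, n, head, dp_hash_node)
        if (vport->port_no == port_no)
            return vport;

    return NULL;
}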


struct ovs_skb_cb serves as the sk_buff's private data, stored in skb->cb:

/**
 * struct ovs_skb_cb - OVS data in skb CB
 * @flow: The flow associated with this packet.  May be %NULL if no flow.
 * @tun_id: ID of the tunnel that encapsulated this packet.  It is 0 if the
 * packet was not received on a tunnel.
 * @ip_summed: Consistently stores L4 checksumming status across different
 * kernel versions.
 * @csum_start: Stores the offset from which to start checksumming independent
 * of the transport header on all kernel versions.
 * @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
 * before 2.6.27.
 */
struct ovs_skb_cb {
    struct sw_flow      *flow;
    __be64          tun_id;
#ifdef NEED_CSUM_NORMALIZE
    enum csum_type      ip_summed;
    u16         csum_start;
#endif
#ifdef NEED_VLAN_FIELD
    u16         vlan_tci;
#endif
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)

vlan_tci holds the 802.1Q Tag Control Information (TCI): a 3-bit Priority Code Point (aka CoS), a 1-bit Drop Eligible / CFI bit, and a 12-bit VLAN ID.

tun_id is the ID of the OVS tunnel that carried this skb (0 if the packet did not arrive on a tunnel).

flow is the flow this skb belongs to, an OpenFlow concept: the flow's key (struct sw_flow_key) uniquely identifies a flow, and its actions (struct sw_flow_actions) record what to do once a packet matches the flow.
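
A minimal, purely illustrative example of the CB macro (not taken from the OVS source): per-packet state is stashed in skb->cb on the receive path and read back later on the same path.

/* Illustration only; the real assignments happen in ovs_vport_receive()
 * and ovs_dp_process_received_packet(), shown further below. */
static void cb_example(struct sk_buff *skb, struct sw_flow *flow)
{
    OVS_CB(skb)->flow = flow;   /* remember the matched flow */
    OVS_CB(skb)->tun_id = 0;    /* the packet did not arrive on a tunnel */
}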


struct vport_ops defines the behaviour of a vport:

/**
 * struct vport_ops - definition of a type of virtual port
 *
 * @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
 * @flags: Flags of type VPORT_F_* that influence how the generic vport layer
 * handles this vport.
 * @init: Called at module initialization.  If VPORT_F_REQUIRED is set then the
 * failure of this function will cause the module to not load.  If the flag is
 * not set and initialization fails then no vports of this type can be created.
 * @exit: Called at module unload.
 * @create: Create a new vport configured as specified.  On success returns
 * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
 * @destroy: Destroys a vport.  Must call vport_free() on the vport but not
 * before an RCU grace period has elapsed.
 * @set_options: Modify the configuration of an existing vport.  May be %NULL
 * if modification is not supported.
 * @get_options: Appends vport-specific attributes for the configuration of an
 * existing vport to a &struct sk_buff.  May be %NULL for a vport that does not
 * have any configuration.
 * @set_addr: Set the device's MAC address.  May be null if not supported.
 * @get_name: Get the device's name.
 * @get_addr: Get the device's MAC address.
 * @get_config: Get the device's configuration.
 * @get_kobj: Get the kobj associated with the device (may return null).
 * @get_dev_flags: Get the device's flags.
 * @is_running: Checks whether the device is running.
 * @get_operstate: Get the device's operating state.
 * @get_ifindex: Get the system interface index associated with the device.
 * May be null if the device does not have an ifindex.
 * @get_mtu: Get the device's MTU.  May be %NULL if the device does not have an
 * MTU (as e.g. some tunnels do not).  Must be implemented if @get_ifindex is
 * implemented.
 * @send: Send a packet on the device.  Returns the length of the packet sent.
 */               

struct vport_ops {
    enum ovs_vport_type type;
    u32 flags;

    /* Called at module init and exit respectively. */
    int (*init)(void);
    void (*exit)(void);

    /* Called with RTNL lock. */
    struct vport *(*create)(const struct vport_parms *);
    void (*destroy)(struct vport *);

    int (*set_options)(struct vport *, struct nlattr *);
    int (*get_options)(const struct vport *, struct sk_buff *);

    int (*set_addr)(struct vport *, const unsigned char *);

    /* Called with rcu_read_lock or RTNL lock. */
    const char *(*get_name)(const struct vport *);
    const unsigned char *(*get_addr)(const struct vport *);
    void (*get_config)(const struct vport *, void *);
    struct kobject *(*get_kobj)(const struct vport *);

    unsigned (*get_dev_flags)(const struct vport *);
    int (*is_running)(const struct vport *);
    unsigned char (*get_operstate)(const struct vport *);

    int (*get_ifindex)(const struct vport *);

    int (*get_mtu)(const struct vport *);

    int (*send)(struct vport *, struct sk_buff *);
};


struct vport carries a private data area, a linear region that immediately follows the vport itself in memory; it is accessed with vport_priv() and vport_from_priv().
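
The two accessors in vport.h are plain pointer arithmetic over that contiguous allocation (quoted roughly from memory; VPORT_ALIGN pads struct vport so the private area is suitably aligned):

static inline void *vport_priv(const struct vport *vport)
{
    return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN);
}

static inline struct vport *vport_from_priv(const void *priv)
{
    return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}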

struct vport is really a base class. In practice there are netdev, internal, patch and gre vports, with the corresponding vport_ops being ovs_netdev_vport_ops, ovs_internal_vport_ops, ovs_patch_vport_ops and ovs_gre_vport_ops.
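
The concrete implementations are collected in a static table in vport.c, roughly as follows (sketched from memory; the exact table name, ordering and #ifdef guards differ between versions):

/* List of statically compiled vport implementations. */
static const struct vport_ops *base_vport_ops_list[] = {
    &ovs_netdev_vport_ops,
    &ovs_internal_vport_ops,
    &ovs_patch_vport_ops,
#if defined(CONFIG_NET_IPGRE) || defined(CONFIG_NET_IPGRE_MODULE)
    &ovs_gre_vport_ops,
#endif
};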


Next we follow the vport-netdev device. For an arbitrary net_device to become an OVS vport, the OVS receive function has to be hooked into the net_device's packet-receive path, so that packets arriving on that net_device no longer go through the regular kernel protocol stack but are taken over by OVS.
The netdev_vport structure is simply a wrapper around a struct net_device pointer:

struct netdev_vport {
    struct net_device *dev;
};

Let us look at the netdev_create function:

static struct vport *netdev_create(const struct vport_parms *parms)
{   
    struct vport *vport;
    struct netdev_vport *netdev_vport;
    int err;
    
    vport = ovs_vport_alloc(sizeof(struct netdev_vport),
                &ovs_netdev_vport_ops, parms);
    if (IS_ERR(vport)) {
        err = PTR_ERR(vport);
        goto error;
    }
    
    netdev_vport = netdev_vport_priv(vport);
        
    netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
    if (!netdev_vport->dev) {
        err = -ENODEV;
        goto error_free_vport;
    }

    if (netdev_vport->dev->flags & IFF_LOOPBACK ||
        netdev_vport->dev->type != ARPHRD_ETHER ||
        ovs_is_internal_dev(netdev_vport->dev)) {
        err = -EINVAL;
        goto error_put;
    }

    err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
                     vport);
    if (err)
        goto error_put;

    dev_set_promiscuity(netdev_vport->dev, 1);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
    dev_disable_lro(netdev_vport->dev);
#endif
    netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

    return vport;

error_put:
    dev_put(netdev_vport->dev);
error_free_vport:
    ovs_vport_free(vport);
error:
    return ERR_PTR(err);
}

It first calls ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, parms) to create a vport structure followed by its private data, i.e. a netdev_vport structure; both live in one contiguous chunk of memory.
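
The layout described above comes from the allocation inside ovs_vport_alloc(); sketched from memory, with error handling and the remaining field initialisation trimmed, it looks roughly like this:

struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
                              const struct vport_parms *parms)
{
    struct vport *vport;
    size_t alloc_size;

    alloc_size = sizeof(struct vport);
    if (priv_size) {
        alloc_size = ALIGN(alloc_size, VPORT_ALIGN);
        alloc_size += priv_size;        /* private data follows the vport */
    }

    vport = kzalloc(alloc_size, GFP_KERNEL);
    if (!vport)
        return ERR_PTR(-ENOMEM);

    vport->dp = parms->dp;
    vport->port_no = parms->port_no;
    vport->upcall_pid = parms->upcall_pid;
    vport->ops = ops;
    /* ... per-CPU stats allocation, kobject init etc. omitted ... */

    return vport;
}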

Next it calls the kernel function dev_get_by_name, which looks up the net_device with the given name through net_device->name_hlist (all net_devices are hashed by name into a global table via name_hlist).

If the net_device is a loopback device, is not an Ethernet interface, or is itself an internal vport (an internal vport hands packets straight to the kernel and can be thought of as a virtual NIC that connects the kernel to OVS), an error is returned.

Then netdev_rx_handler_register is called to set net_device->br_port to this vport, so the net_device is effectively attached to the bridge. Finally dev_set_promiscuity enables promiscuous mode and we are done.


Note that kernels from 2.6.36 onwards use a dedicated rx_handler function pointer, replacing the br_handle_frame_hook of earlier kernels, to receive frames handed to the bridge. I am running Red Hat's 2.6.32 kernel, so the rx_handler = netdev_frame_hook assignment inside netdev_rx_handler_register is skipped; instead, br_handle_frame_hook is initialised in netdev_init:

static int netdev_init(void)
{   
    /* Hook into callback used by the bridge to intercept packets.
     * Parasites we are. */
    br_handle_frame_hook = netdev_frame_hook;
    
    return 0;
}


netdev_frame_hook in turn calls netdev_port_receive.
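
Its signature differs by kernel version. On my 2.6.32 kernel it receives the (struct net_bridge_port *, struct sk_buff *) pair from br_handle_frame_hook, with the bridge-port pointer actually carrying the vport; on kernels that have the rx_handler API it looks roughly like the sketch below (the in-tree version also handles a few version-specific details):

static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
    struct sk_buff *skb = *pskb;
    struct vport *vport;

    if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
        return RX_HANDLER_PASS;

    vport = ovs_netdev_get_vport(skb->dev);  /* the vport registered for this dev */
    netdev_port_receive(vport, skb);

    return RX_HANDLER_CONSUMED;
}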

/* Must be called with rcu_read_lock. */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return;
    }                

    /* Make our own copy of the packet.  Otherwise we will mangle the
     * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
     * (No one comes after us, since we tell handle_bridge() that we took
     * the packet.) */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (unlikely(!skb))
        return;

    skb_push(skb, ETH_HLEN);

    if (unlikely(compute_ip_summed(skb, false))) {
        kfree_skb(skb);
        return;
    }
    vlan_copy_skb_tci(skb);

    ovs_vport_receive(vport, skb);
}

As we can see, netdev_port_receive essentially just calls ovs_vport_receive:

void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
    struct vport_percpu_stats *stats;

    stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

    u64_stats_update_begin(&stats->sync);
    stats->rx_packets++;
    stats->rx_bytes += skb->len;
    u64_stats_update_end(&stats->sync);
        
    if (!(vport->ops->flags & VPORT_F_FLOW))
        OVS_CB(skb)->flow = NULL;

    if (!(vport->ops->flags & VPORT_F_TUN_ID))
        OVS_CB(skb)->tun_id = 0;

    ovs_dp_process_received_packet(vport, skb);
}

ovs_vport_receive updates the per-CPU receive statistics, initialises OVS_CB(skb)->flow and OVS_CB(skb)->tun_id as appropriate, and then calls ovs_dp_process_received_packet:


/* Must be called with rcu_read_lock. */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{   
    struct datapath *dp = p->dp;
    struct sw_flow *flow;
    struct dp_stats_percpu *stats;
    u64 *stats_counter;
    int error;

    stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

    if (!OVS_CB(skb)->flow) {
        struct sw_flow_key key;
        int key_len;

        /* Extract flow from 'skb' into 'key'. */
        error = ovs_flow_extract(skb, p->port_no, &key, &key_len);
        if (unlikely(error)) {
            kfree_skb(skb);
            return;
        }

        /* Look up flow. */
        flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table),
                       &key, key_len);
        if (unlikely(!flow)) {
            struct dp_upcall_info upcall;

            upcall.cmd = OVS_PACKET_CMD_MISS;
            upcall.key = &key;
            upcall.userdata = NULL;
            upcall.pid = p->upcall_pid;
            ovs_dp_upcall(dp, skb, &upcall);
            consume_skb(skb);
            stats_counter = &stats->n_missed;
            goto out;
        }

        OVS_CB(skb)->flow = flow;
    }

    stats_counter = &stats->n_hit;

    ovs_flow_used(OVS_CB(skb)->flow, skb);
    ovs_execute_actions(dp, skb);

out:
    /* Update datapath statistics. */
    u64_stats_update_begin(&stats->sync);
    (*stats_counter)++;
    u64_stats_update_end(&stats->sync);
}

A large part of this function deals with the flow the skb belongs to. OVS follows the OpenFlow model, so a big difference between OVS and the Linux bridge is that OVS processes each skb in terms of flows. The function first calls ovs_flow_extract to compute a key from the skb, then ovs_flow_tbl_lookup to look the flow up in the datapath's flow table; on a miss the packet is handed up to userspace via ovs_dp_upcall with OVS_PACKET_CMD_MISS, otherwise ovs_execute_actions is run on the matched flow.

ovs_execute_actions calls do_execute_actions which, in the common case, determines an out_port and calls do_output to send the skb out of that port; the actual transmit function is ovs_vport_send.
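
The output step at the end of that chain is small; a simplified sketch of do_output() in datapath/actions.c (from memory, details differ across versions):

static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
{
    struct vport *vport;

    if (unlikely(!skb))
        return;

    /* port_no lookup over dp->ports (ovs_lookup_vport, or an RCU wrapper
     * around it, depending on the version) */
    vport = ovs_lookup_vport(dp, out_port);
    if (unlikely(!vport)) {
        kfree_skb(skb);
        return;
    }

    ovs_vport_send(vport, skb);
}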


ovs_vport_send:

/** 
 *  ovs_vport_send - send a packet on a device
 *      
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *  
 * Sends the given packet and returns the length of data sent.  Either RTNL
 * lock or rcu_read_lock must be held.
 */ 
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{   
    int sent = vport->ops->send(vport, skb);

    if (likely(sent)) {
        struct vport_percpu_stats *stats;
                
        stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
    
        u64_stats_update_begin(&stats->sync);
        stats->tx_packets++;
        stats->tx_bytes += sent;
        u64_stats_update_end(&stats->sync);
    }
    return sent;
}

Functions like ovs_vport_init, ovs_vport_exit, ovs_vport_destroy and the various ovs_vport_set_xxxx / ovs_vport_get_xxxx helpers are not covered one by one here; please read the source in datapath/vport.c.


Finally, let us look at the netdev-specific side of vport-netdev. As noted, vport is just a base class whose behaviour varies with the underlying device, and the netdev vport is probably the most common case. We have already walked through the receive path; on the transmit path, vport->ops->send ends up in the netdev_send function:

static int netdev_send(struct vport *vport, struct sk_buff *skb)
{           
    struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
    int mtu = netdev_vport->dev->mtu;
    int len;
            
    if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
        if (net_ratelimit())
            pr_warn("%s: dropped over-mtu packet: %d > %d\n",
                ovs_dp_name(vport->dp), packet_length(skb), mtu);
        goto error;
    }  

This code drops the packet outright if packet_length(skb) exceeds the device MTU while the skb is not a GSO packet.
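
Note that packet_length() here is not skb->len itself; it subtracts the Ethernet header (and a VLAN header if the frame carries one), roughly as follows (quoted from memory from vport-netdev.c):

static unsigned int packet_length(const struct sk_buff *skb)
{
    unsigned int length = skb->len - ETH_HLEN;

    if (skb->protocol == htons(ETH_P_8021Q))
        length -= VLAN_HLEN;

    return length;
}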

The large block that follows deals with VLANs; if no VLAN tag is involved, the skb is simply sent out with dev_queue_xmit.

    skb->dev = netdev_vport->dev;
    forward_ip_summed(skb, true);
                
    if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
        int features; 
            
        features = netif_skb_features(skb);
            
        if (!vlan_tso)
            features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
                      NETIF_F_UFO | NETIF_F_FSO);
        
        if (netif_needs_gso(skb, features)) {
            struct sk_buff *nskb;
        
            nskb = skb_gso_segment(skb, features);
            if (!nskb) {
                if (unlikely(skb_cloned(skb) &&
                    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
                    kfree_skb(skb);
                    return 0;
                }

                skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY;
                goto tag;
            }

            if (IS_ERR(nskb)) {
                kfree_skb(skb);
                return 0;
            }
            consume_skb(skb);
            skb = nskb;

            len = 0;

            do {
                nskb = skb->next;
                skb->next = NULL;

                skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                if (likely(skb)) {
                    len += skb->len;
                    vlan_set_tci(skb, 0);
                    dev_queue_xmit(skb);
                }

                skb = nskb;
            } while (skb);

            return len;
        }

If the device cannot do TSO/GSO in hardware, skb_gso_segment is called to segment the packet in the kernel. On success it returns a list of skbs; each skb on the list gets the VLAN tag re-inserted and is then sent out with dev_queue_xmit.

tag:
        skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
        if (unlikely(!skb))
            return 0;
        vlan_set_tci(skb, 0);
    }

The VLAN-independent path below simply sends the skb with dev_queue_xmit:

    len = skb->len;
    dev_queue_xmit(skb);

    return len;

error:
    kfree_skb(skb);
    ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
    return 0;
}


For more details, developers can refer to the datapath/README document.



