struct vport是OVS的設備結構,個人認爲非常類似於kernel裏的netdev結構
/**
* struct vport - one port within a datapath
*
* Generic, type-independent state of a port. Type-specific behaviour is
* reached through @ops.
*
* @rcu: RCU callback head for deferred destruction.
* @port_no: Number of this port within @dp.
* @dp: Datapath to which this port belongs.
* @kobj: Represents /sys/class/net/<devname>/brport.
* @linkname: The name of the link from /sys/class/net/<datapath>/brif to this
* &struct vport. (We keep this around so that we can delete it if the
* device gets renamed.) Set to the null string when no link exists.
* @node: Element in @dp's @port_list.
* NOTE(review): the struct datapath definition shown in this file has no
* port_list member - confirm whether this field is still used.
* @upcall_pid: The Netlink port to use for packets received on this port that
* miss the flow table.
* @hash_node: Element in @dev_table hash table in vport.c.
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure (per-type operations, see &struct vport_ops).
* @percpu_stats: Points to per-CPU statistics used and maintained by vport;
* ovs_vport_receive() and ovs_vport_send() update the rx/tx packet and byte
* counters here.
* @stats_lock: Protects @err_stats and @offset_stats.
* @err_stats: Points to error statistics used and maintained by vport
* @offset_stats: Added to actual statistics as a sop to compatibility with
* XAPI for Citrix XenServer. Deprecated.
*/
struct vport {
struct rcu_head rcu;
u16 port_no;
struct datapath *dp;
struct kobject kobj;
char linkname[IFNAMSIZ];
struct list_head node;
u32 upcall_pid;
struct hlist_node hash_node;
struct hlist_node dp_hash_node;
const struct vport_ops *ops;
struct vport_percpu_stats __percpu *percpu_stats;
spinlock_t stats_lock;
struct vport_err_stats err_stats;
struct ovs_vport_stats offset_stats;
};
和vport緊密相關的是struct datapath
/**
* struct datapath - datapath for flow-based packet switching
* @rcu: RCU callback head for deferred destruction.
* @list_node: Element in global 'dps' list.
* @ifobj: Represents /sys/class/net/<devname>/brif. Protected by RTNL.
* @table: Current flow table. Protected by genl_lock and RCU.
* @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by
* RTNL and RCU.
* @stats_percpu: Per-CPU datapath statistics.
* @net: Reference to net namespace. Only present when CONFIG_NET_NS is set.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
*/
struct datapath {
struct rcu_head rcu;
struct list_head list_node;
struct kobject ifobj;
/* Flow table. */
struct flow_table __rcu *table;
/* Switch ports. */
struct hlist_head *ports;
/* Stats. */
struct dp_stats_percpu __percpu *stats_percpu;
#ifdef CONFIG_NET_NS
/* Network namespace ref. */
struct net *net;
#endif
};
我的理解是,無論vport還是datapath都是OVS用的虛擬設備,datapath中包含了多個vport,通過datapath->ports, vport->dp_hash_node的哈希表關聯起來, vport->dp指向vport屬於的datapath
datapath同時包含了一個flow_table
struct ovs_skb_cb用來作爲sk_buff的私有結構,
/**
* struct ovs_skb_cb - OVS data in skb CB
* @flow: The flow associated with this packet. May be %NULL if no flow.
* @tun_id: ID of the tunnel that encapsulated this packet. It is 0 if the
* packet was not received on a tunnel.
* @ip_summed: Consistently stores L4 checksumming status across different
* kernel versions. Only present when NEED_CSUM_NORMALIZE is defined.
* @csum_start: Stores the offset from which to start checksumming independent
* of the transport header on all kernel versions. Only present when
* NEED_CSUM_NORMALIZE is defined.
* @vlan_tci: Provides a substitute for the skb->vlan_tci field on kernels
* before 2.6.27. Only present when NEED_VLAN_FIELD is defined.
*/
struct ovs_skb_cb {
struct sw_flow *flow;
__be64 tun_id;
#ifdef NEED_CSUM_NORMALIZE
enum csum_type ip_summed;
u16 csum_start;
#endif
#ifdef NEED_VLAN_FIELD
u16 vlan_tci;
#endif
};
/* View an skb's control buffer (skb->cb) as a struct ovs_skb_cb. */
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
vlan_tci表示了802.1q的Tag Control Identifier,包括3bits 的Priority Code Point ( aka. CoS ),1bit 的Drop Eligible,和12bits的VLAN ID
tun_id表示了這個skb在OVS中的tunnel ID
flow表示了skb所屬於的流,這是個openflow的概念:struct sw_flow中的struct sw_flow_key用於唯一標識一個流,struct sw_flow_actions用於記錄流match了之後的行爲
struct vport_ops定義了vport的行爲,
/**
* struct vport_ops - definition of a type of virtual port
*
* @type: %OVS_VPORT_TYPE_* value for this type of virtual port.
* @flags: Flags of type VPORT_F_* that influence how the generic vport layer
* handles this vport.
* @init: Called at module initialization. If VPORT_F_REQUIRED is set then the
* failure of this function will cause the module to not load. If the flag is
* not set and initialization fails then no vports of this type can be created.
* @exit: Called at module unload.
* @create: Create a new vport configured as specified. On success returns
* a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value.
* @destroy: Destroys a vport. Must call ovs_vport_free() on the vport but not
* before an RCU grace period has elapsed.
* @set_options: Modify the configuration of an existing vport. May be %NULL
* if modification is not supported.
* @get_options: Appends vport-specific attributes for the configuration of an
* existing vport to a &struct sk_buff. May be %NULL for a vport that does not
* have any configuration.
* @set_addr: Set the device's MAC address. May be null if not supported.
* @get_name: Get the device's name.
* @get_addr: Get the device's MAC address.
* @get_config: Get the device's configuration.
* @get_kobj: Get the kobj associated with the device (may return null).
* @get_dev_flags: Get the device's flags.
* @is_running: Checks whether the device is running.
* @get_operstate: Get the device's operating state.
* @get_ifindex: Get the system interface index associated with the device.
* May be null if the device does not have an ifindex.
* @get_mtu: Get the device's MTU. May be %NULL if the device does not have an
* MTU (as e.g. some tunnels do not). Must be implemented if @get_ifindex is
* implemented.
* @send: Send a packet on the device. Returns the length of the packet sent;
* ovs_vport_send() treats a return value of zero as "nothing was sent" and
* does not update the transmit statistics in that case.
*/
struct vport_ops {
enum ovs_vport_type type;
u32 flags;
/* Called at module init and exit respectively. */
int (*init)(void);
void (*exit)(void);
/* Called with RTNL lock. */
struct vport *(*create)(const struct vport_parms *);
void (*destroy)(struct vport *);
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
int (*set_addr)(struct vport *, const unsigned char *);
/* Called with rcu_read_lock or RTNL lock. */
const char *(*get_name)(const struct vport *);
const unsigned char *(*get_addr)(const struct vport *);
void (*get_config)(const struct vport *, void *);
struct kobject *(*get_kobj)(const struct vport *);
unsigned (*get_dev_flags)(const struct vport *);
int (*is_running)(const struct vport *);
unsigned char (*get_operstate)(const struct vport *);
int (*get_ifindex)(const struct vport *);
int (*get_mtu)(const struct vport *);
int (*send)(struct vport *, struct sk_buff *);
};
struct vport 有private data 的數據部分,是緊跟在vport後面的一段線性數據空間,可以通過vport_priv,vport_from_priv來操作
struct vport 其實是個基類,實際應用時會有netdev_vport, internal_vport, patch_vport, gre_vport等,相應的vport_ops爲ovs_netdev_vport_ops, ovs_internal_vport_ops, ovs_patch_vport_ops, ovs_gre_vport_ops
下面跟進vport-netdev設備,對於任意的net_device設備,如果要成爲OVS的vport,需要把OVS的接收函數hook到net_device的包接收函數中,這樣net_device的進包就不會進入常規的內核協議棧中,而是由OVS接過來處理
netdev_vport結構就是一個struct net_device*的封裝
/*
 * Private data of a netdev-backed vport: just the wrapped network device.
 * netdev_create() fills @dev from dev_get_by_name(), so the vport holds a
 * reference on the device (released with dev_put() on the error path there).
 */
struct netdev_vport {
struct net_device *dev;
};
我們來看netdev_create函數:
/**
 * netdev_create - attach an existing network device to a datapath as a vport
 * @parms: Creation parameters; @parms->name is the name of the device to
 * attach.
 *
 * Allocates a vport with a &struct netdev_vport private area, looks the
 * device up by name in the namespace returned by ovs_dp_get_net(), and
 * registers netdev_frame_hook() as its receive handler with the vport as the
 * handler's private data. The device is then put into promiscuous mode, LRO
 * is disabled on kernels that support it, and the device is flagged with
 * IFF_OVS_DATAPATH.
 *
 * Returns the new vport on success. On failure returns an ERR_PTR():
 * the error from ovs_vport_alloc(), -ENODEV if no device has that name,
 * -EINVAL if the device is a loopback device, is not Ethernet, or is an OVS
 * internal device, or the error from netdev_rx_handler_register().
 */
static struct vport *netdev_create(const struct vport_parms *parms)
{
	struct vport *vport;
	struct netdev_vport *netdev_vport;
	int err;

	vport = ovs_vport_alloc(sizeof(struct netdev_vport),
				&ovs_netdev_vport_ops, parms);
	if (IS_ERR(vport)) {
		err = PTR_ERR(vport);
		goto error;
	}

	netdev_vport = netdev_vport_priv(vport);

	/* Takes a reference on the device; dropped at error_put on failure. */
	netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
	if (!netdev_vport->dev) {
		err = -ENODEV;
		goto error_free_vport;
	}

	if (netdev_vport->dev->flags & IFF_LOOPBACK ||
	    netdev_vport->dev->type != ARPHRD_ETHER ||
	    ovs_is_internal_dev(netdev_vport->dev)) {
		err = -EINVAL;
		goto error_put;
	}

	err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
					 vport);
	if (err)
		goto error_put;

	dev_set_promiscuity(netdev_vport->dev, 1);
	/*
	 * LRO must be turned off on a device whose packets are forwarded.
	 * LRO only exists from 2.6.24 on, so the call is compiled in for
	 * those kernels (the guard was previously inverted, which skipped
	 * the call on every kernel that actually has LRO).
	 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
	dev_disable_lro(netdev_vport->dev);
#endif
	netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;

	return vport;

error_put:
	dev_put(netdev_vport->dev);
error_free_vport:
	ovs_vport_free(vport);
error:
	return ERR_PTR(err);
}
首先調用 ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, vport_parms* parms) 創建一個vport結構,後面跟一個vport_private,即一個netdev_vport結構,這兩個結構都在一段連續線性內存中。
接着調用內核函數dev_get_by_name,該函數基於net_device->name_hlist(這裏全局的net_device都基於name_hlist形成一個hash表),通過name查找到對應的net_device
如果發現net_device是loopback, 或不是以太網接口,或是internal vport(internal vport表示這個vport可以把包直接交給內核,可以認爲是內核連到OVS的一個虛擬網口),報錯返回
調用netdev_rx_handler_register把netdev_frame_hook註冊爲該net_device的接收處理函數,並把vport作爲其私有數據(在舊內核的兼容實現裏,對應的是把net_device->br_port 設置爲 vport),這樣net_device 算是連到bridge上了;接着調用dev_set_promiscuity打開混雜模式,最後給設備打上IFF_OVS_DATAPATH標記後返回
這裏要注意的是,2.6.36之後的內核,專門用了一個rx_handler函數指針,用來替代之前內核版本中的br_handle_frame_hook,用來接收傳到bridge上的包,本人用的是redhat發佈的2.6.32的內核版本,因此netdev_rx_handler_register裏rx_handler = netdev_frame_hook這步被省去了,取而代之的是在netdev_init中進行了br_handle_frame_hook的初始化
/*
 * Module-init hook for the netdev vport type: installs netdev_frame_hook as
 * the global br_handle_frame_hook. Always returns 0.
 */
static int netdev_init(void)
{
/* Hook into callback used by the bridge to intercept packets.
* Parasites we are. */
br_handle_frame_hook = netdev_frame_hook;
return 0;
}
netdev_frame_hook實際上是調用了netdev_port_receive:
/*
 * Hand a frame received on a netdev-backed port to the generic vport layer.
 * A frame that arrives without a vport, or whose checksum state cannot be
 * normalised, is freed.
 *
 * Must be called with rcu_read_lock.
 */
static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
{
	if (unlikely(!vport))
		goto drop;

	/*
	 * Work on a private copy: modifying a shared skb would corrupt it for
	 * earlier consumers (e.g. tcpdump via AF_PACKET). Nobody comes after
	 * us, because handle_bridge() is told that we took the packet.
	 */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (unlikely(!skb))
		return;

	skb_push(skb, ETH_HLEN);

	if (unlikely(compute_ip_summed(skb, false)))
		goto drop;

	vlan_copy_skb_tci(skb);
	ovs_vport_receive(vport, skb);
	return;

drop:
	kfree_skb(skb);
}
可以看出netdev_port_receive實際是調用了ovs_vport_receive:
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
{
struct vport_percpu_stats *stats;
stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());
u64_stats_update_begin(&stats->sync);
stats->rx_packets++;
stats->rx_bytes += skb->len;
u64_stats_update_end(&stats->sync);
if (!(vport->ops->flags & VPORT_F_FLOW))
OVS_CB(skb)->flow = NULL;
if (!(vport->ops->flags & VPORT_F_TUN_ID))
OVS_CB(skb)->tun_id = 0;
ovs_dp_process_received_packet(vport, skb);
}
ovs_vport_receive實際上更新了統計信息,並且在vport類型沒有設置VPORT_F_TUN_ID / VPORT_F_FLOW標誌時,把ovs_skb_cb->tun_id清零、把ovs_skb_cb->flow置爲NULL,接着調用ovs_dp_process_received_packet
/*
 * ovs_dp_process_received_packet - classify a packet received on port @p and
 * act on it.
 *
 * If the skb carries no flow yet, a key is extracted from it and looked up in
 * the datapath's flow table. A packet whose key cannot be extracted is freed.
 * On a table miss the packet is sent up via ovs_dp_upcall() with
 * OVS_PACKET_CMD_MISS and counted in n_missed; otherwise the flow is marked
 * used, its actions are executed, and n_hit is incremented.
 *
 * Must be called with rcu_read_lock.
 */
void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb)
{
	struct datapath *dp = p->dp;
	struct dp_stats_percpu *cpu_stats;
	struct sw_flow *flow;
	u64 *counter;

	cpu_stats = per_cpu_ptr(dp->stats_percpu, smp_processor_id());

	flow = OVS_CB(skb)->flow;
	if (!flow) {
		struct sw_flow_key key;
		int key_len;

		/* Build the lookup key from the packet and its input port. */
		if (unlikely(ovs_flow_extract(skb, p->port_no, &key, &key_len))) {
			kfree_skb(skb);
			return;
		}

		flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table),
					   &key, key_len);
		if (unlikely(!flow)) {
			/* No matching flow: report the miss via an upcall. */
			struct dp_upcall_info upcall;

			upcall.cmd = OVS_PACKET_CMD_MISS;
			upcall.key = &key;
			upcall.userdata = NULL;
			upcall.pid = p->upcall_pid;
			ovs_dp_upcall(dp, skb, &upcall);
			consume_skb(skb);

			counter = &cpu_stats->n_missed;
			goto account;
		}

		OVS_CB(skb)->flow = flow;
	}

	counter = &cpu_stats->n_hit;
	ovs_flow_used(flow, skb);
	ovs_execute_actions(dp, skb);

account:
	/* Update datapath statistics. */
	u64_stats_update_begin(&cpu_stats->sync);
	++*counter;
	u64_stats_update_end(&cpu_stats->sync);
}
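函數裏很大一部分是和skb所屬的flow相關,我們知道OVS是遵守openFlow規範的,所以OVS和bridge很大一塊不同就是OVS在處理skb的時候有一個flow的概念在裏面。如果skb還沒有關聯flow,首先會調用ovs_flow_extract基於skb算出一個key出來,之後調用ovs_flow_tbl_lookup查找這個flow;如果查不到(miss),就通過ovs_dp_upcall以OVS_PACKET_CMD_MISS把包上送給vport->upcall_pid對應的Netlink端口,並累加n_missed計數;如果查到了,則累加n_hit計數,最後調用ovs_execute_actions執行flow對應的動作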
ovs_execute_actions會調用do_execute_actions,後者一般情況下,會找出out_port出來,然後調用do_output把skb從out_port口發送出去,發送函數爲ovs_vport_send
ovs_vport_send:
/**
 * ovs_vport_send - transmit a packet through a vport
 *
 * @vport: vport on which to send the packet
 * @skb: skb to send
 *
 * Delegates to the vport type's send operation and, when that reports a
 * non-zero length, adds the packet and its length to the current CPU's tx
 * counters. Returns the length reported by the send operation. Either RTNL
 * lock or rcu_read_lock must be held.
 */
int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	struct vport_percpu_stats *cpu_stats;
	int nbytes;

	nbytes = vport->ops->send(vport, skb);
	if (unlikely(!nbytes))
		return nbytes;

	cpu_stats = per_cpu_ptr(vport->percpu_stats, smp_processor_id());

	u64_stats_update_begin(&cpu_stats->sync);
	cpu_stats->tx_bytes += nbytes;
	cpu_stats->tx_packets++;
	u64_stats_update_end(&cpu_stats->sync);

	return nbytes;
}
還有像ovs_vport_init, ovs_vport_exit, ovs_vport_destroy, ovs_vport_set_xxxx, ovs_vport_get_xxxx 這些函數就不一一介紹了,請自己閱讀datapath/vport.c的源碼
最後來看下vport-netdev的特殊實現,可以看出vport只是一個基類,而vport根據設備的不同而不同,netdev vport應該是最普遍的場景,前面我們看了接收的流程,對於發送,vport->ops->send會調用到netdev_send函數
/*
 * netdev_send - transmit @skb on the network device behind @vport.
 *
 * Returns the number of bytes handed to dev_queue_xmit() (summed over all
 * segments when the packet is segmented here), or 0 when the packet was
 * dropped. A non-GSO packet longer than the device MTU is freed and recorded
 * as VPORT_E_TX_DROPPED.
 */
static int netdev_send(struct vport *vport, struct sk_buff *skb)
{
struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
int mtu = netdev_vport->dev->mtu;
int len;
/* Over-MTU packets are only acceptable when they are GSO packets. */
if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
if (net_ratelimit())
pr_warn("%s: dropped over-mtu packet: %d > %d\n",
ovs_dp_name(vport->dp), packet_length(skb), mtu);
goto error;
}
這段代碼發現如果skb->len大過了MTU,同時skb又不允許gso,那麼直接丟棄
下面一大段是和vlan相關的操作,如果沒有vlan的話,那就直接通過dev_queue_xmit把skb發送出去
skb->dev = netdev_vport->dev;
forward_ip_summed(skb, true);
/*
 * The skb carries a VLAN tag out of band but the device cannot transmit it
 * that way (per dev_supports_vlan_tx()), so the tag has to be inserted
 * into the packet with __vlan_put_tag() before transmission.
 */
if (vlan_tx_tag_present(skb) && !dev_supports_vlan_tx(skb->dev)) {
int features;
features = netif_skb_features(skb);
/* With vlan_tso off, mask the segmentation-offload feature bits. */
if (!vlan_tso)
features &= ~(NETIF_F_TSO | NETIF_F_TSO6 |
NETIF_F_UFO | NETIF_F_FSO);
if (netif_needs_gso(skb, features)) {
struct sk_buff *nskb;
nskb = skb_gso_segment(skb, features);
/*
 * NULL (as opposed to an ERR_PTR) means no segment list was produced:
 * keep sending the original skb. If it is cloned, pskb_expand_head()
 * is called first (presumably to get a private header before
 * modifying it - confirm); its failure drops the packet. Then clear
 * SKB_GSO_DODGY and fall through to the single-packet tagging at
 * "tag".
 */
if (!nskb) {
if (unlikely(skb_cloned(skb) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) {
kfree_skb(skb);
return 0;
}
skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY;
goto tag;
}
/* Segmentation failed: drop the packet. */
if (IS_ERR(nskb)) {
kfree_skb(skb);
return 0;
}
/* Segmentation succeeded: the original skb is no longer needed. */
consume_skb(skb);
skb = nskb;
len = 0;
/*
 * Walk the segment list: unlink each segment, insert the VLAN tag and
 * queue it. A segment for which __vlan_put_tag() returns NULL is
 * skipped and not counted in len.
 */
do {
nskb = skb->next;
skb->next = NULL;
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
if (likely(skb)) {
len += skb->len;
vlan_set_tci(skb, 0);
dev_queue_xmit(skb);
}
skb = nskb;
} while (skb);
return len;
}
如果設備不支持tso/gso,那麼需要調用skb_gso_segment在內核裏進行分段,如果分段成功會返回一個skb的list,然後對list裏每一個skb,調用dev_queue_xmit發送出去
/* Single (unsegmented) packet: insert the tag, then clear the out-of-band TCI. */
tag:
skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
if (unlikely(!skb))
return 0;
vlan_set_tci(skb, 0);
}
下面是vlan無關部分,直接通過dev_queue_xmit發送skb
/* Read the length before queuing; the skb must not be touched afterwards. */
len = skb->len;
dev_queue_xmit(skb);
return len;
/* Drop path for over-MTU packets: free the skb and count the drop. */
error:
kfree_skb(skb);
ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
return 0;
}
開發者可以去參考datapath/README文檔