網絡報文到達主機後,最終會到達協議棧的netif_receive_skb函數,該函數會通過設備對象的rx_handler函數把報文交給OVS處理。 而該rx_handler函數其實就是OVS 定義的netdev_frame_hook函數,本篇內容就是從netdev_frame_hook函數開始,分析報文在datapath中的整個主處理過程。
1、netdev_frame_hook函數
該函數爲OVS與內核的橋接點,其函數簽名必須匹配內核rx_handler的定義,因此隨內核版本不同而有多個變體(見下方的條件編譯分支)
/*
 * rx_handler entry point for kernels whose handler signature takes a
 * struct sk_buff ** and reports its decision via rx_handler_result_t.
 *
 * Loopback packets are handed back to the networking stack untouched;
 * every other packet is consumed by the OVS datapath via port_receive().
 */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
	if (unlikely((*pskb)->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;	/* let the stack keep processing it */

	port_receive(*pskb);
	return RX_HANDLER_CONSUMED;	/* OVS now owns the skb */
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) || \
defined HAVE_RHEL_OVS_HOOK
/* Called with rcu_read_lock and bottom-halves disabled. */
/* rx_handler variant for >= 2.6.36 (or RHEL's backported OVS hook):
 * takes the skb directly; returning the skb means "pass back to the
 * stack", returning NULL means "consumed by OVS". */
static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
{
/* Loopback traffic is not for the datapath; give it back untouched. */
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return skb;
port_receive(skb);
return NULL; /* consumed: OVS now owns the skb */
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
/*
 * Used as br_handle_frame_hook. (Cannot run bridge at the same time, even on
 * different set of devices!)
 */
/* Called with rcu_read_lock and bottom-halves disabled. */
/* Oldest supported variant (>= 2.6.32): OVS hijacks the kernel bridge's
 * frame hook, hence the mutual exclusion with the bridge module noted
 * above. The net_bridge_port argument is unused by OVS. */
static struct sk_buff *netdev_frame_hook(struct net_bridge_port *p,
struct sk_buff *skb)
{
port_receive(skb);
return NULL; /* consumed: OVS now owns the skb */
}
#else
#error
#endif
2、port_receive函數
/*
 * port_receive(skb): feed an skb into the OVS datapath.
 * When the kernel supports metadata dst (HAVE_METADATA_DST), tunnel
 * metadata is read straight off the skb via skb_tunnel_info();
 * otherwise no tunnel info is available at this point and NULL is passed.
 */
#ifndef HAVE_METADATA_DST
#define port_receive(skb) netdev_port_receive(skb, NULL)
#else
#define port_receive(skb) netdev_port_receive(skb, skb_tunnel_info(skb)) /* skb carries tunnel metadata: the stack itself understands tunnel packets */
#endif
3、netdev_port_receive函數
/* Must be called with rcu_read_lock. */
/*
 * Common receive path below the per-kernel-version hooks: resolve the
 * receiving net_device to its OVS vport, take ownership of a private
 * copy of the skb, restore the Ethernet header, and hand the packet to
 * the vport layer. On any error the skb is freed here.
 */
void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
{
struct vport *vport;
vport = ovs_netdev_get_vport(skb->dev); /* map the netdev back to its vport; this lookup is the basis for forwarding inside the datapath */
if (unlikely(!vport))
goto error;
/* LRO-merged skbs cannot be forwarded safely; warn and drop. */
if (unlikely(skb_warn_if_lro(skb)))
goto error;
/* Make our own copy of the packet. Otherwise we will mangle the
 * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
 */
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
return;
/* rx_handlers see data pointing past the Ethernet header; pull it
 * back in so the flow extractor sees the full L2 frame, and fix up
 * the checksum for the re-exposed bytes. */
skb_push(skb, ETH_HLEN);
ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
ovs_vport_receive(vport, skb, tun_info); /* continue processing based on the vport */
return;
error:
kfree_skb(skb);
}
4、ovs_vport_receive函數
/*
 * Attach datapath metadata to the skb, extract its flow key, and pass
 * it to the datapath for table lookup/actions. Consumes the skb on
 * error. Returns 0 on success or a negative errno from key extraction.
 */
int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
const struct ip_tunnel_info *tun_info)
{
struct sw_flow_key key;
int error;
OVS_CB(skb)->input_vport = vport;
OVS_CB(skb)->mru = 0;
/* Packet crossed a network namespace boundary: scrub namespace-local
 * state but preserve skb->mark, and drop tunnel metadata, which is
 * only meaningful in the originating namespace. */
if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) {
u32 mark;
mark = skb->mark;
skb_scrub_packet(skb, true);
skb->mark = mark;
tun_info = NULL;
}
ovs_skb_init_inner_protocol(skb);
skb_clear_ovs_gso_cb(skb);
/* Extract flow from 'skb' into 'key'. */
error = ovs_flow_key_extract(tun_info, skb, &key); /* build the lookup key from the packet headers */
if (unlikely(error)) {
kfree_skb(skb);
return error;
}
ovs_dp_process_packet(skb, &key); /* main datapath processing */
return 0;
}
5、ovs_dp_process_packet函數
/*
 * Core of the datapath: look the extracted key up in the flow table.
 * On a hit, execute the flow's actions; on a miss, send the packet to
 * userspace (ovs-vswitchd) via an upcall. Per-CPU statistics are
 * updated in both cases. Always consumes the skb (directly or via the
 * actions/upcall paths).
 */
void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
{
const struct vport *p = OVS_CB(skb)->input_vport;
struct datapath *dp = p->dp;
struct sw_flow *flow;
struct sw_flow_actions *sf_acts;
struct dp_stats_percpu *stats;
u64 *stats_counter;
u32 n_mask_hit;
stats = this_cpu_ptr(dp->stats_percpu);
/* Look up flow. */
flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), /* flow-table (forwarding-table) lookup */
&n_mask_hit);
if (unlikely(!flow)) { /* no matching flow: punt to the userspace upcall handler */
struct dp_upcall_info upcall;
int error;
memset(&upcall, 0, sizeof(upcall));
upcall.cmd = OVS_PACKET_CMD_MISS;
upcall.portid = ovs_vport_find_upcall_portid(p, skb);
upcall.mru = OVS_CB(skb)->mru;
error = ovs_dp_upcall(dp, skb, key, &upcall);
if (unlikely(error))
kfree_skb(skb);
else
consume_skb(skb); /* queued to userspace successfully; not an error drop */
stats_counter = &stats->n_missed;
goto out;
}
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts); /* fetch the flow's action list (RCU-protected) */
ovs_execute_actions(dp, skb, sf_acts, key); /* apply the actions to the packet */
stats_counter = &stats->n_hit;
out:
/* Update datapath statistics. */
u64_stats_update_begin(&stats->syncp);
(*stats_counter)++;
stats->n_mask_hit += n_mask_hit;
u64_stats_update_end(&stats->syncp);
}
背景:
- 報文匹配的流程可以說是對一個switch效率影響最大的地方,現在已經有很多廠家(包括一些研究院)說可以做到比OVS效率高十倍的流表匹配。
- 當然這是有可能的,畢竟都是純軟件,非常容易創新。而OVS的內核datapath並沒有實現多級流表,單從OpenFlow標準規定的匹配流程來看,效率上不一定比得過別人。我們接下來就分析一下OVS的匹配流程。
TIPS:
- 整體的匹配邏輯很簡單,都是OpenFlow規定好的。
- 報文通過dp時先查找精確匹配表(facet),如果找不到,那麼進行upcall,上送到用戶態。
- 在用戶態會查找模糊匹配表(table),這個地方可以說是較沒有效率的地方。下面我們看一看具體的代碼。
調用流程(內核):
ovs_vport_receive->ovs_dp_process_packet(舊版本中名爲ovs_dp_process_received_packet)->ovs_flow_tbl_lookup->ovs_dp_upcall->queue_userspace_packet
調用流程(用戶態):
handle_miss_upcalls->handle_flow_miss->rule_dpif_lookup->rule_dpif_lookup__->classifier_lookup->find_match
1、在handle_miss_upcalls裏解析了報文,生成了報文的精確匹配項,如果我們把它遮蓋掉(mask)一部分,那麼它就是一個模糊匹配的flow了。實際上ovs也就是這麼做的。
2、注意到每個flow table擁有一個cls,一個cls擁有多個cls_table,cls_table中擁有mask完全一致的flow。
3、而在find_match中,ovs將精確匹配項對應cls_table的mask位置0,然後通過hash查找是否有匹配的flow。
6、ovs_execute_actions函數
/*
 * Execute a flow's action list on 'skb', with loop protection: actions
 * such as recirculation can re-enter this function, so a per-CPU
 * recursion counter caps the nesting depth at EXEC_ACTIONS_LEVEL_LIMIT.
 * Deferred actions are flushed only by the outermost invocation
 * (level == 0 before our own increment). Consumes the skb.
 */
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
int level = this_cpu_read(exec_actions_level);
int err;
if (unlikely(level >= EXEC_ACTIONS_LEVEL_LIMIT)) {
if (net_ratelimit())
pr_warn("%s: packet loop detected, dropping.\n",
ovs_dp_name(dp));
kfree_skb(skb);
return -ELOOP;
}
this_cpu_inc(exec_actions_level);
err = do_execute_actions(dp, skb, key,
acts->actions, acts->actions_len); /* run the action list */
if (!level) /* outermost call: drain actions deferred by nested executions */
process_deferred_actions(dp);
this_cpu_dec(exec_actions_level);
/* This return status currently does not reflect the errors
 * encounted during deferred actions execution. Probably needs to
 * be fixed in the future.
 */
return err;
}
7、do_execute_actions函數
/*
 * Iterate over the netlink-encoded action list and apply each action
 * to 'skb'. Output actions are handled lazily via 'prev_port' (see the
 * comment below) so the common single-output case avoids an skb clone.
 * On action failure the skb is freed and the error returned; on normal
 * completion the skb is either transmitted or consumed.
 */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key,
const struct nlattr *attr, int len)
{
/* Every output action needs a separate clone of 'skb', but the common
 * case is just a single output action, so that doing a clone and
 * then freeing the original skbuff is wasteful. So the following code
 * is slightly obscure just to avoid that.
 */
int prev_port = -1;
const struct nlattr *a;
int rem;
for (a = attr, rem = len; rem > 0;
a = nla_next(a, &rem)) {
int err = 0;
if (unlikely(prev_port != -1)) { /* a pending output: transmit on that port now */
struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); /* clone, since later actions still need the original skb */
if (out_skb)
do_output(dp, out_skb, prev_port, key); /* transmit; this article takes output as the example, other actions are analyzed later */
prev_port = -1;
}
switch (nla_type(a)) {
case OVS_ACTION_ATTR_OUTPUT:
/* Don't output yet: defer so the last output can reuse
 * the original skb without cloning. */
prev_port = nla_get_u32(a);
break;
case OVS_ACTION_ATTR_USERSPACE:
output_userspace(dp, skb, key, a, attr, len);
break;
case OVS_ACTION_ATTR_HASH:
execute_hash(skb, key, a);
break;
case OVS_ACTION_ATTR_PUSH_MPLS:
err = push_mpls(skb, key, nla_data(a));
break;
case OVS_ACTION_ATTR_POP_MPLS:
err = pop_mpls(skb, key, nla_get_be16(a));
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
err = push_vlan(skb, key, nla_data(a));
break;
case OVS_ACTION_ATTR_POP_VLAN:
err = pop_vlan(skb, key);
break;
case OVS_ACTION_ATTR_RECIRC:
err = execute_recirc(dp, skb, key, a, rem);
if (nla_is_last(a, rem)) {
/* If this is the last action, the skb has
 * been consumed or freed.
 * Return immediately.
 */
return err;
}
break;
case OVS_ACTION_ATTR_SET:
err = execute_set_action(skb, key, nla_data(a));
break;
case OVS_ACTION_ATTR_SET_MASKED:
case OVS_ACTION_ATTR_SET_TO_MASKED:
err = execute_masked_set_action(skb, key, nla_data(a));
break;
case OVS_ACTION_ATTR_SAMPLE:
err = sample(dp, skb, key, a, attr, len);
break;
case OVS_ACTION_ATTR_CT:
/* conntrack needs an up-to-date key; refresh if stale. */
if (!is_flow_key_valid(key)) {
err = ovs_flow_key_update(skb, key);
if (err)
return err;
}
err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
nla_data(a));
/* Hide stolen IP fragments from user space. */
if (err)
return err == -EINPROGRESS ? 0 : err;
break;
}
if (unlikely(err)) {
kfree_skb(skb);
return err;
}
}
/* Last (or only) deferred output gets the original skb: no clone. */
if (prev_port != -1)
do_output(dp, skb, prev_port, key);
else
consume_skb(skb);
return 0;
}
8、do_output函數
/*
 * Transmit 'skb' on datapath port 'out_port'. If the packet exceeds
 * the recorded MRU (set by conntrack reassembly), re-fragment it when
 * the output device's MTU permits; otherwise drop. Always consumes
 * the skb.
 */
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
struct sw_flow_key *key)
{
struct vport *vport = ovs_vport_rcu(dp, out_port); /* resolve the output port number to its vport */
if (likely(vport)) {
u16 mru = OVS_CB(skb)->mru;
if (likely(!mru || (skb->len <= mru + ETH_HLEN))) {
/* No MRU recorded, or the packet fits: send as-is. */
ovs_vport_send(vport, skb); /* transmit the packet */
} else if (mru <= vport->dev->mtu) {
/* Packet was reassembled from fragments and is now too
 * big: determine the L3 ethertype so it can be
 * re-fragmented for this device. */
__be16 ethertype = key->eth.type;
if (!is_flow_key_valid(key)) {
if (eth_p_mpls(skb->protocol))
ethertype = ovs_skb_get_inner_protocol(skb);
else
ethertype = vlan_get_protocol(skb);
}
ovs_fragment(vport, skb, mru, ethertype);
} else {
OVS_NLERR(true, "Cannot fragment IP frames");
kfree_skb(skb);
}
} else {
/* Output port no longer exists: drop. */
kfree_skb(skb);
}
}
9、ovs_vport_send函數
/*
 * Hand 'skb' to the vport's transmit implementation. Non-GSO packets
 * larger than the output device's MTU are counted as tx errors and
 * dropped with a rate-limited warning. Consumes the skb.
 */
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int mtu = vport->dev->mtu;

	if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), mtu);
		vport->dev->stats.tx_errors++;
		kfree_skb(skb);
		return;
	}

	/* Point skb->dev at the netdev backing this vport, then let the
	 * vport type's send op transmit it (for ovs_netdev_vport_ops this
	 * is the kernel's dev_queue_xmit(); other vport types are analyzed
	 * later). */
	skb->dev = vport->dev;
	vport->ops->send(skb);
}
至此報文從進入到OVS到報文離開OVS已經全部完成,這裏只是最簡單的流程,後續逐步豐富之。
原文鏈接:https://blog.csdn.net/one_clouder/article/details/52388422