無論是內核態datapath還是基於dpdk的用戶態datapath,當flow table查不到之後都會進入upcall的處理(我喜歡管這條路徑叫做慢速路徑,那麼datapath裏就是快速路徑啦~~)
upcall的處理函數udpif_upcall_handler會在udpif_start_threads裏面初始化,同時創建的還有udpif_revalidator的線程
/* Spawns the upcall handler threads and the flow revalidator threads of
 * 'udpif'.  Nothing happens unless 'udpif' is nonnull and both 'n_handlers'
 * and 'n_revalidators' are nonzero.
 *
 * Must be called from within an ovsrcu quiescent state. */
static void
udpif_start_threads(struct udpif *udpif, size_t n_handlers,
                    size_t n_revalidators)
{
    bool ufid_enabled;
    size_t idx;

    if (!udpif || !n_handlers || !n_revalidators) {
        return;
    }

    udpif->n_handlers = n_handlers;
    udpif->n_revalidators = n_revalidators;

    /* One zero-filled 'struct handler' slot, and one thread, per handler. */
    udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
    for (idx = 0; idx < udpif->n_handlers; idx++) {
        struct handler *h = &udpif->handlers[idx];

        h->udpif = udpif;
        h->handler_id = idx;
        h->thread = ovs_thread_create("handler", udpif_upcall_handler, h);
    }

    ufid_enabled = ofproto_dpif_get_enable_ufid(udpif->backer);
    atomic_init(&udpif->enable_ufid, ufid_enabled);
    dpif_enable_upcall(udpif->dpif);

    /* 'reval_barrier' is sized for the revalidators only; 'pause_barrier'
     * has room for one additional participant. */
    ovs_barrier_init(&udpif->reval_barrier, udpif->n_revalidators);
    ovs_barrier_init(&udpif->pause_barrier, udpif->n_revalidators + 1);
    udpif->reval_exit = false;
    udpif->pause = false;

    /* One zero-filled 'struct revalidator' slot, and one thread, per
     * revalidator. */
    udpif->revalidators = xzalloc(udpif->n_revalidators
                                  * sizeof *udpif->revalidators);
    for (idx = 0; idx < udpif->n_revalidators; idx++) {
        struct revalidator *r = &udpif->revalidators[idx];

        r->udpif = udpif;
        r->thread = ovs_thread_create("revalidator", udpif_revalidator, r);
    }
}
udpif_upcall_handler通過fd poll的方式等待觸發,如果有upcall上送,則進入recv_upcalls的處理函數中
先看下幾個相關的數據結構,struct udpif是ofproto-dpif中與upcall處理相關的結構體,分爲upcall處理和flow回收(revalidation)兩部分
/* An upcall handler for ofproto_dpif.
 *
 * udpif keeps records of two kinds of logically separate units:
 *
 * upcall handling
 * ---------------
 *
 *    - An array of 'struct handler's for upcall handling and flow
 *      installation.
 *
 * flow revalidation
 * -----------------
 *
 *    - Revalidation threads which read the datapath flow table and maintain
 *      it.
 *
 * NOTE(review): udpif_start_threads() also accesses 'enable_ufid',
 * 'reval_barrier', 'pause_barrier', 'reval_exit' and 'pause' through a
 * 'struct udpif *'; those members are not declared in this excerpt, so the
 * definition shown here is presumably abridged.
 */
struct udpif {
    struct ovs_list list_node; /* In all_udpifs list. */
    struct dpif *dpif; /* Datapath handle. */
    struct dpif_backer *backer; /* Opaque dpif_backer pointer. */
    struct handler *handlers; /* Upcall handlers. */
    size_t n_handlers; /* Number of elements in 'handlers'. */
    struct revalidator *revalidators; /* Flow revalidators. */
    size_t n_revalidators; /* Number of elements in 'revalidators'. */
    struct latch exit_latch; /* Tells child threads to exit. */
    /* There are 'N_UMAPS' maps containing 'struct udpif_key' elements.
     *
     * During the flow dump phase, revalidators insert into these with a random
     * distribution. During the garbage collection phase, each revalidator
     * takes care of garbage collecting a slice of these maps. */
    struct umap *ukeys;
};
struct umap是基於cuckoo hash實現的大規模hash表,用於通過struct udpif_key查找datapath flow,struct udpif創建時一共會創建N_UMAPS個這樣的哈希表
struct dp_packet是實際報文的封裝,如果是在dpdk的dp下,會在mbuf後面的線性內存存放這些元數據
/* Buffer for holding packet data. A dp_packet is automatically reallocated
 * as necessary if it grows too large for the available memory.
 *
 * The leading members depend on the build: when DPDK_NETDEV is defined the
 * structure starts with an embedded 'struct rte_mbuf' and the explicit
 * buffer bookkeeping members below are not declared; otherwise those members
 * are declared directly.
 */
struct dp_packet {
#ifdef DPDK_NETDEV
    struct rte_mbuf mbuf; /* DPDK mbuf */
#else
    void *base_; /* First byte of allocated space. */
    uint16_t allocated_; /* Number of bytes allocated. */
    uint16_t data_ofs; /* First byte actually in use. */
    uint32_t size_; /* Number of bytes in use. */
    uint32_t rss_hash; /* Packet hash. */
    bool rss_hash_valid; /* Is the 'rss_hash' valid? */
#endif
    enum dp_packet_source source; /* Source of memory allocated as 'base'. */
    uint8_t l2_pad_size; /* Detected l2 padding size.
                          * Padding is non-pullable. */
    uint16_t l2_5_ofs; /* MPLS label stack offset, or UINT16_MAX */
    uint16_t l3_ofs; /* Network-level header offset,
                      * or UINT16_MAX. */
    uint16_t l4_ofs; /* Transport-level header offset,
                      * or UINT16_MAX. */
    uint32_t cutlen; /* length in bytes to cut from the end. */
    /* Anonymous union: the packet metadata 'md' and the raw 64-bit word
     * array 'data' (DP_PACKET_CONTEXT_SIZE / 8 elements) share the same
     * storage. */
    union {
        struct pkt_metadata md;
        uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
    };
};
/* Datapath packet metadata.
 *
 * Per-packet state that accompanies the packet bytes; 'struct dp_packet'
 * embeds one of these as its 'md' member. */
struct pkt_metadata {
    uint32_t recirc_id; /* Recirculation id carried with the
                         * recirculating packets. 0 for packets
                         * received from the wire. */
    uint32_t dp_hash; /* hash value computed by the recirculation
                       * action. */
    uint32_t skb_priority; /* Packet priority for QoS. */
    uint32_t pkt_mark; /* Packet mark. */
    uint16_t ct_state; /* Connection state. */
    uint16_t ct_zone; /* Connection zone. */
    uint32_t ct_mark; /* Connection mark. */
    ovs_u128 ct_label; /* Connection label. */
    union flow_in_port in_port; /* Input port. */
    struct flow_tnl tunnel; /* Encapsulating tunnel parameters. Note that
                             * if 'ip_dst' == 0, the rest of the fields may
                             * be uninitialized. */
};
/* Tunnel information used in flow key and metadata.
 *
 * The ovs_be* members are presumably stored in network byte order, as the
 * type names suggest -- TODO confirm against the type definitions. */
struct flow_tnl {
    ovs_be32 ip_dst; /* When used as 'struct pkt_metadata.tunnel', a zero
                      * value here means the remaining members may be
                      * uninitialized. */
    struct in6_addr ipv6_dst;
    ovs_be32 ip_src;
    struct in6_addr ipv6_src;
    ovs_be64 tun_id;
    uint16_t flags;
    uint8_t ip_tos;
    uint8_t ip_ttl;
    ovs_be16 tp_src;
    ovs_be16 tp_dst;
    ovs_be16 gbp_id;
    uint8_t gbp_flags;
    uint8_t pad1[5]; /* Pad to 64 bits.  NOTE(review): presumably brings the
                      * members above to a multiple of 8 bytes before
                      * 'metadata' -- confirm member sizes. */
    struct tun_metadata metadata;
};
struct dpif_upcall代表了一個報文的upcall,除了報文內容還有upcall帶上來的netlink屬性數據
/* A packet passed up from the datapath to userspace.
 *
 * The 'packet', 'key' and 'userdata' may point into data in a buffer
 * provided by the caller, so the buffer should be released only after the
 * upcall processing has been finished.
 *
 * While being processed, the 'packet' may be reallocated, so the packet must
 * be separately released with ofpbuf_uninit().
 *
 * NOTE(review): 'packet' is declared as a 'struct dp_packet', not an ofpbuf,
 * so the ofpbuf_uninit() reference above looks stale -- confirm which
 * release function applies.
 */
struct dpif_upcall {
    /* All types. */
    enum dpif_upcall_type type;
    struct dp_packet packet; /* Packet data. */
    struct nlattr *key; /* Flow key. */
    size_t key_len; /* Length of 'key' in bytes. */
    ovs_u128 ufid; /* Unique flow identifier for 'key'. */
    struct nlattr *mru; /* Maximum receive unit. */
    struct nlattr *cutlen; /* Number of bytes shrink from the end. */
    /* DPIF_UC_ACTION only. */
    struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
    struct nlattr *out_tun_key; /* Output tunnel key. */
    struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE.
                             * NOTE(review): same text as the comment on
                             * 'userdata'; looks copy-pasted -- confirm what
                             * this attribute actually carries. */
};
recv_upcalls一次最多會處理UPCALL_MAX_BATCH個請求,我們以單個請求的處理爲例子,首先調用的是dpif_recv,實際調用了dpif_class->recv註冊的函數。接收的數據會放到struct dpif_upcall和struct ofpbuf裏面
/* Polls 'dpif' for one upcall on behalf of an upcall handler.  Because
 * several poll loops can exist, 'handler_id' selects the poll loop to
 * query.  On success the upcall is stored into '*upcall', with 'buf'
 * providing the backing storage.  Should only be called if 'recv_set' has
 * been used to enable receiving packets from 'dpif'.
 *
 * 'upcall->key' and 'upcall->userdata' point into data in the caller-provided
 * 'buf', so their memory cannot be freed separately from 'buf'.
 *
 * The caller owns the data of 'upcall->packet' and may modify it.  If the
 * packet's headroom is exhausted as it is manipulated, 'upcall->packet'
 * will be reallocated.  This requires the data of 'upcall->packet' to be
 * released with ofpbuf_uninit() before 'upcall' is destroyed.  However,
 * when an error is returned, 'upcall->packet' may be uninitialized and
 * should not be released.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Returns EAGAIN
 * if no upcall is immediately available, which is also the result when the
 * dpif class supplies no 'recv' callback.  Errors other than EAGAIN are
 * logged. */
int
dpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,
          struct ofpbuf *buf)
{
    int error;

    /* A provider without a receive hook never has anything to deliver. */
    if (!dpif->dpif_class->recv) {
        return EAGAIN;
    }

    error = dpif->dpif_class->recv(dpif, handler_id, upcall, buf);
    if (error == 0) {
        dpif_print_packet(dpif, upcall);
    } else if (error != EAGAIN) {
        log_operation(dpif, "recv", error);
    }

    return error;
}
第二步是調用upcall_receive,該函數用於構造一個struct upcall結構體
/* Prepares '*upcall' for processing of one packet received from the
 * datapath.
 *
 * First resolves 'flow' against 'backer' with xlate_lookup(), which fills in
 * 'upcall->ofproto', 'upcall->ipfix', 'upcall->sflow' and 'upcall->in_port'.
 * If that lookup fails, its error is returned and no other member of
 * '*upcall' is written.
 *
 * Otherwise records the caller's 'packet', 'type', 'userdata', 'flow', 'mru',
 * 'ufid' and 'pmd_id' (the pointers are stored as-is, not copied), sets up
 * the two action buffers, resets the remaining bookkeeping members, and
 * returns 0. */
static int
upcall_receive(struct upcall *upcall, const struct dpif_backer *backer,
               const struct dp_packet *packet, enum dpif_upcall_type type,
               const struct nlattr *userdata, const struct flow *flow,
               const unsigned int mru,
               const ovs_u128 *ufid, const unsigned pmd_id)
{
    int err = xlate_lookup(backer, flow, &upcall->ofproto, &upcall->ipfix,
                           &upcall->sflow, NULL, &upcall->in_port);
    if (err) {
        return err;
    }

    /* Inputs handed in by the caller. */
    upcall->type = type;
    upcall->flow = flow;
    upcall->packet = packet;
    upcall->userdata = userdata;
    upcall->ufid = ufid;
    upcall->pmd_id = pmd_id;
    upcall->mru = mru;

    /* Action buffers: 'odp_actions' starts out on the embedded stub,
     * 'put_actions' starts out with an initial size of 0. */
    ofpbuf_use_stub(&upcall->odp_actions, upcall->odp_actions_stub,
                    sizeof upcall->odp_actions_stub);
    ofpbuf_init(&upcall->put_actions, 0);

    /* Everything else starts out cleared. */
    upcall->recirc = NULL;
    upcall->have_recirc_ref = false;
    upcall->xout_initialized = false;
    upcall->ukey_persists = false;
    upcall->ukey = NULL;
    upcall->key = NULL;
    upcall->key_len = 0;
    upcall->out_tun_key = NULL;
    upcall->actions = NULL;

    return 0;
}
/* Given a datapath and flow metadata ('backer', and 'flow' respectively),
 * optionally populates 'ofproto' with the ofproto_dpif, 'ofp_in_port' with the
 * openflow in_port, and 'ipfix', 'sflow', and 'netflow' with the appropriate
 * handles for those protocols if they're enabled. Caller may use the returned
 * pointers until quiescing, for longer term use additional references must
 * be taken.
 *
 * The output parameters are named 'ofprotop', 'ipfix', 'sflow', 'netflow' and
 * 'ofp_in_port' in the signature below.  upcall_receive() passes NULL for
 * 'netflow'; presumably a null output pointer means "do not report this
 * value" -- confirm against the function body, which is not shown here.
 *
 * Returns 0 if successful, ENODEV if the parsed flow has no associated ofproto.
 */
int
xlate_lookup(const struct dpif_backer *backer, const struct flow *flow,
             struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix,
             struct dpif_sflow **sflow, struct netflow **netflow,
             ofp_port_t *ofp_in_port)
最後調用process_upcall來處理這個struct upcall,根據upcall類型不同處理方式也不同,我們這裏只看MISS_UPCALL的處理,會調用到upcall_xlate
upcall_xlate首先初始化xlate_in, struct xlate_in結構體如下
/* Inputs to a flow translation performed by xlate_actions().
 *
 * Several members are documented below as "normally null" or "normally set
 * to zero": the client sets them by hand after calling xlate_in_init(). */
struct xlate_in {
    struct ofproto_dpif *ofproto;
    /* Flow to which the OpenFlow actions apply. xlate_actions() will modify
     * this flow when actions change header fields. */
    struct flow flow;
    /* The packet corresponding to 'flow', or a null pointer if we are
     * revalidating without a packet to refer to. */
    const struct dp_packet *packet;
    /* Should OFPP_NORMAL update the MAC learning table? Should "learn"
     * actions update the flow table?
     *
     * We want to update these tables if we are actually processing a packet,
     * or if we are accounting for packets that the datapath has processed, but
     * not if we are just revalidating. */
    bool may_learn;
    /* The rule initiating translation or NULL. If both 'rule' and 'ofpacts'
     * are NULL, xlate_actions() will do the initial rule lookup itself. */
    struct rule_dpif *rule;
    /* The actions to translate. If 'rule' is not NULL, these may be NULL. */
    const struct ofpact *ofpacts;
    size_t ofpacts_len;
    /* Union of the set of TCP flags seen so far in this flow. (Used only by
     * NXAST_FIN_TIMEOUT. Set to zero to avoid updating rules'
     * timeouts.) */
    uint16_t tcp_flags;
    /* If nonnull, flow translation calls this function just before executing a
     * resubmit or OFPP_TABLE action. In addition, disables logging of traces
     * when the recursion depth is exceeded.
     *
     * 'rule' is the rule being submitted into. It will be null if the
     * resubmit or OFPP_TABLE action didn't find a matching rule.
     *
     * 'indentation' is the resubmit recursion depth at time of invocation,
     * suitable for indenting the output.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    void (*resubmit_hook)(struct xlate_in *, struct rule_dpif *rule,
                          int indentation);
    /* If nonnull, flow translation calls this function to report some
     * significant decision, e.g. to explain why OFPP_NORMAL translation
     * dropped a packet. 'indentation' is the resubmit recursion depth at time
     * of invocation, suitable for indenting the output. */
    void (*report_hook)(struct xlate_in *, int indentation,
                        const char *format, va_list args);
    /* If nonnull, flow translation credits the specified statistics to each
     * rule reached through a resubmit or OFPP_TABLE action.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    const struct dpif_flow_stats *resubmit_stats;
    /* Counters carried over from a pre-existing translation of a related flow.
     * This can occur due to, e.g., the translation of an ARP packet that was
     * generated as the result of outputting to a tunnel port. In that case,
     * the original flow going to the tunnel is the related flow. Since the
     * two flows are different, they should not use the same xlate_ctx
     * structure. However, we still need to limit the maximum recursion across
     * the entire translation.
     *
     * These fields are normally set to zero, so the client has to set them
     * manually after calling xlate_in_init(). In that case, they should be
     * copied from the same-named fields in the related flow's xlate_ctx.
     *
     * These fields are really implementation details; the client doesn't care
     * about what they mean. See the corresponding fields in xlate_ctx for
     * real documentation. */
    int indentation;
    int depth;
    int resubmits;
    /* If nonnull, flow translation populates this cache with references to all
     * modules that are affected by translation. This 'xlate_cache' may be
     * passed to xlate_push_stats() to perform the same function as
     * xlate_actions() without the full cost of translation.
     *
     * This is normally null so the client has to set it manually after
     * calling xlate_in_init(). */
    struct xlate_cache *xcache;
    /* If nonnull, flow translation puts the resulting datapath actions in this
     * buffer. If null, flow translation will not produce datapath actions. */
    struct ofpbuf *odp_actions;
    /* If nonnull, flow translation populates this with wildcards relevant in
     * translation. Any fields that were used to calculate the action are set,
     * to allow caching and kernel wildcarding to work. For example, if the
     * flow lookup involved performing the "normal" action on IPv4 and ARP
     * packets, 'wc' would have the 'in_port' (always set), 'dl_type' (flow
     * match), 'vlan_tci' (normal action), and 'dl_dst' (normal action) fields
     * set. */
    struct flow_wildcards *wc;
    /* The frozen state to be resumed, as returned by xlate_lookup().
     *
     * NOTE(review): the xlate_lookup() prototype shown in this file has no
     * frozen-state output parameter, so this reference may be out of date --
     * confirm where the value actually comes from. */
    const struct frozen_state *frozen_state;
};
之後調用xlate_actions,生成datapath需要的struct xlate_out,xlate_actions函數比較複雜,其中最重要的調用是通過rule_dpif_lookup_from_table查找到匹配的流表規則,進而生成actions
rule_dpif_lookup_from_table又會通過流表的級聯一個個順序查找,每單個流表都會調用rule_dpif_lookup_in_table
/* Searches version 'version' of 'ofproto''s classifier for 'flow', beginning
 * with table '*table_id', and returns the rule that was found.  The result
 * may be one of the special rules used for packet miss handling.  When
 * 'may_packet_in' is false, the miss_rule (which issues packet-ins to the
 * controller) is never returned.  If 'wc' is nonnull it is updated to
 * reflect the fields that were used during the lookup.
 *
 * With 'honor_table_miss' true, the first lookup is done in '*table_id'; if
 * nothing matches there, that table's miss configuration is honored, which
 * can cause further lookups in later OpenFlow tables.  '*table_id' is
 * updated to the last OpenFlow table that was searched.
 *
 * With 'honor_table_miss' false, exactly one table, '*table_id', is
 * searched.
 *
 * If 'stats' is nonnull, 'stats->n_packets' is added to the matched or
 * missed counter of every table that is searched.
 *
 * The returned rule is valid at least until the next RCU quiescent period.
 * A caller that needs it for longer must take a reference.
 *
 * The lookup is performed as if the packet had arrived on 'in_port', which
 * is what resubmit support requires.
 *
 * 'flow' is non-const only so that it can be modified temporarily; every
 * change made to it is undone before returning. */
struct rule_dpif *
rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
                            ovs_version_t version, struct flow *flow,
                            struct flow_wildcards *wc,
                            const struct dpif_flow_stats *stats,
                            uint8_t *table_id, ofp_port_t in_port,
                            bool may_packet_in, bool honor_table_miss)
{
    ovs_be16 saved_tp_src = flow->tp_src;
    ovs_be16 saved_tp_dst = flow->tp_dst;
    ofp_port_t saved_in_port = flow->in_port.ofp_port;
    enum ofputil_table_miss miss_config = OFPUTIL_TABLE_MISS_CONTINUE;
    struct rule_dpif *rule;
    uint8_t next_id;

    /* nw_frag is always unwildcarded (for IP), so there is no need to
     * unwildcard it here. */
    if ((flow->nw_frag & FLOW_NW_FRAG_ANY)
        && ofproto->up.frag_handling != OFPUTIL_FRAG_NX_MATCH) {
        if (ofproto->up.frag_handling != OFPUTIL_FRAG_NORMAL) {
            /* Only OFPUTIL_FRAG_DROP remains (there is no
             * OFPUTIL_FRAG_REASM).  Hand back the drop_frags_rule, which
             * cannot disappear.  'flow' has not been touched yet, so there
             * is nothing to restore. */
            rule = ofproto->drop_frags_rule;
            if (stats) {
                struct oftable *tbl = &ofproto->up.tables[*table_id];
                unsigned long orig;

                atomic_add_relaxed(&tbl->n_matched, stats->n_packets, &orig);
            }
            return rule;
        }

        /* OFPUTIL_FRAG_NORMAL: behave as though the transport ports were
         * not available. */
        flow->tp_src = htons(0);
        flow->tp_dst = htons(0);
    }

    /* Do the lookup with 'in_port' as the input port.  The original input
     * port is put back before returning (otherwise OFPP_NORMAL and
     * OFPP_IN_PORT would behave surprisingly). */
    flow->in_port.ofp_port = in_port;

    /* The loop below relies on n_tables == N_TABLES and on TBL_INTERNAL
     * being the last table. */
    BUILD_ASSERT_DECL(N_TABLES == TBL_INTERNAL + 1);

    /* When advancing, the increment steps over TBL_INTERNAL. */
    for (next_id = *table_id;
         next_id < ofproto->up.n_tables;
         next_id++, next_id += (next_id == TBL_INTERNAL)) {
        *table_id = next_id;
        rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc);

        if (stats) {
            struct oftable *tbl = &ofproto->up.tables[next_id];
            unsigned long orig;

            if (rule) {
                atomic_add_relaxed(&tbl->n_matched, stats->n_packets, &orig);
            } else {
                atomic_add_relaxed(&tbl->n_missed, stats->n_packets, &orig);
            }
        }

        if (rule) {
            goto out;   /* Match. */
        }

        if (!honor_table_miss) {
            break;
        }
        miss_config = ofproto_table_get_miss_config(&ofproto->up, *table_id);
        if (miss_config != OFPUTIL_TABLE_MISS_CONTINUE) {
            break;
        }
        /* Miss configuration says "continue": try the next table. */
    }

    /* Miss. */
    rule = ofproto->no_packet_in_rule;
    if (may_packet_in) {
        if (miss_config == OFPUTIL_TABLE_MISS_CONTINUE
            || miss_config == OFPUTIL_TABLE_MISS_CONTROLLER) {
            struct ofport_dpif *port = ofp_port_to_ofport(ofproto,
                                                          saved_in_port);

            if (!port) {
                VLOG_WARN_RL(&rl, "packet-in on unknown OpenFlow port %"PRIu16,
                             saved_in_port);
            } else if (!(port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN)) {
                rule = ofproto->miss_rule;
            }
        } else if (miss_config == OFPUTIL_TABLE_MISS_DEFAULT
                   && connmgr_wants_packet_in_on_miss(ofproto->up.connmgr)) {
            rule = ofproto->miss_rule;
        }
    }

out:
    /* Undo the temporary edits to 'flow': transport ports first, then the
     * input port. */
    flow->tp_src = saved_tp_src;
    flow->tp_dst = saved_tp_dst;
    flow->in_port.ofp_port = saved_in_port;

    return rule;
}
而對於rule_dpif_lookup_in_table而言,實際調用了classifier_lookup來在流表中查找rule,struct classifier的細節後面再分析
/* Returns the highest-priority rule in 'cls' that matches 'flow' and is
 * visible in 'version', or a null pointer when no rule in 'cls' matches.
 * When several matching rules share the highest priority, an arbitrary one
 * of them is returned.
 *
 * If a rule is found and 'wc' is non-null, the bits that were significant in
 * the lookup are bitwise-OR'ed into 'wc'.  The caller should have
 * initialized 'wc' at some earlier point (e.g., with
 * flow_wildcards_init_catchall()).
 *
 * 'flow' is non-const only to permit temporary modifications during the
 * lookup; any such changes are undone before returning.
 *
 * This is a thin wrapper that forwards to classifier_lookup__() with its
 * final boolean argument set to true. */
const struct cls_rule *
classifier_lookup(const struct classifier *cls, ovs_version_t version,
                  struct flow *flow, struct flow_wildcards *wc)
{
    const struct cls_rule *match;

    match = classifier_lookup__(cls, version, flow, wc, true);
    return match;
}
xlate_actions最終調用do_xlate_actions,針對每種OpenFlow action(ofpact)執行不同的操作,並生成對應的datapath action。
好了,下面我們回到recv_upcalls了,最後會調用handle_upcalls,用於向datapath下發flow,handle_upcalls最終調用的是dpif_operate來下發flow,後者調用的是dpif_class->operate,該接口針對不同的dpif實現,可以是dpif_netdev_operate或者dpif_netlink_operate