Open vSwitch userspace packet processing flow

When the kernel datapath cannot match a packet against its flow table, it sends the packet to userspace over netlink as an upcall.

In userspace, the udpif_upcall_handler threads receive the upcall messages sent by the kernel.
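The receive side of that netlink channel is the dpif layer's upcall API. For reference, the two calls the handler loop below relies on, as declared in lib/dpif.h (comments abbreviated):

int dpif_recv(struct dpif *dpif, uint32_t handler_id,
              struct dpif_upcall *upcall, struct ofpbuf *buf);
void dpif_recv_wait(struct dpif *dpif, uint32_t handler_id);

dpif_recv polls a single upcall for the given handler; dpif_recv_wait arranges for the next poll_block() to wake up when more upcalls arrive.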


Call chain that creates the udpif_upcall_handler threads:

main (ovs-vswitchd.c) ---> bridge_run ---> bridge_reconfigure ---> ofproto_create ---> construct ---> open_dpif_backer ---> udpif_create ---> udpif_set_threads ---> udpif_start_threads ---> udpif_upcall_handler
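udpif_start_threads creates one struct handler per handler thread and starts each thread on udpif_upcall_handler via ovs_thread_create. A minimal standalone analog of that pattern, using plain pthreads (all *_demo names are hypothetical, not OVS APIs):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* One record per handler thread, as udpif keeps one struct handler
 * per thread in its handlers[] array. */
struct handler_demo {
    unsigned int handler_id;
    pthread_t thread;
    atomic_bool *exit_flag;         /* stands in for udpif->exit_latch */
};

static void *
handler_main_demo(void *arg)
{
    struct handler_demo *h = arg;

    while (!atomic_load(h->exit_flag)) {
        /* In OVS, recv_upcalls()/poll_block() would run here. */
    }
    printf("handler %u exiting\n", h->handler_id);
    return NULL;
}

int
main(void)
{
    enum { N_HANDLERS = 4 };
    atomic_bool exit_flag = false;
    struct handler_demo handlers[N_HANDLERS];

    /* Like udpif_start_threads(): spawn one thread per handler. */
    for (unsigned int i = 0; i < N_HANDLERS; i++) {
        handlers[i].handler_id = i;
        handlers[i].exit_flag = &exit_flag;
        pthread_create(&handlers[i].thread, NULL,
                       handler_main_demo, &handlers[i]);
    }

    atomic_store(&exit_flag, true); /* like latch_set(&udpif->exit_latch) */
    for (unsigned int i = 0; i < N_HANDLERS; i++) {
        pthread_join(handlers[i].thread, NULL);
    }
    return 0;
}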

Call chain for receiving and handling an upcall:

udpif_upcall_handler ---> recv_upcalls ---> process_upcall ---> upcall_xlate ---> xlate_actions ---> do_xlate_actions ---> handle_upcalls


The udpif_upcall_handler thread loops, receiving upcall requests:

static void *
udpif_upcall_handler(void *arg)
{
    struct handler *handler = arg;
    struct udpif *udpif = handler->udpif;

    while (!latch_is_set(&handler->udpif->exit_latch)) {
        if (recv_upcalls(handler)) {
            /* We processed some upcalls: wake the poll loop immediately
             * so the next iteration keeps draining the queue. */
            poll_immediate_wake();
        } else {
            /* Nothing pending: sleep until the datapath has upcalls for
             * this handler or the exit latch is set. */
            dpif_recv_wait(udpif->dpif, handler->handler_id);
            latch_wait(&udpif->exit_latch);
        }
        poll_block();
    }
    return NULL;
}


Receiving and processing upcall requests:

static size_t recv_upcalls(struct handler *handler)

recv_upcalls calls upcall_receive to read each upcall, then flow_extract to extract the flow from the packet.

static size_t
recv_upcalls(struct handler *handler)
{
    struct udpif *udpif = handler->udpif;
    uint64_t recv_stubs[UPCALL_MAX_BATCH][512 / 8];
    struct ofpbuf recv_bufs[UPCALL_MAX_BATCH];
    struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
    struct upcall upcalls[UPCALL_MAX_BATCH];
    struct flow flows[UPCALL_MAX_BATCH];
    size_t n_upcalls, i;

    n_upcalls = 0;
    while (n_upcalls < UPCALL_MAX_BATCH) {
        struct ofpbuf *recv_buf = &recv_bufs[n_upcalls];
        struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
        struct upcall *upcall = &upcalls[n_upcalls];
        struct flow *flow = &flows[n_upcalls];
        unsigned int mru;
        int error;

        ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
                        sizeof recv_stubs[n_upcalls]);
        if (dpif_recv(udpif->dpif, handler->handler_id, dupcall, recv_buf)) {
            ofpbuf_uninit(recv_buf);
            break;
        }

        if (odp_flow_key_to_flow(dupcall->key, dupcall->key_len, flow)
            == ODP_FIT_ERROR) {
            goto free_dupcall;
        }

        if (dupcall->mru) {
            mru = nl_attr_get_u16(dupcall->mru);
        } else {
            mru = 0;
        }

        error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
                               dupcall->type, dupcall->userdata, flow, mru,
                               &dupcall->ufid, PMD_ID_NULL);
        if (error) {
            if (error == ENODEV) {
                /* Received packet on datapath port for which we couldn't
                 * associate an ofproto.  This can happen if a port is removed
                 * while traffic is being received.  Print a rate-limited
                 * message in case it happens frequently. */
                dpif_flow_put(udpif->dpif, DPIF_FP_CREATE, dupcall->key,
                              dupcall->key_len, NULL, 0, NULL, 0,
                              &dupcall->ufid, PMD_ID_NULL, NULL);
                VLOG_INFO_RL(&rl, "received packet on unassociated datapath "
                             "port %"PRIu32, flow->in_port.odp_port);
            }
            goto free_dupcall;
        }

        upcall->key = dupcall->key;
        upcall->key_len = dupcall->key_len;
        upcall->ufid = &dupcall->ufid;

        upcall->out_tun_key = dupcall->out_tun_key;
        upcall->actions = dupcall->actions;

        pkt_metadata_from_flow(&dupcall->packet.md, flow);
        flow_extract(&dupcall->packet, flow);

        error = process_upcall(udpif, upcall,
                               &upcall->odp_actions, &upcall->wc);
        if (error) {
            goto cleanup;
        }

        n_upcalls++;
        continue;

cleanup:
        upcall_uninit(upcall);
free_dupcall:
        dp_packet_uninit(&dupcall->packet);
        ofpbuf_uninit(recv_buf);
    }

    if (n_upcalls) {
        handle_upcalls(handler->udpif, upcalls, n_upcalls);
        for (i = 0; i < n_upcalls; i++) {
            dp_packet_uninit(&dupcalls[i].packet);
            ofpbuf_uninit(&recv_bufs[i]);
            upcall_uninit(&upcalls[i]);
        }
    }

    return n_upcalls;
}
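Note the stack-stub receive buffers above: ofpbuf_use_stub points each ofpbuf at caller-provided stack storage, so upcalls that fit in the 512-byte stub never touch the heap. A minimal illustration, assuming only the public ofpbuf API (openvswitch/ofpbuf.h in recent OVS trees):

#include <stdint.h>
#include "openvswitch/ofpbuf.h"

/* Writes that fit in 'stub' stay on the stack; if the buffer grows
 * past it, ofpbuf transparently falls back to heap storage, and
 * ofpbuf_uninit() frees it correctly either way. */
static void
stub_buffer_demo(void)
{
    uint64_t stub[512 / 8];
    struct ofpbuf buf;

    ofpbuf_use_stub(&buf, stub, sizeof stub);
    ofpbuf_put_zeros(&buf, 64);     /* fits in the stub: no malloc */
    ofpbuf_uninit(&buf);
}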

process_upcall then handles the upcall information. The upcall types are:

enum upcall_type {
    BAD_UPCALL,                 /* Some kind of bug somewhere. */
    MISS_UPCALL,                /* A flow miss.  */
    SFLOW_UPCALL,               /* sFlow sample. */
    FLOW_SAMPLE_UPCALL,         /* Per-flow sampling. */
    IPFIX_UPCALL                /* Per-bridge sampling. */
};
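classify_upcall maps each raw dpif upcall to one of these types: DPIF_UC_MISS becomes MISS_UPCALL, while DPIF_UC_ACTION upcalls are classified by the cookie userspace attached to them. process_upcall then switches on the result. A condensed, self-contained analog of that dispatch (*_demo names are stand-ins, heavily simplified):

#include <stdio.h>

enum upcall_type_demo {
    DEMO_BAD_UPCALL,
    DEMO_MISS_UPCALL,
    DEMO_SFLOW_UPCALL,
    DEMO_FLOW_SAMPLE_UPCALL,
    DEMO_IPFIX_UPCALL,
};

static int
process_upcall_demo(enum upcall_type_demo type)
{
    switch (type) {
    case DEMO_MISS_UPCALL:
        /* In OVS: upcall_xlate() translates the flow's actions. */
        printf("flow miss: translate actions\n");
        return 0;
    case DEMO_SFLOW_UPCALL:
    case DEMO_FLOW_SAMPLE_UPCALL:
    case DEMO_IPFIX_UPCALL:
        /* In OVS: decode the sample cookie and emit a sample. */
        printf("sampling upcall\n");
        return 0;
    case DEMO_BAD_UPCALL:
    default:
        return -1;              /* process_upcall() returns EBADF here. */
    }
}

int
main(void)
{
    return process_upcall_demo(DEMO_MISS_UPCALL);
}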


Packets the kernel datapath could not match arrive as flow misses and are classified as MISS_UPCALL.

For a miss, xlate_actions performs the translation. It calls rule_dpif_lookup_from_table to look up a rule in the flow tables; once a rule is found it calls do_xlate_actions, which performs a different operation for each type of action (a condensed sketch of that dispatch follows the listing below).

enum xlate_error
xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
{
    *xout = (struct xlate_out) {
        .slow = 0,
        .recircs = RECIRC_REFS_EMPTY_INITIALIZER,
    };

    struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
    struct xbridge *xbridge = xbridge_lookup(xcfg, xin->ofproto);
    if (!xbridge) {
        return XLATE_BRIDGE_NOT_FOUND;
    }

    struct flow *flow = &xin->flow;

    uint8_t stack_stub[1024];
    uint64_t action_set_stub[1024 / 8];
    uint64_t frozen_actions_stub[1024 / 8];
    uint64_t actions_stub[256 / 8];
    struct ofpbuf scratch_actions = OFPBUF_STUB_INITIALIZER(actions_stub);
    struct xlate_ctx ctx = {
        .xin = xin,
        .xout = xout,
        .base_flow = *flow,
        .orig_tunnel_ipv6_dst = flow_tnl_dst(&flow->tunnel),
        .xbridge = xbridge,
        .stack = OFPBUF_STUB_INITIALIZER(stack_stub),
        .rule = xin->rule,
        .wc = (xin->wc
               ? xin->wc
               : &(struct flow_wildcards) { .masks = { .dl_type = 0 } }),
        .odp_actions = xin->odp_actions ? xin->odp_actions : &scratch_actions,

        .depth = xin->depth,
        .resubmits = xin->resubmits,
        .in_group = false,
        .in_action_set = false,

        .table_id = 0,
        .rule_cookie = OVS_BE64_MAX,
        .orig_skb_priority = flow->skb_priority,
        .sflow_n_outputs = 0,
        .sflow_odp_port = 0,
        .nf_output_iface = NF_OUT_DROP,
        .exit = false,
        .error = XLATE_OK,
        .mirrors = 0,

        .freezing = false,
        .recirc_update_dp_hash = false,
        .frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub),
        .pause = NULL,

        .was_mpls = false,
        .conntracked = false,

        .ct_nat_action = NULL,

        .action_set_has_group = false,
        .action_set = OFPBUF_STUB_INITIALIZER(action_set_stub),
    };

    /* 'base_flow' reflects the packet as it came in, but we need it to reflect
     * the packet as the datapath will treat it for output actions. Our
     * datapath doesn't retain tunneling information without us re-setting
     * it, so clear the tunnel data.
     */

    memset(&ctx.base_flow.tunnel, 0, sizeof ctx.base_flow.tunnel);

    ofpbuf_reserve(ctx.odp_actions, NL_A_U32_SIZE);
    xlate_wc_init(&ctx);

    COVERAGE_INC(xlate_actions);

    xin->trace = xlate_report(&ctx, OFT_BRIDGE, "bridge(\"%s\")",
                              xbridge->name);
    if (xin->frozen_state) {
        const struct frozen_state *state = xin->frozen_state;

        struct ovs_list *old_trace = xin->trace;
        xin->trace = xlate_report(&ctx, OFT_THAW, "thaw");

        if (xin->ofpacts_len > 0 || ctx.rule) {
            xlate_report_error(&ctx, "Recirculation conflict (%s)!",
                               xin->ofpacts_len ? "actions" : "rule");
            ctx.error = XLATE_RECIRCULATION_CONFLICT;
            goto exit;
        }

        /* Set the bridge for post-recirculation processing if needed. */
        if (!uuid_equals(&ctx.xbridge->ofproto->uuid, &state->ofproto_uuid)) {
            struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
            const struct xbridge *new_bridge
                = xbridge_lookup_by_uuid(xcfg, &state->ofproto_uuid);

            if (OVS_UNLIKELY(!new_bridge)) {
                /* Drop the packet if the bridge cannot be found. */
                xlate_report_error(&ctx, "Frozen bridge no longer exists.");
                ctx.error = XLATE_BRIDGE_NOT_FOUND;
                xin->trace = old_trace;
                goto exit;
            }
            ctx.xbridge = new_bridge;
            /* The bridge is now known so obtain its table version. */
            ctx.xin->tables_version
                = ofproto_dpif_get_tables_version(ctx.xbridge->ofproto);
        }

        /* Set the thawed table id.  Note: A table lookup is done only if there
         * are no frozen actions. */
        ctx.table_id = state->table_id;
        xlate_report(&ctx, OFT_THAW,
                     "Resuming from table %"PRIu8, ctx.table_id);

        if (!state->conntracked) {
            clear_conntrack(&ctx);
        }

        /* Restore pipeline metadata. May change flow's in_port and other
         * metadata to the values that existed when freezing was triggered. */
        frozen_metadata_to_flow(&state->metadata, flow);

        /* Restore stack, if any. */
        if (state->stack) {
            ofpbuf_put(&ctx.stack, state->stack, state->stack_size);
        }

        /* Restore mirror state. */
        ctx.mirrors = state->mirrors;

        /* Restore action set, if any. */
        if (state->action_set_len) {
            xlate_report_actions(&ctx, OFT_THAW, "Restoring action set",
                                 state->action_set, state->action_set_len);

            flow->actset_output = OFPP_UNSET;
            xlate_write_actions__(&ctx, state->action_set,
                                  state->action_set_len);
        }

        /* Restore frozen actions.  If there are no actions, processing will
         * start with a lookup in the table set above. */
        xin->ofpacts = state->ofpacts;
        xin->ofpacts_len = state->ofpacts_len;
        if (state->ofpacts_len) {
            xlate_report_actions(&ctx, OFT_THAW, "Restoring actions",
                                 xin->ofpacts, xin->ofpacts_len);
        }

        xin->trace = old_trace;
    } else if (OVS_UNLIKELY(flow->recirc_id)) {
        xlate_report_error(&ctx,
                           "Recirculation context not found for ID %"PRIx32,
                           flow->recirc_id);
        ctx.error = XLATE_NO_RECIRCULATION_CONTEXT;
        goto exit;
    }

    /* Tunnel metadata in udpif format must be normalized before translation. */
    if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
        const struct tun_table *tun_tab = ofproto_get_tun_tab(
            &ctx.xbridge->ofproto->up);
        int err;

        err = tun_metadata_from_geneve_udpif(tun_tab, &xin->upcall_flow->tunnel,
                                             &xin->upcall_flow->tunnel,
                                             &flow->tunnel);
        if (err) {
            xlate_report_error(&ctx, "Invalid Geneve tunnel metadata");
            ctx.error = XLATE_INVALID_TUNNEL_METADATA;
            goto exit;
        }
    } else if (!flow->tunnel.metadata.tab) {
        /* If the original flow did not come in on a tunnel, then it won't have
         * FLOW_TNL_F_UDPIF set. However, we still need to have a metadata
         * table in case we generate tunnel actions. */
        flow->tunnel.metadata.tab = ofproto_get_tun_tab(
            &ctx.xbridge->ofproto->up);
    }
    ctx.wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;

    if (!xin->ofpacts && !ctx.rule) {
        ctx.rule = rule_dpif_lookup_from_table(
            ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc,
            ctx.xin->resubmit_stats, &ctx.table_id,
            flow->in_port.ofp_port, true, true, ctx.xin->xcache);
        if (ctx.xin->resubmit_stats) {
            rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats);
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
            entry->rule = ctx.rule;
            ofproto_rule_ref(&ctx.rule->up);
        }

        xlate_report_table(&ctx, ctx.rule, ctx.table_id);
    }

    /* Get the proximate input port of the packet.  (If xin->frozen_state,
     * flow->in_port is the ultimate input port of the packet.) */
    struct xport *in_port = get_ofp_port(xbridge,
                                         ctx.base_flow.in_port.ofp_port);

    /* Tunnel stats only for not-thawed packets. */
    if (!xin->frozen_state && in_port && in_port->is_tunnel) {
        if (ctx.xin->resubmit_stats) {
            netdev_vport_inc_rx(in_port->netdev, ctx.xin->resubmit_stats);
            if (in_port->bfd) {
                bfd_account_rx(in_port->bfd, ctx.xin->resubmit_stats);
            }
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETDEV);
            entry->dev.rx = netdev_ref(in_port->netdev);
            entry->dev.bfd = bfd_ref(in_port->bfd);
        }
    }

    if (!xin->frozen_state && process_special(&ctx, in_port)) {
        /* process_special() did all the processing for this packet.
         *
         * We do not perform special processing on thawed packets, since that
         * was done before they were frozen and should not be redone. */
    } else if (in_port && in_port->xbundle
               && xbundle_mirror_out(xbridge, in_port->xbundle)) {
        xlate_report_error(&ctx, "dropping packet received on port "
                           "%s, which is reserved exclusively for mirroring",
                           in_port->xbundle->name);
    } else {
        /* Sampling is done on initial reception; don't redo after thawing. */
        unsigned int user_cookie_offset = 0;
        if (!xin->frozen_state) {
            user_cookie_offset = compose_sflow_action(&ctx);
            compose_ipfix_action(&ctx, ODPP_NONE);
        }
        size_t sample_actions_len = ctx.odp_actions->size;

        if (tnl_process_ecn(flow)
            && (!in_port || may_receive(in_port, &ctx))) {
            const struct ofpact *ofpacts;
            size_t ofpacts_len;

            if (xin->ofpacts) {
                ofpacts = xin->ofpacts;
                ofpacts_len = xin->ofpacts_len;
            } else if (ctx.rule) {
                const struct rule_actions *actions
                    = rule_get_actions(&ctx.rule->up);
                ofpacts = actions->ofpacts;
                ofpacts_len = actions->ofpacts_len;
                ctx.rule_cookie = ctx.rule->up.flow_cookie;
            } else {
                OVS_NOT_REACHED();
            }

            mirror_ingress_packet(&ctx);
            do_xlate_actions(ofpacts, ofpacts_len, &ctx);
            if (ctx.error) {
                goto exit;
            }

            /* We've let OFPP_NORMAL and the learning action look at the
             * packet, so cancel all actions and freezing if forwarding is
             * disabled. */
            if (in_port && (!xport_stp_forward_state(in_port) ||
                            !xport_rstp_forward_state(in_port))) {
                ctx.odp_actions->size = sample_actions_len;
                ctx_cancel_freeze(&ctx);
                ofpbuf_clear(&ctx.action_set);
            }

            if (!ctx.freezing) {
                xlate_action_set(&ctx);
            }
            if (ctx.freezing) {
                finish_freezing(&ctx);
            }
        }

        /* Output only fully processed packets. */
        if (!ctx.freezing
            && xbridge->has_in_band
            && in_band_must_output_to_local_port(flow)
            && !actions_output_to_local_port(&ctx)) {
            compose_output_action(&ctx, OFPP_LOCAL, NULL);
        }

        if (user_cookie_offset) {
            fix_sflow_action(&ctx, user_cookie_offset);
        }
    }

    if (nl_attr_oversized(ctx.odp_actions->size)) {
        /* These datapath actions are too big for a Netlink attribute, so we
         * can't hand them to the kernel directly.  dpif_execute() can execute
         * them one by one with help, so just mark the result as SLOW_ACTION to
         * prevent the flow from being installed. */
        COVERAGE_INC(xlate_actions_oversize);
        ctx.xout->slow |= SLOW_ACTION;
    } else if (too_many_output_actions(ctx.odp_actions)) {
        COVERAGE_INC(xlate_actions_too_many_output);
        ctx.xout->slow |= SLOW_ACTION;
    }

    /* Do netflow only for packets on initial reception, that are not sent to
     * the controller.  We consider packets sent to the controller to be part
     * of the control plane rather than the data plane. */
    if (!xin->frozen_state
        && xbridge->netflow
        && !(xout->slow & SLOW_CONTROLLER)) {
        if (ctx.xin->resubmit_stats) {
            netflow_flow_update(xbridge->netflow, flow,
                                ctx.nf_output_iface,
                                ctx.xin->resubmit_stats);
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETFLOW);
            entry->nf.netflow = netflow_ref(xbridge->netflow);
            entry->nf.flow = xmemdup(flow, sizeof *flow);
            entry->nf.iface = ctx.nf_output_iface;
        }
    }

    /* Translate tunnel metadata masks to udpif format if necessary. */
    if (xin->upcall_flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
        if (ctx.wc->masks.tunnel.metadata.present.map) {
            const struct flow_tnl *upcall_tnl = &xin->upcall_flow->tunnel;
            struct geneve_opt opts[TLV_TOT_OPT_SIZE /
                                   sizeof(struct geneve_opt)];

            tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
                                              &ctx.wc->masks.tunnel,
                                              upcall_tnl->metadata.opts.gnv,
                                              upcall_tnl->metadata.present.len,
                                              opts);
            memset(&ctx.wc->masks.tunnel.metadata, 0,
                   sizeof ctx.wc->masks.tunnel.metadata);
            memcpy(&ctx.wc->masks.tunnel.metadata.opts.gnv, opts,
                   upcall_tnl->metadata.present.len);
        }
        ctx.wc->masks.tunnel.metadata.present.len = 0xff;
        ctx.wc->masks.tunnel.metadata.tab = NULL;
        ctx.wc->masks.tunnel.flags |= FLOW_TNL_F_UDPIF;
    } else if (!xin->upcall_flow->tunnel.metadata.tab) {
        /* If we didn't have options in UDPIF format and didn't have an existing
         * metadata table, then it means that there were no options at all when
         * we started processing and any wildcards we picked up were from
         * action generation. Without options on the incoming packet, wildcards
         * aren't meaningful. To avoid them possibly getting misinterpreted,
         * just clear everything. */
        if (ctx.wc->masks.tunnel.metadata.present.map) {
            memset(&ctx.wc->masks.tunnel.metadata, 0,
                   sizeof ctx.wc->masks.tunnel.metadata);
        } else {
            ctx.wc->masks.tunnel.metadata.tab = NULL;
        }
    }

    xlate_wc_finish(&ctx);

exit:
    /* Reset the table to what it was when we came in. If we only fetched
     * it locally, then it has no meaning outside of flow translation. */
    flow->tunnel.metadata.tab = xin->upcall_flow->tunnel.metadata.tab;

    ofpbuf_uninit(&ctx.stack);
    ofpbuf_uninit(&ctx.action_set);
    ofpbuf_uninit(&ctx.frozen_actions);
    ofpbuf_uninit(&scratch_actions);

    /* Make sure we return a "drop flow" in case of an error. */
    if (ctx.error) {
        xout->slow = 0;
        if (xin->odp_actions) {
            ofpbuf_clear(xin->odp_actions);
        }
    }
    return ctx.error;
}
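do_xlate_actions itself (not reproduced here) is essentially an OFPACT_FOR_EACH loop over the rule's OpenFlow actions with a large switch on each action's type, translating every action into datapath actions. A heavily condensed standalone analog of that shape (the *_demo enum stands in for the real OFPACT_* types handled in ofproto/ofproto-dpif-xlate.c):

#include <stddef.h>
#include <stdio.h>

enum ofpact_type_demo {
    DEMO_OUTPUT,            /* like OFPACT_OUTPUT */
    DEMO_SET_VLAN_VID,      /* like OFPACT_SET_VLAN_VID */
    DEMO_RESUBMIT,          /* like OFPACT_RESUBMIT */
};

struct ofpact_demo {
    enum ofpact_type_demo type;
    int arg;
};

/* Walk the action list and "translate" each action, as
 * do_xlate_actions() does for the full OFPACT_* set. */
static void
do_xlate_actions_demo(const struct ofpact_demo *acts, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        switch (acts[i].type) {
        case DEMO_OUTPUT:
            printf("compose output to port %d\n", acts[i].arg);
            break;
        case DEMO_SET_VLAN_VID:
            printf("rewrite VLAN id to %d\n", acts[i].arg);
            break;
        case DEMO_RESUBMIT:
            printf("resubmit lookup in table %d\n", acts[i].arg);
            break;
        }
    }
}

int
main(void)
{
    const struct ofpact_demo acts[] = {
        { DEMO_SET_VLAN_VID, 100 },
        { DEMO_OUTPUT, 2 },
    };

    do_xlate_actions_demo(acts, sizeof acts / sizeof acts[0]);
    return 0;
}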


handle_upcalls pushes the flow rules down into the kernel datapath. With the kernel (netlink) datapath, the dpif_operate() call below ends up in dpif_netlink_operate(), which modifies the datapath flow table. A condensed analog of that dispatch follows the listing.

static void
handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
               size_t n_upcalls)
{
    struct dpif_op *opsp[UPCALL_MAX_BATCH * 2];
    struct ukey_op ops[UPCALL_MAX_BATCH * 2];
    size_t n_ops, n_opsp, i;

    /* Handle the packets individually in order of arrival.
     *
     *   - For SLOW_CFM, SLOW_LACP, SLOW_STP, and SLOW_BFD, translation is what
     *     processes received packets for these protocols.
     *
     *   - For SLOW_CONTROLLER, translation sends the packet to the OpenFlow
     *     controller.
     *
     * The loop fills 'ops' with an array of operations to execute in the
     * datapath. */
    n_ops = 0;
    for (i = 0; i < n_upcalls; i++) {
        struct upcall *upcall = &upcalls[i];
        const struct dp_packet *packet = upcall->packet;
        struct ukey_op *op;

        if (should_install_flow(udpif, upcall)) {
            struct udpif_key *ukey = upcall->ukey;

            if (ukey_install(udpif, ukey)) {
                upcall->ukey_persists = true;
                put_op_init(&ops[n_ops++], ukey, DPIF_FP_CREATE);
            }
        }

        if (upcall->odp_actions.size) {
            op = &ops[n_ops++];
            op->ukey = NULL;
            op->dop.type = DPIF_OP_EXECUTE;
            op->dop.u.execute.packet = CONST_CAST(struct dp_packet *, packet);
            op->dop.u.execute.flow = upcall->flow;
            odp_key_to_pkt_metadata(upcall->key, upcall->key_len,
                                    &op->dop.u.execute.packet->md);
            op->dop.u.execute.actions = upcall->odp_actions.data;
            op->dop.u.execute.actions_len = upcall->odp_actions.size;
            op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
            op->dop.u.execute.probe = false;
            op->dop.u.execute.mtu = upcall->mru;
        }
    }

    /* Execute batch. */
    n_opsp = 0;
    for (i = 0; i < n_ops; i++) {
        opsp[n_opsp++] = &ops[i].dop;
    }
    dpif_operate(udpif->dpif, opsp, n_opsp);
    for (i = 0; i < n_ops; i++) {
        struct udpif_key *ukey = ops[i].ukey;

        if (ukey) {
            ovs_mutex_lock(&ukey->mutex);
            if (ops[i].dop.error) {
                transition_ukey(ukey, UKEY_EVICTED);
            } else {
                transition_ukey(ukey, UKEY_OPERATIONAL);
            }
            ovs_mutex_unlock(&ukey->mutex);
        }
    }
}
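Each ukey_op wraps a dpif_op: flow installs are DPIF_OP_FLOW_PUT operations (prepared by put_op_init with DPIF_FP_CREATE) and packet executions are DPIF_OP_EXECUTE. dpif_operate hands the whole batch to the datapath provider; for the kernel datapath, dpif_netlink_operate() turns each op into netlink transactions. A minimal standalone analog of the two op types (*_demo types are stand-ins; the real interface is struct dpif_op in lib/dpif.h):

#include <stddef.h>
#include <stdio.h>

enum dpif_op_type_demo { DEMO_OP_FLOW_PUT, DEMO_OP_EXECUTE };

struct dpif_op_demo {
    enum dpif_op_type_demo type;
    int error;              /* filled in per op, as dpif_operate() does */
};

static void
dpif_operate_demo(struct dpif_op_demo **opsp, size_t n_ops)
{
    for (size_t i = 0; i < n_ops; i++) {
        switch (opsp[i]->type) {
        case DEMO_OP_FLOW_PUT:
            /* Kernel datapath: an OVS_FLOW_CMD_NEW netlink transaction. */
            printf("install flow\n");
            break;
        case DEMO_OP_EXECUTE:
            /* Kernel datapath: an OVS_PACKET_CMD_EXECUTE transaction. */
            printf("execute packet actions\n");
            break;
        }
        opsp[i]->error = 0;
    }
}

int
main(void)
{
    struct dpif_op_demo put = { DEMO_OP_FLOW_PUT, 0 };
    struct dpif_op_demo exec = { DEMO_OP_EXECUTE, 0 };
    struct dpif_op_demo *opsp[] = { &put, &exec };

    dpif_operate_demo(opsp, 2);
    return 0;
}

Once the flow is installed, subsequent packets of the same flow are matched and forwarded entirely inside the kernel datapath, with no further upcalls.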
