OVS源碼--vxlan發包流程分析(十)

發包處理流程最終會調用ovs_vport_send函數,該函數再通過vport->ops->send回調,調用對應端口類型(vport_ops)註冊的send函數。

1、ovs_vport_send函數
/*
 * ovs_vport_send - pass a packet to the transmit hook of a vport.
 *
 * @vport: port the packet leaves through; its ->dev and ->ops are used.
 * @skb:   packet to send.
 *
 * A packet that is longer than the device MTU and is not GSO is reported
 * through a rate-limited warning, counted in the device's tx_errors and
 * freed.  Any other packet gets skb->dev set to the vport's device and is
 * handed to vport->ops->send().
 */
void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
{
	int dev_mtu = vport->dev->mtu;
	int oversized = packet_length(skb) > dev_mtu && !skb_is_gso(skb);

	if (unlikely(oversized)) {
		net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
				     vport->dev->name,
				     packet_length(skb), dev_mtu);
		vport->dev->stats.tx_errors++;
		kfree_skb(skb);
		return;
	}

	/* Device bound to the vport; article note: vxlan_4789 for a vxlan port. */
	skb->dev = vport->dev;
	/* Article note: for vxlan this is vxlan_xmit from ovs_vxlan_netdev_vport_ops. */
	vport->ops->send(skb);
}
2、vxlan_xmit函數
#define vxlan_xmit rpl_vxlan_xmit
/*
 * rpl_vxlan_xmit - transmit entry point of the OVS vxlan implementation.
 * (Article note: on 3.18 kernels OVS still uses its own vxlan code.)
 *
 * @skb: packet to send; skb->dev must already be the vxlan net_device
 *       (ovs_vport_send sets it before calling the send hook).
 *
 * The packet is encapsulated through vxlan_xmit_one() only when the device
 * does not have VXLAN_F_PROXY, does have VXLAN_F_COLLECT_METADATA, and the
 * skb carries tunnel info whose mode includes IP_TUNNEL_INFO_TX.  In every
 * other case a warning is printed and the skb is freed.
 *
 * Return: always NETDEV_TX_OK.
 */
netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
{
	struct net_device *vxlan_netdev = skb->dev;
	/* Private area of the net_device (article note: filled in at port creation). */
	struct vxlan_dev *vxlan = netdev_priv(vxlan_netdev);
	/* Tunnel metadata attached to the skb (article note: set by execute_set_action). */
	const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);

	skb_reset_mac_header(skb);

	if (!(vxlan->flags & VXLAN_F_PROXY) &&
	    (vxlan->flags & VXLAN_F_COLLECT_METADATA) &&
	    tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX)) {
		vxlan_xmit_one(skb, vxlan_netdev, NULL, false);
		return NETDEV_TX_OK;
	}

	pr_warn("vxlan: unsupported flag set %x", vxlan->flags);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}
3、vxlan_xmit_one函數
/*
 * vxlan_xmit_one - resolve the tunnel parameters for one packet, route it
 * and hand it to the VXLAN encapsulation helper.
 *
 * @skb:     packet to encapsulate.  On every error path in this function
 *           the skb is released with dev_kfree_skb(); when vxlan_xmit_skb()
 *           fails the skb is treated as already freed.
 * @dev:     vxlan net_device the packet is sent through.
 * @rdst:    explicit remote destination, or NULL.  rpl_vxlan_xmit() always
 *           passes NULL, in which case the skb's tunnel info supplies the
 *           destination address, port and VNI.
 * @did_rsc: only consulted to decide whether to warn when the destination
 *           address is the "any" address.
 *
 * Error accounting: the "drop" label bumps tx_dropped, "tx_error" bumps
 * tx_errors, and "rt_tx_error" additionally releases the IPv4 route.
 */
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			   struct vxlan_rdst *rdst, bool did_rsc)
{
	struct ip_tunnel_info *info;
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct sock *sk = vxlan->vn_sock->sock->sk;
	/* Address family (IPv4 or IPv6) as reported for the vxlan socket. */
	unsigned short family = vxlan_get_sk_family(vxlan->vn_sock);
	struct rtable *rt = NULL;
	const struct iphdr *old_iph;
	struct flowi4 fl4;
	union vxlan_addr *dst;
	union vxlan_addr remote_ip;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 src_port = 0, dst_port;
	u32 vni;
	__be16 df = 0;
	__u8 tos, ttl;
	int err;
	u32 flags = vxlan->flags;

	/*
	 * md may keep pointing at _md when tunnel info is present but carries
	 * no options; zero it so later readers never see stack garbage.
	 */
	memset(&_md, 0, sizeof(_md));

	/* Tunnel info attached to the skb, if any. */
	info = skb_tunnel_info(skb);

	if (rdst) {
		/* Not taken on the OVS path: rpl_vxlan_xmit passes rdst == NULL. */
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
		vni = rdst->remote_vni;
		dst = &rdst->remote_ip;
	} else {
		/* Without rdst the tunnel info is mandatory. */
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
		/* Socket and tunnel info must agree on the address family. */
		if (family != ip_tunnel_info_af(info))
			goto drop;

		/* Destination port: tunnel info first, configured port as fallback. */
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		/* VNI is taken from the tunnel id. */
		vni = be64_to_cpu(info->key.tun_id);
		remote_ip.sa.sa_family = family;
		/* Remote IP address is taken from the tunnel info. */
		if (family == AF_INET)
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
		else
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
		dst = &remote_ip;
	}

	/* An "any" (all-zero) destination address is not supported here. */
	if (vxlan_addr_any(dst)) {
		if (did_rsc) {
			/* short-circuited back to local bridge */
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
		}
		goto drop;
	}

	/* IP header currently in the skb, i.e. before encapsulation. */
	old_iph = ip_hdr(skb);

	/* Start from the configured ttl; multicast destinations default to 1. */
	ttl = vxlan->cfg.ttl;
	if (!ttl && vxlan_addr_multicast(dst))
		ttl = 1;

	/* Start from the configured tos; the value 1 takes it from the packet. */
	tos = vxlan->cfg.tos;
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	/* Pick the UDP source port within the configured range. */
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);

	if (info) {
		if (info->key.tun_flags & TUNNEL_CSUM)
			flags |= VXLAN_F_UDP_CSUM;
		else
			flags &= ~VXLAN_F_UDP_CSUM;

		/* Tunnel info overrides the ttl and tos chosen above. */
		ttl = info->key.ttl;
		tos = info->key.tos;

		if (info->options_len)
			md = ip_tunnel_info_opts(info);
	} else {
		md->gbp = skb->mark;
	}

	if (dst->sa.sa_family == AF_INET) {
		/* IPv4 underlay. */
		if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
			df = htons(IP_DF);

		memset(&fl4, 0, sizeof(fl4));
		fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
		fl4.flowi4_tos = RT_TOS(tos);
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_proto = IPPROTO_UDP;
		fl4.daddr = dst->sin.sin_addr.s_addr;
		fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;

		/* Route lookup for the outer destination. */
		rt = ip_route_output_key(vxlan->net, &fl4);
		if (IS_ERR(rt)) {
			netdev_dbg(dev, "no route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (rt_dst(rt).dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.collisions++;
			goto rt_tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		/* (article note: not taken in the normal case) */
		if (rt->rt_flags & RTCF_LOCAL &&
		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			/* Route is released here, so only tx_error is used below. */
			ip_rt_put(rt);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
			goto tx_error;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		/* A zero ttl falls back to the hop limit of the route. */
		ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt));
		/* Encapsulate and send. */
		err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
				     dst->sin.sin_addr.s_addr, tos, ttl, df,
				     src_port, dst_port, htonl(vni << 8), md,
				     !net_eq(vxlan->net, dev_net(vxlan->dev)),
				     flags);
		if (err < 0) {
			/* skb is already freed. */
			skb = NULL;
			goto rt_tx_error;
		}

		iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		/* IPv6 underlay (not analysed in the article). */
		struct dst_entry *ndst;
		struct flowi6 fl6;
		u32 rt6i_flags;

		memset(&fl6, 0, sizeof(fl6));
		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
		fl6.daddr = dst->sin6.sin6_addr;
		fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_proto = IPPROTO_UDP;

#ifdef HAVE_IPV6_DST_LOOKUP_NET
		if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
#else
#ifdef HAVE_IPV6_STUB
		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
#else
		ndst = ip6_route_output(vxlan->net, sk, &fl6);
		if (ndst->error) {
#endif
#endif
			netdev_dbg(dev, "no route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (ndst->dev == dev) {
			netdev_dbg(dev, "circular route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dst_release(ndst);
			dev->stats.collisions++;
			goto tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
		if (rt6i_flags & RTF_LOCAL &&
		    !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			dst_release(ndst);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
				  dev->name);
			goto tx_error;
		}

		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
				      0, ttl, src_port, dst_port, htonl(vni << 8), md,
				      !net_eq(vxlan->net, dev_net(vxlan->dev)),
				      flags);
#endif
	}

	return;

drop:
	dev->stats.tx_dropped++;
	goto tx_free;

rt_tx_error:
	ip_rt_put(rt);
tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
}
4、vxlan_xmit_skb函數
/*
 * vxlan_xmit_skb - prepend the VXLAN header to a packet and pass it on to
 * the UDP tunnel transmit helper.
 *
 * @rt:       route for the outer packet; used here for headroom sizing and
 *            forwarded to udp_tunnel_xmit_skb().
 * @sk:       socket forwarded to udp_tunnel_xmit_skb().
 * @skb:      packet to encapsulate.
 * @src/@dst: outer IPv4 source and destination addresses.
 * @tos/@ttl/@df: outer IP header fields.
 * @src_port/@dst_port: outer UDP ports.
 * @vni:      value stored verbatim in the VXLAN header's vx_vni field.
 * @md:       metadata handed to vxlan_build_gbp_hdr() when VXLAN_F_GBP is set.
 * @xnet:     forwarded unchanged to udp_tunnel_xmit_skb().
 * @vxflags:  VXLAN_F_* flags selecting checksum, remote checksum offload
 *            and GBP behaviour.
 *
 * Return: the result of udp_tunnel_xmit_skb(), or a negative errno when an
 * earlier step fails.  The explicit failure paths here free the skb; the
 * caller in this file treats any negative return as "skb already freed".
 */
static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
			  __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
			  __be16 src_port, __be16 dst_port, __be32 vni,
			  struct vxlan_metadata *md, bool xnet, u32 vxflags)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	/* Article note: false by default. */
	bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
	int type = 0;

	/* Remote checksum offload; article note: not taken by default. */
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check))) {
			udp_sum = false;
			type |= SKB_GSO_TUNNEL_REMCSUM;

			/*
			 * NOTE(review): presumably SKB_GSO_TUNNEL_REMCSUM is
			 * defined as 0 on kernels without the feature, making
			 * this a compile-time "unsupported" check - confirm.
			 */
			if (!SKB_GSO_TUNNEL_REMCSUM) {
				kfree_skb(skb);
				return -EOPNOTSUPP;
			}
		}
	}

	/* Room needed for the outer headers that will be added in front. */
	min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
			+ VXLAN_HLEN + sizeof(struct iphdr)
			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err)) {
		kfree_skb(skb);
		return err;
	}

	/* Move a hardware-accelerated vlan tag into the packet data. */
	skb = vlan_hwaccel_push_inside(skb);
	if (WARN_ON(!skb))
		return -ENOMEM;

	/* Article note: sets the inner header offsets and the encapsulation flag. */
	skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true);
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	/* Prepend the VXLAN header and fill in flags and VNI. */
	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = htonl(VXLAN_HF_VNI);
	vxh->vx_vni = vni;

	/* Remote checksum offload encoding; article note: not taken by default. */
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
		u16 hdrlen = sizeof(struct vxlanhdr);
		/* Checksum start relative to the end of the VXLAN header, scaled. */
		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
			   VXLAN_RCO_SHIFT;

		if (skb->csum_offset == offsetof(struct udphdr, check))
			data |= VXLAN_RCO_UDP;

		/* The offload data is OR-ed into the vx_vni field. */
		vxh->vx_vni |= htonl(data);
		vxh->vx_flags |= htonl(VXLAN_HF_RCO);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
			skb->encapsulation = 0;
#endif
		}
	}
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);

	/* Record the inner protocol as ETH_P_TEB. */
	ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));

	/* Add the UDP header and send; the last argument disables the UDP checksum. */
	return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
				   ttl, df, src_port, dst_port, xnet,
				   !(vxflags & VXLAN_F_UDP_CSUM));
}
5、udp_tunnel_xmit_skb函數
/*
 * rpl_udp_tunnel_xmit_skb - prepend a UDP header and pass the packet on to
 * iptunnel_xmit().
 * (Article note: used on kernels older than 3.18, where OVS supplies its
 * own implementation.)
 *
 * @rt, @sk, @tos, @ttl, @df, @xnet: forwarded unchanged to iptunnel_xmit().
 * @skb:      packet whose data currently starts at the UDP payload.
 * @src/@dst: outer IPv4 addresses, used for the checksum and forwarded.
 * @src_port/@dst_port: written into the UDP header as given.
 * @nocheck:  passed to udp_set_csum() as its first argument.
 *
 * Return: whatever iptunnel_xmit() returns.
 */
int rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk,
			    struct sk_buff *skb, __be32 src, __be32 dst,
			    __u8 tos, __u8 ttl, __be16 df, __be16 src_port,
			    __be16 dst_port, bool xnet, bool nocheck)
{
	struct udphdr *udph;

	/* Make room for the UDP header and mark it as the transport header. */
	__skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);
	udph->source = src_port;
	udph->dest = dst_port;
	/* skb->len now covers the UDP header plus everything behind it. */
	udph->len = htons(skb->len);

	/* Fill in the UDP checksum field. */
	udp_set_csum(nocheck, skb, src, dst, skb->len);

	/* Add the IP header and send. */
	return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
			     tos, ttl, df, xnet);
}
6、iptunnel_xmit函數
/*
 * rpl_iptunnel_xmit - prepend the outer IPv4 header and send the packet
 * through ip_local_out().
 * (Article note: used on kernels older than 3.18, where OVS supplies its
 * own implementation.)
 *
 * @sk:    not referenced in this function body.
 * @rt:    route for the outer packet; installed as the skb's dst.
 * @skb:   packet whose data currently starts at the tunnel (e.g. UDP) header.
 * @src/@dst/@proto/@tos/@ttl/@df: copied into the new IP header.
 * @xnet:  passed to skb_scrub_packet().
 *
 * Return: skb->len as it was on entry (before the IP header was added),
 * or 0 when net_xmit_eval() of the ip_local_out() result is non-zero.
 */
int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
                      __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl,
                      __be16 df, bool xnet)
{
	/* Sampled before the IP header is pushed. */
	int payload_len = skb->len;
	struct iphdr *outer;
	int rc;

	skb_scrub_packet(skb, xnet);
	skb_clear_hash(skb);
	skb_dst_set(skb, &rt_dst(rt));

	/*
	 * IPCB(skb) is intentionally not cleared here: do not clear
	 * ovs_skb_cb, it will be done in gso code.
	 */

	/* Push down and install the IP header. */
	__skb_push(skb, sizeof(*outer));
	skb_reset_network_header(skb);
	outer = ip_hdr(skb);

	outer->version = 4;
	outer->ihl = sizeof(struct iphdr) >> 2;
	outer->tos = tos;
	outer->frag_off = df;
	outer->ttl = ttl;
	outer->protocol = proto;
	outer->saddr = src;
	outer->daddr = dst;

	/* Choose the IP ID with the kernel helper; its signature varies by kernel. */
#ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY
	__ip_select_ident(outer, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
#elif defined(HAVE_IP_SELECT_IDENT_USING_NET)
	__ip_select_ident(dev_net(rt->dst.dev), outer,
			  skb_shinfo(skb)->gso_segs ?: 1);
#else
	__ip_select_ident(outer, skb_shinfo(skb)->gso_segs ?: 1);
#endif

	/* Send the packet. */
	rc = ip_local_out(skb);
	return unlikely(net_xmit_eval(rc)) ? 0 : payload_len;
}
7、ip_local_out函數
/*
 * rpl_ip_local_out - OVS replacement for ip_local_out() that segments GSO
 * packets in software before sending them.
 * (Article note: used on kernels older than 3.18, where OVS supplies its
 * own implementation.)
 *
 * @skb: packet with the outer IP header already in place.  This function
 *       consumes it: it is either handed to output_ip() (directly or as
 *       individual segments) or freed on failure.
 *
 * When OVS_GSO_CB(skb)->fix_segment is unset the packet is sent as is.
 * Otherwise a GSO packet is split with tnl_skb_gso_segment() and each
 * segment gets a consecutive IP ID starting from the original one; a
 * non-GSO packet with CHECKSUM_PARTIAL has its checksum completed first.
 *
 * Return: the output_ip() result of the last packet sent, or 0 when
 * segmentation or checksum completion fails.
 */
int rpl_ip_local_out(struct sk_buff *skb)
{
	int ret = NETDEV_TX_OK;
	int id = -1;

	/*
	 * No fix_segment: send directly without segmenting here.
	 * (Article note: by default segmentation is performed.)
	 */
	if (!OVS_GSO_CB(skb)->fix_segment)
		return output_ip(skb);

	if (skb_is_gso(skb)) {
		struct iphdr *iph;

		iph = ip_hdr(skb);
		/* Remember the ID so segments can be numbered consecutively. */
		id = ntohs(iph->id);
		/* Software GSO segmentation. */
		skb = tnl_skb_gso_segment(skb, 0, false);
		if (!skb || IS_ERR(skb))
			return 0;
	}  else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		int err;

		err = skb_checksum_help(skb);
		if (unlikely(err)) {
			/*
			 * skb_checksum_help() leaves the skb with the caller
			 * on failure, so release it here instead of leaking.
			 */
			kfree_skb(skb);
			return 0;
		}
	}

	while (skb) {
		struct sk_buff *next_skb = skb->next;
		struct iphdr *iph;

		/* Detach the segment from the list before sending it. */
		skb->next = NULL;

		iph = ip_hdr(skb);
		/* id stays -1 for non-GSO packets, whose ID is left untouched. */
		if (id >= 0)
			iph->id = htons(id++);

		/* Send this packet or segment. */
		ret = output_ip(skb);
		skb = next_skb;
	}
	return ret;
}
8、output_ip函數
/*
 * output_ip - clear the IP control block and send one packet through the
 * kernel's ip_local_out().
 *
 * @skb: packet to send.
 *
 * Return: NETDEV_TX_OK, or the ip_local_out() result when net_xmit_eval()
 * of that result is non-zero.
 */
static int output_ip(struct sk_buff *skb)
{
	int rc;

	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	/*
	 * Drop any macro alias so the call below reaches the Linux kernel's
	 * own ip_local_out().
	 * NOTE(review): presumably ip_local_out is #defined to
	 * rpl_ip_local_out elsewhere - confirm.
	 */
#undef ip_local_out
	rc = ip_local_out(skb);

	return unlikely(net_xmit_eval(rc)) ? rc : NETDEV_TX_OK;
}

原文鏈接:https://blog.csdn.net/one_clouder/article/details/52600516

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章