tcp/ip 协议栈Linux源码分析二 IPv4分片报文重组分析二

继续接着上篇讲，之前我们说过，收到分片报文后首先会检查分片报文所占内存是否过大，如果超过阈值的话就要调用ip_evictor函数去释放一些旧的分片队列，关于如何释放分片队列资源上一篇已经总结完成，接下来来看下进一步的处理，即如何查找分片队列的，先看下代码：

	/* Lookup (or create) queue header */
	/* Look up the fragment queue for this packet. The key is built from
	 * the IP header (source address, destination address, IP ID,
	 * protocol) plus `user`. ip_find() returns the matching queue,
	 * creating a new one when nothing matches, or NULL on failure.
	 */
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
		int ret;

		spin_lock(&qp->q.lock);

		/* Queue this fragment on qp while holding the per-queue lock.
		 * NOTE(review): per the accompanying article ip_frag_queue()
		 * also performs reassembly; its body is not shown here.
		 */
		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);

		/* Drop the reference obtained through ip_find().
		 * NOTE(review): ipq_put() is not shown here; presumably it
		 * destroys the queue once the reference count reaches zero.
		 */
		ipq_put(qp);
		return ret;
	}

首先调用ip_find()函数，根据报文的IP ID、源地址、目的地址和协议号计算出一个hash值（user不参与hash计算，只在后面的match比较中使用），然后到hash表对应的桶里查找分片队列：找到匹配的队列就直接返回；找不到并且当前hash桶的深度不超过限值(INETFRAGS_MAXDEPTH)时就新建一个队列；否则返回NULL。

我们看下ip_find()具体的处理流程：

/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 *
 * @net:  network namespace whose IPv4 fragment state is searched
 * @iph:  IP header of the received fragment
 * @user: identifies the caller of the defragmentation code
 *
 * Returns the struct ipq that embeds the located (or newly created)
 * inet_frag_queue, or NULL when inet_frag_find() yields NULL or an
 * error pointer.
 */
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
	struct ip4_create_arg key;
	struct inet_frag_queue *found;
	unsigned int bucket;

	/* The lookup key carries the IP header and the user id; the match
	 * callback compares id, saddr, daddr, protocol and user against it.
	 */
	key.iph = iph;
	key.user = user;

	/* Take the hash table read lock before hashing and searching.
	 * inet_frag_find() is annotated __releases() and drops this lock
	 * on every path, so there is no read_unlock() here.
	 */
	read_lock(&ip4_frags.lock);

	/* Only id/saddr/daddr/protocol feed the hash; user does not. */
	bucket = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

	/* Search the bucket; the key disambiguates queues that share a
	 * hash value. A queue is created when none matches.
	 */
	found = inet_frag_find(&net->ipv4.frags, &ip4_frags, &key, bucket);
	if (!IS_ERR_OR_NULL(found)) {
		/* struct ipq embeds struct inet_frag_queue as member `q`. */
		return container_of(found, struct ipq, q);
	}

	inet_frag_maybe_warn_overflow(found, pr_fmt());
	return NULL;
}

接着看inet_frag_find 分片队列查找函数的实现：

/* Look up a fragment queue in one hash bucket, creating it if absent.
 *
 * @nf:   per-namespace fragment state; only queues with q->net == nf match
 * @f:    protocol fragment descriptor (hash table, lock, callbacks)
 * @key:  opaque lookup key handed to f->match() / inet_frag_create()
 * @hash: index of the bucket in f->hash[] to search
 *
 * Must be entered with f->lock read-held; the lock is released on every
 * return path (see the __releases annotation).
 *
 * Returns the matching queue with its refcnt incremented, the result of
 * inet_frag_create() when nothing matched and the bucket is not too deep,
 * or ERR_PTR(-ENOBUFS) when more than INETFRAGS_MAXDEPTH entries were
 * walked without a match.
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
		struct inet_frags *f, void *key, unsigned int hash)
	__releases(&f->lock)
{
	struct inet_frag_queue *q;
	struct hlist_node *n;
	int depth = 0;

	/* Walk f->hash[hash] and let the match callback compare each queue
	 * with the key. On a hit, take a reference, drop the read lock and
	 * return the queue. Every non-matching entry adds one to depth.
	 * Per the article, the IPv4 match callback is ip4_frag_match() in
	 * ip_fragment.c.
	 */
	hlist_for_each_entry(q, n, &f->hash[hash], list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			read_unlock(&f->lock);
			return q;
		}
		depth++;
	}
	read_unlock(&f->lock);

	/* No match. Create a new queue unless the bucket is already deeper
	 * than the limit, in which case report -ENOBUFS. Per the article,
	 * the first fragment of a datagram normally takes this path.
	 */
	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);
	else
		return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);

match回调函数的处理很简单，就是把报文的五元组（IP ID、源地址、目的地址、协议号以及user）和分片队列中保存的值逐项比较：

/* Match callback: report whether fragment queue @q belongs to the
 * datagram described by @a (a struct ip4_create_arg).
 *
 * Returns 1 when id, saddr, daddr, protocol and user are all equal,
 * 0 otherwise.
 */
static int ip4_frag_match(struct inet_frag_queue *q, void *a)
{
	struct ip4_create_arg *key = a;
	struct ipq *entry = container_of(q, struct ipq, q);

	if (entry->id != key->iph->id)
		return 0;
	if (entry->saddr != key->iph->saddr)
		return 0;
	if (entry->daddr != key->iph->daddr)
		return 0;
	if (entry->protocol != key->iph->protocol)
		return 0;

	return entry->user == key->user;
}

这里重点关注下 inet_frag_create 函数：

/* Create a fragment queue for @arg and publish it.
 *
 * Allocates and initialises the queue with inet_frag_alloc(), then
 * hands it to inet_frag_intern() for insertion. Returns NULL when the
 * allocation fails, otherwise whatever inet_frag_intern() returns.
 */
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	struct inet_frag_queue *fresh = inet_frag_alloc(nf, f, arg);

	if (!fresh)
		return NULL;

	/* Link the new queue into the hash table and onto the LRU list. */
	return inet_frag_intern(nf, fresh, f, arg);
}

inet_frag_alloc就是创建一个分片队列缓存然后初始化：

/* Allocate and initialise a fragment queue that is not yet linked
 * into the hash table.
 *
 * Returns the new queue with refcnt set to 1, or NULL when the
 * allocation fails.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
		struct inet_frags *f, void *arg)
{
	/* f->qsize is the fixed size of one queue object; per the article
	 * it equals sizeof(struct ipq) for IPv4.
	 */
	struct inet_frag_queue *fq = kzalloc(f->qsize, GFP_ATOMIC);

	if (!fq)
		return NULL;

	/* Protocol-specific initialisation from the lookup key. Per the
	 * article, the IPv4 constructor is ip4_frag_init(), installed in
	 * ipfrag_init().
	 */
	f->constructor(fq, arg);

	/* Account the queue object against the namespace memory counter. */
	atomic_add(f->qsize, &nf->mem);

	/* Prepare (but do not arm) the reassembly timer. Per the article,
	 * the IPv4 expiry handler is ip_expire(), which releases a queue
	 * whose reassembly timed out; the timeout is tunable through proc.
	 */
	setup_timer(&fq->timer, f->frag_expire, (unsigned long)fq);
	spin_lock_init(&fq->lock);

	/* The caller owns the initial reference. */
	atomic_set(&fq->refcnt, 1);
	fq->net = nf;

	return fq;
}

上面分片队列创建完成之后，还要调用inet_frag_intern（）函数将分片队列插入到hash数组中，看下这个函数的处理：

/* Insert a freshly allocated fragment queue into the hash table and the
 * per-namespace LRU list.
 *
 * @nf:    per-namespace fragment state
 * @qp_in: queue produced by inet_frag_alloc(), not yet linked anywhere
 * @f:     protocol fragment descriptor (hash table, lock, callbacks)
 * @arg:   lookup key, used to re-check for a duplicate on SMP
 *
 * Returns @qp_in after linking it in, or (CONFIG_SMP only) an already
 * present matching queue, in which case @qp_in is released instead.
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
		struct inet_frag_queue *qp_in, struct inet_frags *f,
		void *arg)
{
	struct inet_frag_queue *qp;
#ifdef CONFIG_SMP
	struct hlist_node *n;
#endif
	unsigned int hash;

	/* The hash table is about to be modified, so take the write lock. */
	write_lock(&f->lock);
	/*
	 * While we stayed w/o the lock other CPU could update
	 * the rnd seed, so we need to re-calculate the hash
	 * chain. Fortunately the qp_in can be used to get one.
	 */
	/*
	 * Per the article, for IPv4 the hashfn callback is ip4_hashfn(),
	 * installed in ipfrag_init().
	 */
	hash = f->hashfn(qp_in);
#ifdef CONFIG_SMP
	/* With SMP race we have to recheck hash table, because
	 * such entry could be created on other cpu, while we
	 * promoted read lock to write lock.
	 *
	 * If another CPU already inserted a queue for the same datagram,
	 * take a reference on that queue and return it. Our own queue is
	 * flagged INET_FRAG_COMPLETE and released via inet_frag_put().
	 */

	hlist_for_each_entry(qp, n, &f->hash[hash], list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			write_unlock(&f->lock);
			qp_in->last_in |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	/* Arm the reassembly timeout. An extra reference is taken when
	 * mod_timer() returns 0.
	 * NOTE(review): presumably 0 means the timer was not pending, so
	 * this reference belongs to the armed timer; confirm against the
	 * timer API.
	 */
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);

	/* NOTE(review): presumably the reference held by the hash table. */
	atomic_inc(&qp->refcnt);

	/* Link at the head of the hash bucket. */
	hlist_add_head(&qp->list, &f->hash[hash]);

	/* Append to the tail of the LRU list. Per the article, when
	 * fragment memory grows too large the kernel frees queues starting
	 * from the LRU head, i.e. oldest first.
	 */
	list_add_tail(&qp->lru_list, &nf->lru_list);

	/* One more queue in this namespace. */
	nf->nqueues++;

	/* Insertion done; release the write lock. */
	write_unlock(&f->lock);
	return qp;
}

在多核处理情况下可能会重复创建分片队列，这时候后创建的那个分片队列会通过调用inet_frag_put()函数进行释放。这是个静态内联函数，定义在inet_frag.h里：

/* Drop one reference on fragment queue @q; destroy the queue through
 * inet_frag_destroy() when that was the last reference.
 */
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
{
	/* Other holders remain: nothing more to do. */
	if (!atomic_dec_and_test(&q->refcnt))
		return;

	inet_frag_destroy(q, f, NULL);
}

inet_frag_destroy()函数上篇博客已介绍，这里不再重复。

ip_find（）函数返回后就已经得到了该报文所对应的分片队列，这时候再调用ip_frag_queue()进行进一步处理，当然，ip_find也有可能返回失败，这个时候就只能释放该报文skb缓存。

ip_frag_queue函数主要进行分片报文的排队、重组处理，这里需要处理多种异常情况，函数比较长，今晚就先不讲了，放在下篇讲。

tcp/ip 协议栈Linux源码分析二 IPv4分片报文重组分析二

《日本蜡烛图》读书笔记 & 技术分析回测

《期货-市场技术分析》读书笔记

Python多线程编程深度探索：从入门到实战

mongodb处理json数据很好

[转帖]cpupower

35K*14 薪，入职了！这公司只要不裁员，我能一直呆下去！

Linux進程間通信五 Posix 信號量簡介與示例

Linux進程間通信四 Posix 消息隊列簡介與示例

Linux進程間通信六 Posix 共享內存簡介與示例

Linux進程間通信三 System V 信號量簡介與示例

面試淺談 c++ 的空間兩級配置器

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結