TCP客户端端口号选择

如下函数inet_hash_connect，如果套接口尚未绑定本地端口号（inet_num为0），在发起连接的时候，由函数inet_sk_port_offset先计算一个端口偏移量（port_offset），再由函数__inet_hash_connect负责选择并绑定端口。

/*
 * Hash a socket that is initiating a connection.
 *
 * When the socket has no local port yet (inet_num == 0) a port offset is
 * derived from the socket via inet_sk_port_offset(); otherwise the offset
 * stays 0. The actual work is delegated to __inet_hash_connect(), with
 * __inet_check_established as the port-uniqueness callback.
 *
 * Returns whatever __inet_hash_connect() returns.
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk)
{
    /* A non-zero inet_num means a local port is already set: no offset needed. */
    u32 port_offset = inet_sk(sk)->inet_num ? 0 : inet_sk_port_offset(sk);

    return __inet_hash_connect(death_row, sk, port_offset,
                   __inet_check_established);
}

端口偏移量根据套接口的本地地址（inet_rcv_saddr）、目的地址（inet_daddr）和目的端口（inet_dport）生成。

/*
 * Compute the per-connection port offset for @sk.
 *
 * The value is produced by secure_ipv4_port_ephemeral() from the socket's
 * inet_rcv_saddr, inet_daddr and inet_dport fields.
 */
static u32 inet_sk_port_offset(const struct sock *sk)
{
    const struct inet_sock *isk = inet_sk(sk);
    u32 offset;

    offset = secure_ipv4_port_ephemeral(isk->inet_rcv_saddr,
                                        isk->inet_daddr,
                                        isk->inet_dport);
    return offset;
}

如下函数__inet_hash_connect，如果已经指定了端口号（port），直接取得套接口已关联的端口绑定结构体inet_bind_bucket（icsk_bind_hash）。如果此结构体（tb）的拥有者owners链表中的首个套接口等于当前套接口，并且拥有者owners仅有一个（即只有当前套接口绑定此端口号），直接将套接口加入ehash链表（TCP客户端不使用listen链表）。

否则，如果tb中有多个套接口，或者tb的拥有者链表的首位不是当前套接口，由指针函数check_established检查此端口号是否被使用，指针函数实际上为函数__inet_check_established，稍后介绍。

/*
 * Pick (or validate) the local port for a connecting socket and hash it.
 *
 * @death_row:         supplies the inet_hashinfo (death_row->hashinfo).
 * @sk:                socket being connected.
 * @port_offset:       caller-supplied offset mixed into the start of the
 *                     port search when no port is set yet.
 * @check_established: callback that reports (0 == usable) whether a port
 *                     can be used by @sk; may hand back a timewait socket
 *                     through its last argument.
 *
 * This first part handles the case where the socket already has a local
 * port (inet_num != 0).
 *
 * NOTE(review): this excerpt does not show the declarations of ret, i, low,
 * high, remaining, offset and l3mdev that the code uses.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
        struct sock *sk, u32 port_offset,
        int (*check_established)(struct inet_timewait_death_row *,
            struct sock *, __u16, struct inet_timewait_sock **))
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    struct inet_timewait_sock *tw = NULL;
    struct inet_bind_hashbucket *head;
    int port = inet_sk(sk)->inet_num;
    struct net *net = sock_net(sk);
    struct inet_bind_bucket *tb;
	static u32 hint; /* static: persists across calls, advanced at the ok: label */

    if (port) {
        /* Port already set: locate its bind-hash bucket and the socket's tb. */
        head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
        tb = inet_csk(sk)->icsk_bind_hash;
        spin_lock_bh(&head->lock);
        /* sk is the first owner and has no successor: it is the only owner. */
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
            inet_ehash_nolisten(sk, NULL);
            spin_unlock_bh(&head->lock);
            return 0;
        }
        /* Drop the lock only; BHs stay disabled until local_bh_enable() below. */
        spin_unlock(&head->lock);
        /* No definite answer... Walk to established hash table */
        ret = check_established(death_row, sk, port, NULL);
        local_bh_enable();
        return ret;
    }

以下是没有指定端口号，进行自动分配的情况。首先确保端口的数量值（remaining = high - low）为偶数；其次，hint为一个静态变量，每次成功选中端口后累加本次遍历的步进值（i + 2），使得下一次搜索从上一次选中位置之后开始。而port_offset为以上函数inet_sk_port_offset生成的哈希值。根据以上两个值（hint和port_offset）生成一个偏移量offset，并确保其为偶数值。在此函数中，首先尝试端口范围内与low的奇偶性相同的端口号，offset为偶数，保证不会改变之后port的奇偶性（加/减偶数，奇偶性不改变）。同理，对于偶数变量remaining，其也不会改变port的奇偶性。

与此不同，在函数inet_csk_find_open_port中，优先选择与low奇偶性不同的端口号。

    /* No port set yet: prepare the search range and the starting offset. */
    l3mdev = inet_sk_bound_l3mdev(sk);

    inet_get_local_port_range(net, &low, &high);
    high++;                /* [32768, 60999] -> [32768, 61000[ */
    remaining = high - low;
    /* Round the range size down to an even number so stepping by 2 keeps parity. */
    if (likely(remaining > 1))
        remaining &= ~1U;

    /* Start position combines the static hint with the caller's port_offset. */
    offset = (hint + port_offset) % remaining;
    /* In first pass we try ports of @low parity. inet_csk_get_port() does the opposite choice.
     */
    offset &= ~1U;

以下开始端口号遍历过程，根据端口号和网络命名空间定位到inet_bind_bucket结构链表，遍历其中的每个tb结构，查找网络命名空间、l3mdev和端口号都匹配的tb。如果该tb的fastreuse或者fastreuseport的值大于等于0（本函数创建的tb会将这两个成员设置为-1，因此大于等于0说明该tb并非由本函数创建），表明此端口号已被通过bind()绑定的套接口占用，放弃此端口号，开始遍历下一个端口号。

否则，使用指针函数check_established检查端口号是否可用，参见下节对函数__inet_check_established的介绍。

/* Scan candidate ports of one parity, starting at low + offset, step 2. */
other_parity_scan:
    port = low + offset;
    for (i = 0; i < remaining; i += 2, port += 2) {
        /* Wrap around to the beginning of the range. */
        if (unlikely(port >= high))
            port -= remaining;
        /* Reserved ports are skipped before any lock is taken. */
        if (inet_is_local_reserved_port(net, port))
            continue;
        head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)];
        spin_lock_bh(&head->lock);

        /* Does not bother with rcv_saddr checks, because the established check is already unique enough.
         */
        inet_bind_bucket_for_each(tb, &head->chain) {
            if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && tb->port == port) {
                /*
                 * Buckets created by this function carry -1 in both fields;
                 * a value >= 0 means the bucket was set up elsewhere, so
                 * this port is skipped.
                 */
                if (tb->fastreuse >= 0 || tb->fastreuseport >= 0)
                    goto next_port;
                WARN_ON(hlist_empty(&tb->owners));
                /* 0 from the callback means the port is usable for sk. */
                if (!check_established(death_row, sk, port, &tw))
                    goto ok;
                goto next_port;
            }
        }

流程走到以下部分，表明端口号对应的inet_bind_bucket结构还没有创建，端口可用。以下创建tb结构，将其成员fastreuse和fastreuseport设置为-1，表示客户端地址/端口号不可重用。

        /* No matching bucket was found for this port: create one. */
        tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net, head, port, l3mdev);
        if (!tb) {
            spin_unlock_bh(&head->lock);
            return -ENOMEM;
        }
        /* Mark the new bucket with -1; the scan above skips buckets whose values are >= 0. */
        tb->fastreuse = -1;
        tb->fastreuseport = -1;
        goto ok;
next_port:
        /* Port rejected: release the bucket lock and try the next candidate. */
        spin_unlock_bh(&head->lock);
        cond_resched();
    }

如果在遍历所有的与low奇偶性相同的端口号时，没有找到可用的端口号，offset自增后变为奇数（offset & 1为真），跳转回去，在端口范围内遍历所有与low的奇偶性不同的端口号。如果第二轮遍历也没有找到可用端口，offset再次自增后恢复为偶数（offset & 1为假），表明所有端口号已经遍历完成，返回地址不可用的错误码EADDRNOTAVAIL。

    /*
     * The first pass ran with an even offset; incrementing makes it odd and
     * triggers one more pass over the other parity. After that pass the
     * offset is even again and the search gives up.
     */
    offset++;
    if ((offset & 1) && remaining > 1)
        goto other_parity_scan;

    return -EADDRNOTAVAIL;

变量hint累加本次遍历所走过的步进值（i + 2），使得下一次调用时的搜索起点位于本次选中的端口之后。最后，将当前套接口与tb绑定，并添加到连接建立哈希链表中。

/* A usable port was found; head->lock is held and tb points at its bucket. */
ok:
    /* Advance the static hint by the distance scanned in this pass plus 2. */
    hint += i + 2;

    /* Head lock still held and bh's disabled */
    inet_bind_hash(sk, tb, port);
    if (sk_unhashed(sk)) {
        /* sk is not hashed yet: record the source port and hash it here. */
        inet_sk(sk)->inet_sport = htons(port);
        inet_ehash_nolisten(sk, (struct sock *)tw);
    }
    if (tw)
        inet_twsk_bind_unhash(tw, hinfo);
    /* Release the lock only; BHs are re-enabled after the timewait cleanup. */
    spin_unlock(&head->lock);
    if (tw)
        inet_twsk_deschedule_put(tw);
    local_bh_enable();
    return 0;

连接建立链表端口检查

连接状态套接口链表的检查函数__inet_check_established如下，使用选取的本地端口号lport，与监听地址，以及目的地址/目的端口号，计算连接建立链表的相应哈希值，定位到对应的哈希链表。

/*
 * Check whether local port @lport can be used by @sk without clashing with
 * a socket already present in the established hash.
 *
 * @death_row: supplies the inet_hashinfo (death_row->hashinfo).
 * @sk:        connecting socket.
 * @lport:     candidate local port.
 * @twp:       if non-NULL, receives the timewait socket being replaced
 *             (or NULL when there is none).
 *
 * Returns 0 when the port is usable (and sk has been added to the chain),
 * -EADDRNOTAVAIL when an identical connection already exists.
 *
 * NOTE(review): this excerpt does not show the declarations of lock, sk2,
 * node and tw that the code uses.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
                    struct sock *sk, __u16 lport, struct inet_timewait_sock **twp)
{
    struct inet_hashinfo *hinfo = death_row->hashinfo;
    struct inet_sock *inet = inet_sk(sk);
    /*
     * Note the swap: daddr takes the socket's inet_rcv_saddr and saddr takes
     * its inet_daddr. Presumably the key is expressed from the point of view
     * of an incoming packet - confirm against the INET_MATCH definition.
     */
    __be32 daddr = inet->inet_rcv_saddr;
    __be32 saddr = inet->inet_daddr;
    int dif = sk->sk_bound_dev_if;
    struct net *net = sock_net(sk);
    int sdif = l3mdev_master_ifindex_by_index(net, dif);
    INET_ADDR_COOKIE(acookie, saddr, daddr);
    const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
    /* Hash over namespace, both addresses and both ports selects the chain. */
    unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport);
    struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);

遍历链表，如果找到一个源地址/目的地址，源端口/目的端口，绑定接口和网络命名空间与当前套接口（sk）都相同的套接口（sk2），表明当前选择的端口号已经被使用，返回错误值EADDRNOTAVAIL。但是，有一个例外情况，如果套接口sk2处于TCP_TIME_WAIT状态，使用函数twsk_unique检查是否可安全的重用其端口号，PROC文件（/proc/sys/net/ipv4/tcp_tw_reuse）控制是否可重用：值为0时禁止重用，值为1时全局允许重用，值为2时仅对回环（loopback）流量允许重用。在内核5.0中其默认值为2。

具体判断由函数twsk_unique完成，如果tcp_tw_reuse值为1，为安全起见，在重用之前，将修改套接口（sk）的发送序号以及timestamps接收时间戳，以抵御sk2套接口残留的报文。

    /* Walk the chain under its lock looking for an identical connection. */
    spin_lock(lock);

    sk_nulls_for_each(sk2, node, &head->chain) {
        /* Cheap pre-filter before the full tuple comparison. */
        if (sk2->sk_hash != hash)
            continue;

        if (likely(INET_MATCH(sk2, net, acookie,
                     saddr, daddr, ports, dif, sdif))) {
            if (sk2->sk_state == TCP_TIME_WAIT) {
                tw = inet_twsk(sk2);
                /* Non-zero from twsk_unique(): stop scanning and take over the tuple. */
                if (twsk_unique(sk, sk2, twp))
                    break;
            }
            /* Same tuple is in use and cannot be taken over. */
            goto not_unique;
        }
    }

到这一步已经确认选择的端口号可以使用，将当前套接口sk添加到连接建立链表中。如果tw有值，将tw套接口由链表中删除。

    /* Must record num and sport now. Otherwise we will see
     * in hash table socket with a funny identity.
     */
    inet->inet_num = lport;
    inet->inet_sport = htons(lport);
    sk->sk_hash = hash;
    WARN_ON(!sk_unhashed(sk));
    __sk_nulls_add_node_rcu(sk, &head->chain);
    if (tw) {
        /* Remove the timewait socket that sk replaces and count the recycle. */
        sk_nulls_del_node_init_rcu((struct sock *)tw);
        __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
    }
    spin_unlock(lock);
    sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

    if (twp) {
        /* Caller asked for the timewait socket: hand it over (may be NULL). */
        *twp = tw;
    } else if (tw) {
        /* Silly. Should hash-dance instead... */
        inet_twsk_deschedule_put(tw);
    }
    return 0;

/* An identical connection exists: release the chain lock and report failure. */
not_unique:
    spin_unlock(lock);
    return -EADDRNOTAVAIL;

内核版本 5.0

TCP客户端端口号选择

连接建立链表端口检查

DPDK-l3fwd示例IPv6測試

測試DPDK示例程序l3fwd

TCP-Westwood擁塞算法

TCP-Hybla擁塞算法

SACK Reneging

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結