慢啓動階段的擁塞窗口增長函數tcp_slow_start如下。擁塞窗口以報文數量表示,參數acked表示當前ACK報文確認的數據包數量。如果擁塞窗口增加acked之後不超過慢啓動閾值ssthresh,則以二者之和作爲新的擁塞窗口值(最終結果還受snd_cwnd_clamp上限約束)。內核沒有使用RFC3465中定義的ABC(Appropriate Byte Counting)算法,該算法最初是針對以字節表示擁塞窗口的系統設計的。內核按報文計數,要求對數據報文的確認必須覆蓋整個報文,因此同樣可以抵禦ACK Division攻擊。
/*
 * Slow-start window growth.
 *
 * Adds @acked to tp->snd_cwnd, but never lets the sum pass
 * tp->snd_ssthresh; the value finally stored is additionally limited by
 * tp->snd_cwnd_clamp.
 *
 * Returns the part of @acked that was not needed to reach the
 * ssthresh-limited window (0 when the whole count was absorbed).
 */
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
	u32 prev_cwnd = tp->snd_cwnd;
	u32 target = min(prev_cwnd + acked, tp->snd_ssthresh);
	/* Computed against the ssthresh cap, before the clamp is applied. */
	u32 leftover = acked - (target - prev_cwnd);

	tp->snd_cwnd = min(target, tp->snd_cwnd_clamp);

	return leftover;
}
如果snd_cwnd增加acked之後大於慢啓動閾值ssthresh,則將擁塞窗口CWND設置爲ssthresh的值;此時函數tcp_slow_start返回慢啓動階段未用完的剩餘確認報文數量(即進入擁塞避免階段後應計入的部分),交由擁塞算法繼續處理。
ACK確認數據報文數量統計
如下函數tcp_clean_rtx_queue遍歷重傳隊列,計算當前ACK報文確認的數據報文數量(acked_pcount)。SlowStart使用被確認的數據報文數量來決定擁塞窗口,而不是協議中規定的ACK報文數量,所以,對於Stretch-ACKs,內核將其視爲多個背靠背的ACK報文。
static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
u32 prior_snd_una, struct tcp_sacktag_state *sack)
{
for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
const u32 start_seq = scb->seq;
u8 sacked = scb->sacked;
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 || !after(tp->snd_una, scb->seq))
break;
acked_pcount = tcp_tso_acked(sk, skb);
if (!acked_pcount) break;
fully_acked = false;
} else {
acked_pcount = tcp_skb_pcount(skb);
}
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
} else if (tcp_is_sack(tp)) {
tp->delivered += acked_pcount;
...
}
if (!fully_acked)
break;
在如下的SACK處理函數tcp_sacktag_one中,如果當前數據報文沒有被SACK確認過,此爲第一次被SACK確認,將確認報文數量增加到套接口變量delivered中。
static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq, int dup_sack, int pcount, u64 xmit_time)
{
...
if (!(sacked & TCPCB_SACKED_ACKED)) {
...
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
tp->delivered += pcount; /* Out-of-order packets delivered */
另外,對於Reno,每接收到一個重複ACK,即認爲對端接收了一個數據報文,相應地增加delivered計數。
static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
{
if (num_dupack) {
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
s32 delivered;
tp->sacked_out += num_dupack;
tcp_check_reno_reordering(sk, 0);
delivered = tp->sacked_out - prior_sacked;
if (delivered > 0)
tp->delivered += delivered;
如果隨後到達接收端的報文填補了接收隊列中的空洞,接收端將發送正常的確認ACK,這可能使一部分本應產生的重複ACK不再發送。這裏假設這些未收到的重複ACK所對應的報文已被對端接收,連同填補空洞的報文一併增加到delivered變量中(即acked - sacked_out),因此變量delivered至少將增加1。
static void tcp_remove_reno_sacks(struct sock *sk, int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
tp->delivered += max_t(int, acked - tp->sacked_out, 1);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
tp->sacked_out -= acked - 1;
}
計算確認報文數量
函數tcp_ack的最後使用tcp_newly_delivered計算S/ACK報文最新確認的報文數量。
static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
{
...
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
if (flag & FLAG_ECE) {
tp->delivered_ce += delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
}
return delivered;
將新確認報文數量傳遞到tcp_cong_control函數中。
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
u32 delivered = tp->delivered;
delivered = tcp_newly_delivered(sk, delivered, flag);
...
tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
函數tcp_cong_control最終將調用擁塞算法的函數指針cong_avoid。
/*
 * Congestion-control step of the ACK path (excerpt; "..." marks code that
 * the excerpt leaves out).
 *
 * @sk:           socket handed to every helper called here
 * @ack:          forwarded unchanged to tcp_cong_avoid()
 * @acked_sacked: forwarded unchanged to tcp_cong_avoid() as its third
 *                argument
 * @flag:         consulted by tcp_may_raise_cwnd()
 * @rs:           not referenced in the lines shown
 */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
int flag, const struct rate_sample *rs)
{
...
if (tcp_in_cwnd_reduction(sk)) {
/* Handling for the cwnd-reduction case is elided in this excerpt. */
...
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
tcp_cong_avoid(sk, ack, acked_sacked);
}
/* In the lines shown, this runs whichever branch above was taken. */
tcp_update_pacing_rate(sk);
}
以內核默認的Cubic擁塞算法爲例,bictcp_cong_avoid將會調用開始時介紹的tcp_slow_start函數,傳入新確認的報文數量acked。
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
if (!tcp_is_cwnd_limited(sk))
return;
if (tcp_in_slow_start(tp)) {
if (hystart && after(ack, ca->end_seq))
bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
bictcp_update(ca, tp->snd_cwnd, acked);
tcp_cong_avoid_ai(tp, ca->cnt, acked);
內核版本 5.0