網絡設備發送超時定時器

在註冊網絡設備函數register_netdevice中將調用Qdisc初始化函數dev_init_scheduler，這裏將創建設備的watchdog定時器，超時處理函數設置爲dev_watchdog，用來監控網絡設備的發送隊列傳輸超時。

/*
 * dev_init_scheduler - give a device its initial queueing state.
 *
 * Points dev->qdisc at noop_qdisc, hands noop_qdisc to
 * dev_init_scheduler_queue() for every TX queue (and for the ingress
 * queue when one exists), then sets up the TX watchdog timer with
 * dev_watchdog as its callback. The timer is only initialised here;
 * nothing in this function arms it.
 */
void dev_init_scheduler(struct net_device *dev)
{
    dev->qdisc = &noop_qdisc;
    netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
    /* The ingress queue is optional, so initialise it only if present. */
    if (dev_ingress_queue(dev))
        dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

    /* dev_watchdog becomes the timeout handler for this device's TX queues. */
    timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

watchdog的開啓

內核函數__netdev_watchdog_up負責開啓watchdog定時器，如下所示，如果超時時長小於等於0，內核默認成5秒鐘。網卡驅動程序可修改此時長，例如Mellanox的mlx5的驅動將此值設置爲15秒；Intel的網卡驅動基本都是使用5秒的默認值。

另外，如果設備驅動中沒有實現超時處理函數ndo_tx_timeout，這裏並不啓動watchdog定時器。驅動未實現超時處理函數，可能是因爲驅動並不能從導致發送超時的錯誤中恢復。

void __netdev_watchdog_up(struct net_device *dev)
{
    if (dev->netdev_ops->ndo_tx_timeout) {
        if (dev->watchdog_timeo <= 0)
            dev->watchdog_timeo = 5*HZ;
        if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
            dev_hold(dev);

在檢測到設備的物理鏈路UP之後，將開啓設備的watchdog定時器。

void netif_carrier_on(struct net_device *dev)
{
    if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
        if (dev->reg_state == NETREG_UNINITIALIZED)
            return;
        atomic_inc(&dev->carrier_up_count);
        linkwatch_fire_event(dev);
        if (netif_running(dev))
            __netdev_watchdog_up(dev);

或者，在檢測到網絡設備由節能狀態恢復回來，重新接入系統時，開啓watchdog定時器，參見以下函數。

void netif_device_attach(struct net_device *dev)
{
    if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) {
        netif_tx_wake_all_queues(dev);
        __netdev_watchdog_up(dev);

對於Intel的i40e網卡驅動程序，其實現了ndo_tx_timeout函數，即i40e_tx_timeout，並且，其顯式指定了網絡設備的watchdog定時器超時時間爲5秒（這裏與默認的相同）。

static const struct net_device_ops i40e_netdev_ops = {
    ...
    .ndo_tx_timeout     = i40e_tx_timeout,
	
static int i40e_config_netdev(struct i40e_vsi *vsi)
{	
    ...
    netdev->watchdog_timeo = 5 * HZ;

watchdog的關閉

如下，在函數dev_watchdog_down中，內核刪除網絡設備的watchdog定時器。

/*
 * dev_watchdog_down - remove the device's TX watchdog timer.
 *
 * The removal is done while holding the device TX lock
 * (netif_tx_lock_bh/netif_tx_unlock_bh). When del_timer() reports that a
 * timer was actually removed, the device reference is released with
 * dev_put(); this balances the dev_hold() taken where the timer is armed
 * via mod_timer().
 */
static void dev_watchdog_down(struct net_device *dev)
{
    netif_tx_lock_bh(dev);
    /* Drop the reference only if a timer was really deleted. */
    if (del_timer(&dev->watchdog_timer))
        dev_put(dev);
    netif_tx_unlock_bh(dev);
}

以上函數的調用位於函數dev_deactivate_many中，而dev_deactivate_many的調用有兩處：一是__dev_close_many函數，即用戶shutdown網絡設備時，停止watchdog定時器；另一處是封裝函數dev_deactivate。

void dev_deactivate_many(struct list_head *head)
{   
    struct net_device *dev;
    
    list_for_each_entry(dev, head, close_list) {
        ...
        dev_watchdog_down(dev);

函數dev_deactivate在設備鏈路狀態發生變化時被調用（參見netif_carrier_on和netif_carrier_off函數）：linkwatch功能將在link事件處理函數linkwatch_do_dev中，根據鏈路狀態調用設備的激活或去激活函數。如果鏈路down，調用dev_deactivate，其中會刪除設備watchdog定時器；反之，鏈路UP，將調用dev_activate函數，其中將開啓watchdog定時器。

static void linkwatch_do_dev(struct net_device *dev)
{    
    if (dev->flags & IFF_UP && netif_device_present(dev)) {
        if (netif_carrier_ok(dev))
            dev_activate(dev);
        else
            dev_deactivate(dev);

另外，與上節介紹的netif_carrier_on函數不同，函數netif_carrier_off中並沒有顯式地關閉watchdog定時器，而是調用了linkwatch功能的函數linkwatch_fire_event，添加鏈路事件，最終由linkwatch_do_dev在事件處理時關閉watchdog定時器。

void netif_carrier_off(struct net_device *dev)
{
    if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
        if (dev->reg_state == NETREG_UNINITIALIZED)
            return;
        atomic_inc(&dev->carrier_down_count);
        linkwatch_fire_event(dev);

watchdog超時處理

首先看一下隊列發送時間的記錄，由函數txq_trans_update完成，最近一次的發送時間（jiffies）記錄在發送隊列結構的成員trans_start中。

/*
 * txq_trans_update - stamp a TX queue with the current time.
 *
 * Stores the current jiffies value in txq->trans_start, but only when
 * txq->xmit_lock_owner is not -1.
 * NOTE(review): -1 presumably means "no CPU currently owns the xmit
 * lock" - confirm against the netdev_queue definition.
 */
static inline void txq_trans_update(struct netdev_queue *txq)
{
    if (txq->xmit_lock_owner != -1)
        txq->trans_start = jiffies;
}

在內核的設備核心發送函數netdev_start_xmit中，發送成功之後，更新發送時間。

static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                        struct netdev_queue *txq, bool more)
{
    rc = __netdev_start_xmit(ops, skb, dev, more);
    if (rc == NETDEV_TX_OK)
        txq_trans_update(txq);

超時處理函數dev_watchdog如下所示，如果設備在所有發送隊列上都使用Qdisc的noop類型，不進行處理。否則，遍歷每個隊列，檢查發送停止的隊列，通過比較最近一次的發送時間和當前時間的差值，如果停止時間超過設置的超時時間watchdog_timeo，即認爲此隊列發送超時。

函數dev_watchdog將打印部分出錯的隊列信息，並且調用設備驅動的超時處理函數ndo_tx_timeout。

static void dev_watchdog(struct timer_list *t)
{
    struct net_device *dev = from_timer(dev, t, watchdog_timer);

    netif_tx_lock(dev);
    if (!qdisc_tx_is_noop(dev)) {
        if (netif_device_present(dev) && netif_running(dev) && netif_carrier_ok(dev)) {

            for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq;

                txq = netdev_get_tx_queue(dev, i);
                trans_start = txq->trans_start;
                if (netif_xmit_stopped(txq) &&
                    time_after(jiffies, (trans_start + dev->watchdog_timeo))) {
                    some_queue_timedout = 1;
                    txq->trans_timeout++;
                    break;
                }
            }

            if (some_queue_timedout) {
                WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
                       dev->name, netdev_drivername(dev), i);
                dev->netdev_ops->ndo_tx_timeout(dev);
            }
            if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
                dev_hold(dev);

驅動超時處理

此處以Intel的網卡驅動i40e爲例，以下爲其超時處理函數i40e_tx_timeout，可見，其第一部分的處理與上一節函數dev_watchdog中的基本相同，這裏找出第一個超時的隊列。

static void i40e_tx_timeout(struct net_device *netdev)
{
    pf->tx_timeout_count++;

    /* find the stopped queue the same way the stack does */
    for (i = 0; i < netdev->num_tx_queues; i++) {
        struct netdev_queue *q;
        unsigned long trans_start;

        q = netdev_get_tx_queue(netdev, i);
        trans_start = q->trans_start;
        if (netif_xmit_stopped(q) &&
            time_after(jiffies, (trans_start + netdev->watchdog_timeo))) {
            hung_queue = i;
            break;
        }
    }

變量tx_timeout_last_recovery記錄上一次恢復處理的時間，用來控制超時處理的時間間隔：兩次處理的間隔不能小於watchdog_timeo的值，即處理完成一個隊列之後才能處理下一個隊列的超時。另外，函數中定義了恢復等級tx_timeout_recovery_level，如果距離上一次恢復已超出20秒鐘，恢復等級重新由等級1開始。

    if (time_after(jiffies, (pf->tx_timeout_last_recovery + HZ*20)))
        pf->tx_timeout_recovery_level = 1;  /* reset after some time */
    else if (time_before(jiffies,
              (pf->tx_timeout_last_recovery + netdev->watchdog_timeo)))
        return;   /* don't do any new action before the next timeout */

    /* don't kick off another recovery if one is already pending */
    if (test_and_set_bit(__I40E_TIMEOUT_RECOVERY_PENDING, pf->state))
        return;

    pf->tx_timeout_last_recovery = jiffies;

以下，每次超時處理，將恢復等級加一，等級越高說明問題越嚴重，需要執行的恢復操作分別爲PF、CORE和GLOBAL三個級別。

    netdev_info(netdev, "tx_timeout recovery level %d, hung_queue %d\n",
            pf->tx_timeout_recovery_level, hung_queue);

    switch (pf->tx_timeout_recovery_level) {
    case 1:
        set_bit(__I40E_PF_RESET_REQUESTED, pf->state);
        break;
    case 2:
        set_bit(__I40E_CORE_RESET_REQUESTED, pf->state);
        break;
    case 3:
        set_bit(__I40E_GLOBAL_RESET_REQUESTED, pf->state);
        break;
    default:
        netdev_err(netdev, "tx_timeout recovery unsuccessful\n");
        break;
    }

    i40e_service_event_schedule(pf);
    pf->tx_timeout_recovery_level++;

內核版本 5.0

redwingz

發佈了444 篇原創文章 · 獲贊 36 · 訪問量 18萬+

他的留言板關注

網絡設備發送超時定時器

watchdog的開啓

watchdog的關閉

watchdog超時處理

驅動超時處理

工作中用到的腳本合集

24-5-18 X

DPDK-l3fwd示例IPv6測試

測試DPDK示例程序l3fwd

TCP-Westwood擁塞算法

TCP-Hybla擁塞算法

SACK Reneging

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結