Linux CFS 進程調度算法

Linux主要實現了兩大類調度算法，CFS（完全公平調度算法）和實時調度算法。宏SCHED_NORMAL和SCHED_BATCH主要用於CFS調度，而SCHED_FIFO和SCHED_RR主要用於實時調度。這幾個宏的定義可以在include/linux/sched.h中找到。文件kernel/sched.c包含了內核調度器及相關系統調用的實現。調度的核心函數爲sched.c中的schedule()，schedule函數封裝了內核調度的框架。細節實現上調用具體的調度算法類中的函數實現，如kernel/sched_fair.c或kernel/sched_rt.c中的實現。
1、時鐘tick中斷的處理

在CFS中，當產生時鐘tick中斷時，sched.c中的scheduler_tick()函數會被時鐘中斷（定時器timer的代碼）直接調用，調用時中斷處於禁用狀態。注意在fork的代碼中，當修改父進程的時間片時，也會導致scheduler_tick()的調用。scheduler_tick()函數首先更新調度信息，然後調整當前進程在紅黑樹中的位置。調整完成後如果發現當前進程不再是最左邊的葉子，就標記need_resched標誌，中斷返回時就會調用schedule()完成進程切換，否則當前進程繼續佔用CPU。注意這與以前的調度器不同，以前是tick中斷導致時間片遞減，當時間片被用完時才觸發優先級調整並重新調度。scheduler_tick()函數的代碼如下：

[cpp]view
plaincopy

/*
 * scheduler_tick - scheduler bookkeeping performed on a timer tick.
 *
 * Takes no arguments and returns nothing. It works on the runqueue of the
 * CPU it is executing on (smp_processor_id()) and on the task that runqueue
 * currently records as running (rq->curr).
 */
void scheduler_tick(void)  
{  
    int cpu = smp_processor_id();  
    struct rq *rq = cpu_rq(cpu);  
    struct task_struct *curr = rq->curr;  

    sched_clock_tick();  

    /* Everything up to spin_unlock() runs with rq->lock held. */
    spin_lock(&rq->lock);  
    update_rq_clock(rq);  
    update_cpu_load(rq);  
    /*
     * Dispatch to the tick hook of the running task's scheduling class;
     * the trailing 0 is the hook's 'queued' argument.
     */
    curr->sched_class->task_tick(rq, curr, 0);  
    spin_unlock(&rq->lock);  

    perf_event_task_tick(curr, cpu);  

#ifdef CONFIG_SMP  
    /*
     * SMP builds only: store the result of idle_cpu() for this CPU in the
     * runqueue, then call the load-balance trigger.
     */
    rq->idle_at_tick = idle_cpu(cpu);  
    trigger_load_balance(rq, cpu);  
#endif  
}

它先獲取目前CPU上的運行隊列中的當前運行進程，更新runqueue級變量clock，然後通過sched_class中的接口名task_tick，調用CFS的tick處理函數task_tick_fair()，以處理時鐘中斷。我們看kernel/sched_fair.c中的CFS算法實現。具體的調度類如下：

[cpp]view
plaincopy

/*
 * Scheduling-class method table for CFS. Each hook of struct sched_class is
 * bound to its CFS implementation (mostly named *_fair).
 */
static const struct sched_class fair_sched_class = {  
    /* NOTE(review): presumably the class consulted after this one -- confirm in the core scheduler. */
    .next           = &idle_sched_class,  
    .enqueue_task       = enqueue_task_fair,  
    .dequeue_task       = dequeue_task_fair,  
    .yield_task     = yield_task_fair,  
    .check_preempt_curr = check_preempt_wakeup,  
    .pick_next_task     = pick_next_task_fair,  
    .put_prev_task      = put_prev_task_fair,  

    /* Hooks that exist only in SMP builds. */
#ifdef CONFIG_SMP  
    .select_task_rq     = select_task_rq_fair,  
    .load_balance       = load_balance_fair,  
    .move_one_task      = move_one_task_fair,  
    .rq_online      = rq_online_fair,  
    .rq_offline     = rq_offline_fair,  
    .task_waking        = task_waking_fair,  
#endif  

    .set_curr_task          = set_curr_task_fair,  
    /* Called from scheduler_tick() through curr->sched_class->task_tick(). */
    .task_tick      = task_tick_fair,  
    .task_fork      = task_fork_fair,  
    .prio_changed       = prio_changed_fair,  
    .switched_to        = switched_to_fair,  
    .get_rr_interval    = get_rr_interval_fair,  

    /* Hook that exists only when fair group scheduling is configured. */
#ifdef CONFIG_FAIR_GROUP_SCHED  
    .task_move_group    = task_move_group_fair,  
#endif  
};

task_tick_fair函數用於輪詢調度類中的一個進程。實現如下：

[cpp]view
plaincopy

static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  

{  

    struct cfs_rq *cfs_rq;  

    struct sched_entity *se = &curr->se;  

    for_each_sched_entity(se) {  /* 考慮了組調度 */  

        cfs_rq = cfs_rq_of(se);  

        entity_tick(cfs_rq, se, queued);  

    }  

}

該函數獲取各層的調度實體，對每個調度實體獲取其所在的CFS運行隊列，並調用entity_tick函數進行處理。kernel/sched_fair.c中的函數entity_tick源代碼如下：

[cpp]view
plaincopy

/*
 * entity_tick - tick processing for one scheduling entity.
 * @cfs_rq: CFS runqueue the tick is processed on
 * @curr:   scheduling entity passed on to check_preempt_tick()
 * @queued: when non-zero (and CONFIG_SCHED_HRTICK is set) the runqueue's
 *          current task is rescheduled immediately
 *
 * update_curr() is always called first. With CONFIG_SCHED_HRTICK there are
 * two early exits (see below). Otherwise check_preempt_tick() runs when
 * more than one entity is runnable or the WAKEUP_PREEMPT feature is off.
 */
static void  
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  
{  
    /* 
     * Update run-time statistics of the 'current'. 
     */  
    update_curr(cfs_rq);  

#ifdef CONFIG_SCHED_HRTICK  
    /* 
     * queued ticks are scheduled to match the slice, so don't bother 
     * validating it and just reschedule. 
     */  
    if (queued) {  
        resched_task(rq_of(cfs_rq)->curr);  
        return;  
    }  
    /* 
     * don't let the period tick interfere with the hrtick preemption 
     */  
    if (!sched_feat(DOUBLE_TICK) &&  
            hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))  
        return;  
#endif  

    /* Skip the preemption check only when both conditions are false. */
    if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))  
        check_preempt_tick(cfs_rq, curr);  
}

該函數用kernel/sched_fair.c:update_curr()更新當前進程的運行時統計信息，然後調用kernel/sched_fair.c:check_preempt_tick()，檢測是否需要重新調度，用下一個進程來搶佔當前進程。update_curr()實現記賬功能,由系統定時器週期調用，實現如下：

[cpp]view
plaincopy

static inline void  

__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,  

          unsigned long delta_exec)  

{  

    unsigned long delta_exec_weighted;  

    schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));  

    curr->sum_exec_runtime += delta_exec; /* 總運行時間更新 */  

    schedstat_add(cfs_rq, exec_clock, delta_exec); /* 更新cfs_rq的exec_clock */  

    /* 用優先級和delta_exec來計算weighted，以用於更新vruntime */  

    delta_exec_weighted = calc_delta_fair(delta_exec, curr);  

    curr->vruntime += delta_exec_weighted; /* 更新當前進程的vruntime */  

    update_min_vruntime(cfs_rq);  

}  

static void update_curr(struct cfs_rq *cfs_rq)  

{  

    struct sched_entity *curr = cfs_rq->curr;  

    u64 now = rq_of(cfs_rq)->clock_task;  /* now計時器 */  

    unsigned long delta_exec;  

    if (unlikely(!curr))  

        return;  

    /* 

     * 獲取從最後一次修改負載後當前進程所佔用的運行總時間， 

     * 即計算當前進程的執行時間 

     */  

    delta_exec = (unsigned long)(now - curr->exec_start);  

    if (!delta_exec)  /* 如果本次沒有執行過，不用重新更新了 */  

        return;  

    /* 根據當前可運行進程總數對運行時間進行加權計算 */  

    __update_curr(cfs_rq, curr, delta_exec);  

    curr->exec_start = now;  /* 將exec_start屬性置爲now */  

    if (entity_is_task(curr)) {  /* 下面爲關於組調度的 */  

        struct task_struct *curtask = task_of(curr);  

        trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);  

        cpuacct_charge(curtask, delta_exec);  

        account_group_exec_runtime(curtask, delta_exec);  

    }  

}

這裏delta_exec獲得從最後一次修改負載後當前進程所佔用的運行總時間，即計算當前進程的執行時間。然後調用__update_curr()更新進程的vruntime。更新前需要計算weighted，這由sched_fair.c:calc_delta_fair()實現，如下：

[cpp]view
plaincopy

/*
 * calc_delta_fair - weight an execution-time delta by an entity's load.
 * @delta: raw execution time
 * @se:    entity whose load weight is applied
 *
 * Returns @delta unchanged when the entity's weight equals NICE_0_LOAD.
 * Otherwise returns calc_delta_mine(delta, NICE_0_LOAD, &se->load).
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
    return unlikely(se->load.weight != NICE_0_LOAD)
        ? calc_delta_mine(delta, NICE_0_LOAD, &se->load)
        : delta;
}

在calc_delta_fair中，如果進程的nice值爲0（即權重se->load.weight等於NICE_0_LOAD），就直接返回delta；否則就要調用kernel/sched.c中的calc_delta_mine對delta值進行修正，如下：

[cpp]view
plaincopy

/*
 * Fixed-point constants used by calc_delta_mine(): WMULT_CONST is all-ones
 * on 32-bit longs and 2^32 otherwise; WMULT_SHIFT is the matching shift.
 */
#if BITS_PER_LONG == 32  
# define WMULT_CONST    (~0UL)  
#else  
# define WMULT_CONST    (1UL << 32)  
#endif  

#define WMULT_SHIFT 32  

/* 
 * Shift right and round: 
 * adds half of (1 << y) to x before shifting right by y.
 */  
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))  

/* 
 * delta *= weight / lw 
 *
 * @delta_exec: value to scale
 * @weight:     numerator weight
 * @lw:         denominator load; lw->inv_weight is filled in here when it
 *              is still zero, so the structure is written to
 *
 * The division is done by multiplying with the cached inverse
 * lw->inv_weight and shifting right by WMULT_SHIFT. The result is capped
 * at LONG_MAX.
 */  
static unsigned long  
calc_delta_mine(unsigned long delta_exec, unsigned long weight,  
        struct load_weight *lw)  
{  
    u64 tmp;  

    /* Compute the inverse weight on first use (zero means "not cached"). */
    if (!lw->inv_weight) {  
        if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))  
            lw->inv_weight = 1;  
        else  
            lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)  
                / (lw->weight+1);  
    }  

    tmp = (u64)delta_exec * weight;  
    /* 
     * Check whether we'd overflow the 64-bit multiplication: 
     * if so, split the shift in two halves, one before and one after
     * multiplying by the inverse.
     */  
    if (unlikely(tmp > WMULT_CONST))  
        tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,  
            WMULT_SHIFT/2);  
    else  
        tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);  

    return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);  
}

    CFS允許每個進程運行一段時間、循環輪轉、選擇運行最少的進程作爲下一個運行進程，而不再採用分配給每個進程時間片的做法了。CFS在所有可運行進程總數基礎上計算出一個進程應該運行多久，而不是依靠nice值來計算時間片。nice值在CFS中被作爲進程獲得的處理器運行比的權重：相對於默認nice值的進程而言，nice值越高（優先級越低）的進程獲得越低的處理器使用權重；相反，nice值越低（優先級越高）的進程獲得越高的處理器使用權重。
   這裏delta的計算有如下關係：delta = delta * NICE_0_LOAD / se->load.weight。se->load.weight值是怎麼來的呢？可以跟蹤sys_nice()，就可以發現se->load其實就是表示nice對應的load值，nice越低，值越大。據此就可以得到一個結論，在執行相同時間的條件下（delta相同），高優先級的進程計算出來的delta值會比低優先級的進程計算出來的低。因此高優先級的進程就會位於rb_tree的左邊，在下次調度的時候就會優先調度。
   回到entity_tick，我們看check_preempt_tick()的實現，它用來檢測是否需要重新調度下一個進程。如下：

[cpp]view
plaincopy

static void  

check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)  

{  

    unsigned long ideal_runtime, delta_exec;  

    ideal_runtime = sched_slice(cfs_rq, curr);  

    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;  

    if (delta_exec > ideal_runtime) {  

        resched_task(rq_of(cfs_rq)->curr);  

        /* 

         * The current task ran long enough, ensure it doesn't get 

         * re-elected due to buddy favours. 

         */  

        clear_buddies(cfs_rq, curr);  

        return;  

    }  

    /* 

     * Ensure that a task that missed wakeup preemption by a 

     * narrow margin doesn't have to wait for a full slice. 

     * This also mitigates buddy induced latencies under load. 

     */  

    if (!sched_feat(WAKEUP_PREEMPT))  

        return;  

    if (delta_exec < sysctl_sched_min_granularity)  

        return;  

    if (cfs_rq->nr_running > 1) { /* 用於組調度 */  

        struct sched_entity *se = __pick_next_entity(cfs_rq);  

        s64 delta = curr->vruntime - se->vruntime;  

        if (delta > ideal_runtime)  

            resched_task(rq_of(cfs_rq)->curr);  

    }  

}

    該函數先獲取當前進程的理想運行時間，如果當前執行時間超過理想時間，調用kernel/sched.c:resched_task()設置need_resched標誌，完成設置的函數爲resched_task()--->set_tsk_need_resched(p)，表示需要重新調度進程。
   從上面分析可以看出，通過調用鏈scheduler_tick()--->task_tick_fair()--->entity_tick()--->[update_curr()--->__update_curr()--->calc_delta_fair()--->calc_delta_mine()] 和 [check_preempt_tick()--->resched_task()]，最終會更新調度信息，設置need_resched調度標誌。當中斷返回時，就會調用schedule()進行搶佔式調度。
   2、CFS調度操作
   在sched_fair.c中，CFS實現了用紅黑樹對運行隊列進行管理的相關操作。

（1）進程插入enqueue_task_fair：更新調度信息，調用enqueue_entity()--->__enqueue_entity()將調度實體插入到紅黑樹中。它會在nr_running遞增之前被調用。插入時，會在紅黑樹中找到合適的位置並進行插入，然後緩存最左邊的節點。對於組調度，會對組中的所有進程進行操作。如下：

[cpp]view
plaincopy

static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)  

{  

    struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;  

    struct rb_node *parent = NULL;  

    struct sched_entity *entry;  

    s64 key = entity_key(cfs_rq, se); /* key爲被插入進程的vruntime */  

    int leftmost = 1;  

    /* 

     * Find the right place in the rbtree: 

     */  

    while (*link) {  

        parent = *link;  

        entry = rb_entry(parent, struct sched_entity, run_node);  

        /* 

         * We dont care about collisions. Nodes with 

         * the same key stay together. 

         */  

        if (key < entity_key(cfs_rq, entry)) {  

            link = &parent->rb_left;  

        } else {  

            link = &parent->rb_right;  

            leftmost = 0;  

        }  

    }  

    /* 

     * Maintain a cache of leftmost tree entries (it is frequently 

     * used): 

     */  

    if (leftmost)  

        cfs_rq->rb_leftmost = &se->run_node;  

    rb_link_node(&se->run_node, parent, link);  

    rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);  

}

可見CFS的運行隊列布局是放在紅黑樹裏面的，而這棵紅黑樹的排序方式是按照運行實體的vruntime來的。vruntime的計算方式在上面已經做了分析。在前面“Linux進程管理”的幾節介紹中，我們可以看到fork()在創建子進程時最後就會調用enqueue_task_fair()，將新創建的進程插入到紅黑樹中。
（2）進程選擇pick_next_task_fair：CFS調度算法的核心是選擇具有最小vruntime的任務。運行隊列採用紅黑樹方式存放，其中節點的鍵值便是可運行進程的虛擬運行時間。CFS調度器選取待運行的下一個進程，是所有進程中vruntime最小的那個，它對應的便是在樹中最左側的葉子節點。實現選擇的函數爲 pick_next_task_fair。如下：

[cpp]view
plaincopy

/*
 * pick_next_task_fair - choose the next CFS task to run on a runqueue.
 * @rq: runqueue to pick from
 *
 * Returns NULL when the top-level CFS runqueue has nothing runnable.
 * Otherwise an entity is picked with pick_next_entity() and made current
 * with set_next_entity(), repeating on the cfs_rq that group_cfs_rq()
 * returns for it until that is NULL (this loop accounts for group
 * scheduling). The task behind the last picked entity is passed to
 * hrtick_start_fair() and returned.
 */
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
    struct cfs_rq *level = &rq->cfs;
    struct sched_entity *picked;
    struct task_struct *next;

    if (unlikely(!level->nr_running))
        return NULL;

    for (;;) {
        picked = pick_next_entity(level);
        set_next_entity(level, picked);

        /* Move on to the picked entity's own runqueue, if it has one. */
        level = group_cfs_rq(picked);
        if (!level)
            break;
    }

    next = task_of(picked);
    hrtick_start_fair(rq, next);

    return next;
}

該函數調用pick_next_entity()--->__pick_next_entity()完成獲取下一個進程的工作，這個函數如下：

[cpp]view
plaincopy

/*
 * __pick_next_entity - return the entity at the cached leftmost rbtree node.
 * @cfs_rq: runqueue whose rb_leftmost cache is read
 *
 * No tree traversal is done: the cached node is converted to its
 * enclosing sched_entity with rb_entry(). Returns NULL when no leftmost
 * node is cached.
 */
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *leftmost = cfs_rq->rb_leftmost;

    return leftmost ? rb_entry(leftmost, struct sched_entity, run_node)
                    : NULL;
}

該函數並不會遍歷紅黑樹來找到最左葉子節點（是所有進程中vruntime最小的那個），因爲該值已經緩存在rb_leftmost字段中。它通過rb_entry函數返回這個緩存的節點進程。完成實質工作的調用爲include/linux/rbtree.h:rb_entry()--->include/linux/kernel.h:container_of()，這是一個宏定義。
（3）進程刪除dequeue_task_fair：從紅黑樹中刪除進程，並更新調度信息。它會在nr_running遞減之前被調用。完成實質工作的函數爲dequeue_entity()--->__dequeue_entity()。如下：

[cpp]view
plaincopy

/*
 * __dequeue_entity - remove a scheduling entity from the CFS rbtree.
 * @cfs_rq: runqueue owning the tasks_timeline tree
 * @se:     entity to remove
 *
 * If @se is the cached leftmost node, the cache is first advanced to its
 * in-order successor (rb_next()), which may be NULL. The node is then
 * erased from the tree.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node *node = &se->run_node;

    if (cfs_rq->rb_leftmost == node)
        cfs_rq->rb_leftmost = rb_next(node);

    rb_erase(node, &cfs_rq->tasks_timeline);
}

該函數會從紅黑樹中刪除指定的調度實體；如果該實體恰好是緩存的最左邊節點，則先通過rb_next()選出下一個具有最小vruntime值的節點，作爲新的最左邊節點緩存起來。

Linux CFS 進程調度算法

測試系統可供malloc（）使用的內存空間大小

利用ycsb測試redis性能

Linux CFS 進程調度算法

flashcache中應用device mapper機制

打印堆棧中的初始化信息

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結