[cpufreq governor] schedutil governor解析

1.schedutil governor相關的結構體說明

/*
 * Governor state attached to one cpufreq policy; sugov_start() obtains it
 * from policy->governor_data.
 */
struct sugov_policy {  
    struct cpufreq_policy *policy;  /* the cpufreq policy */
  
    struct sugov_tunables *tunables;  /* tunables, changed on user request */
    struct list_head tunables_hook;/* list linkage for the tunables structure */  
  
    raw_spinlock_t update_lock;  /* For shared policies */
    /*
     * Four timing parameters: the first is the time of the last frequency
     * change, the other three set the granularity (rate limits) of
     * frequency changes.
     */  
    u64 last_freq_update_time;  
    s64 min_rate_limit_ns;  
    s64 up_rate_delay_ns;  
    s64 down_rate_delay_ns;  
    /*
     * next_freq is the frequency selected next; cached_raw_freq is the
     * cached frequency value.
     */
    unsigned int next_freq;  
    unsigned int cached_raw_freq;  
    /* Slack timer, aimed at idle CPUs. */
    struct timer_list slack_timer;  
    /* The next fields are only needed if fast switch cannot be used. */  
    /* The four work-related members below all end up on the same call path. */
    struct irq_work irq_work;  
    struct kthread_work work;  
    struct mutex work_lock;  
    struct kthread_worker worker;  
    /* Governor thread. */
    struct task_struct *thread;  
    /* Set while a frequency change is in progress; cleared when it is done. */
    bool work_in_progress;  
    /*
     * Set when the frequency limits change and cleared again when the
     * frequency is updated.
     */
      bool need_freq_update;  
};  
 /*
  * One instance of this structure exists for every CPU. If the frequency is
  * shared, adjusting the frequency of any one CPU also affects the other
  * CPUs; the policy is generally the same for all of them.
  */
struct sugov_cpu {  
    struct update_util_data update_util;  
   /* CPUs of one policy point to the same sugov_policy (and so the same cpufreq_policy). */
    struct sugov_policy *sg_policy;    
    unsigned int cpu;  /* ID of the associated CPU */
    /*
     * Whether an iowait boost is pending, the iowait boost frequency and the
     * maximum boost frequency.
     */
    bool iowait_boost_pending;  
    unsigned int iowait_boost;  
    unsigned int iowait_boost_max;  
    u64 last_update;  /* time of the last util/max update for this CPU */
  
    /* The fields below are only needed when sharing a policy. */  
    unsigned long util;  
    unsigned long max;  
    unsigned int flags;  
  
    /* The field below is for single-CPU policies only. */  
#ifdef CONFIG_NO_HZ_COMMON  
    unsigned long saved_idle_calls;  
#endif  
};  
/* Tunables, i.e. the values that can be adjusted from user space. */
struct sugov_tunables {  
    struct gov_attr_set attr_set;/* sysfs interface attributes */  
    unsigned int up_rate_limit_us;  /* minimum interval between frequency increases */
    unsigned int down_rate_limit_us;/* minimum interval between frequency decreases */  
    unsigned int timer_slack_val_us;  /* timer started during cpuidle to adjust
      the frequency of the idle CPU - NOTE(review): original note was garbled, confirm */
    int freq_margin;  /* frequency margin; adjustable, differs between big and little cores */
};  

2.schedutil governor如何調節cpu頻率

/*
 * Tell cpufreq that the utilization of @cfs_rq changed.
 *
 * The update is forwarded only when @cfs_rq is the cfs_rq embedded in its
 * own runqueue (rq->cfs); for any other cfs_rq nothing happens.
 */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
    struct rq *owner = rq_of(cfs_rq);

    if (cfs_rq != &owner->cfs)
        return;

    /*
     * Known gaps, as noted by the original authors:
     *  - some boundary cases are missed, but the call happens often
     *    enough that this should (hopefully) not matter;
     *  - it only runs on the local CPU, so a remote enqueue misses an
     *    update until the next tick/schedule;
     *  - it is not called when going idle (the idle thread is not a
     *    fair-class task), and the utilization does not include RT tasks;
     *  - the utilization is not frequency-invariant unless
     *    arch_scale_freq_capacity() is implemented.
     *
     * See cpu_util().
     */
    cpufreq_update_util(owner, 0);
}

繼續

#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * Called by the scheduler on the CPU whose utilization is being updated,
 * and only from RCU-sched read-side critical sections.
 *
 * cpufreq has to re-evaluate the CPU performance state (frequency/voltage)
 * regularly so that it does not stay at an inadequate level for too long.
 * Updates triggered from CFS alone cannot guarantee that, because none may
 * arrive while RT or deadline tasks keep the CPU busy (or when only RT and
 * DL tasks exist).
 *
 * As a workaround the RT and DL sched classes call this function too, to
 * trigger extra cpufreq updates.  That is a band-aid which should eventually
 * be replaced by solutions aimed specifically at RT and DL tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
    struct update_util_data *hook =
        rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
                                           cpu_of(rq)));

    /* No callback is registered for this CPU. */
    if (!hook)
        return;

    hook->func(hook, rq_clock(rq), flags);
}
#else
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */

關鍵點是struct update_util_data這個結構體,僅僅是一個callback函數:

#ifdef CONFIG_CPU_FREQ  
/*
 * Utilization-update hook. It carries nothing but the callback, which is
 * invoked with the hook itself, a timestamp and the update-reason flags.
 */
struct update_util_data {  
    void (*func)(struct update_util_data *data, u64 time, unsigned int flags);  
};  
  
/* Register @func, stored in @data, as the utilization-update callback of @cpu. */
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,  
                       void (*func)(struct update_util_data *data, u64 time,  
                                    unsigned int flags));  
/* Presumably clears the hook of @cpu again; the definition is not shown here. */
void cpufreq_remove_update_util_hook(int cpu);  
#endif /* CONFIG_CPU_FREQ */  

接下來看下這個結構體與函數cpufreq_add_update_util_hook的關係是什麼:

/* Per-CPU pointer to the installed update_util_data; NULL when none is set. */
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);  
  
/** 
 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 
 * @cpu: The CPU to set the pointer for. 
 * @data: New pointer value. 
 * @func: Callback function to set for the CPU. 
 * 
 * Set and publish the update_util_data pointer for the given CPU. 
 * 
 * The update_util_data pointer of @cpu is set to @data and the callback 
 * function pointer in the target struct update_util_data is set to @func. 
 * That function will be called by cpufreq_update_util() from RCU-sched 
 * read-side critical sections, so it must not sleep.  @data will always be 
 * passed to it as the first argument which allows the function to get to the 
 * target update_util_data structure and its container. 
 * 
 * The update_util_data pointer of @cpu must be NULL when this function is 
 * called or it will WARN() and return with no effect. 
 */  
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,  
            void (*func)(struct update_util_data *data, u64 time,  
                     unsigned int flags))  
{  
    /* Both the hook structure and the callback are mandatory. */
    if (WARN_ON(!data || !func))  
        return;  
  
    /* Refuse to replace a hook that is already installed for this CPU. */
    if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))  
        return;  
  
    /* Store the callback first, then publish the pointer. */
    data->func = func;  
    rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);  
}  

可以看到結構體update_util_data的callback函數指向了函數cpufreq_add_update_util_hook鉤子函數的形參:

void (*func)(struct update_util_data *data, u64 time,  
                     unsigned int flags)  

那麼這個函數在哪裏賦值呢?
我們看到在kernel/sched/cpufreq_schedutil.c文件,就是最新的cpu調節頻率的governor,不再是原先的interactive或者ondemand governor了。
作爲頻率調節的governor,編寫流程與其他governor類似,先註冊名字爲schedutil的governor:

/*
 * Governor descriptor named "schedutil"; its .governor callback is
 * cpufreq_schedutil_cb().  The object is declared static unless
 * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is defined.
 */
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL  
static  
#endif  
struct cpufreq_governor cpufreq_gov_schedutil = {  
    .name = "schedutil",  
    .governor = cpufreq_schedutil_cb,  
    .owner = THIS_MODULE,  
};  
  
/* Initcall that registers the governor; returns cpufreq_register_governor()'s result. */
static int __init sugov_register(void)  
{  
    return cpufreq_register_governor(&cpufreq_gov_schedutil);  
}  
fs_initcall(sugov_register);  

註冊之後,governor開始走governor的callback函數cpufreq_schedutil_cb:

/*
 * Governor callback: dispatch @event for @policy to the matching sugov_*()
 * handler and return that handler's result.  Any event other than the five
 * handled below hits BUG().
 */
static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
                unsigned int event)
{
    if (event == CPUFREQ_GOV_POLICY_INIT)
        return sugov_init(policy);

    if (event == CPUFREQ_GOV_POLICY_EXIT)
        return sugov_exit(policy);

    if (event == CPUFREQ_GOV_START)
        return sugov_start(policy);

    if (event == CPUFREQ_GOV_STOP)
        return sugov_stop(policy);

    if (event == CPUFREQ_GOV_LIMITS)
        return sugov_limits(policy);

    /* Unknown event. */
    BUG();
}

開始執行init,然後執行start,根據event類型來執行。系統剛剛起來執行init和start動作,init是一些參數的初始化,而start纔是真正的governor開啓work了。

static int sugov_start(struct cpufreq_policy *policy)  
{  
    struct sugov_policy *sg_policy = policy->governor_data;  
    unsigned int cpu;  
  
    sg_policy->up_rate_delay_ns =  
        sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;  
    sg_policy->down_rate_delay_ns =  
        sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;  
    update_min_rate_limit_us(sg_policy);  
    sg_policy->last_freq_update_time = 0;  
    sg_policy->next_freq = UINT_MAX;  
    sg_policy->work_in_progress = false;  
    sg_policy->need_freq_update = false;  
    sg_policy->cached_raw_freq = UINT_MAX;  
  
    for_each_cpu(cpu, policy->cpus) {  
        struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);  
  
        memset(sg_cpu, 0, sizeof(*sg_cpu));  
        sg_cpu->cpu = cpu;  
        sg_cpu->sg_policy = sg_policy;  
        sg_cpu->flags = SCHED_CPUFREQ_DL;  
        sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;  
                /*OK,真正的struct update_util_data的元素的callback函數現真身了。*/  
        cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,  
                         policy_is_shared(policy) ?  
                            sugov_update_shared :  
                            sugov_update_single);  
    }  
    return 0;  
}  
/*這個函數肯定返回true*/  
static inline bool policy_is_shared(struct cpufreq_policy *policy)  
{  
    return cpumask_weight(policy->cpus) > 1;  
}  

3.sugov_update_shared函數怎麼計算得到next_freq
可以看到這個函數的實現code如下:

/*
 * Utilization-update callback used for policies shared by several CPUs.
 * @hook:  update_util member embedded in the calling CPU's sugov_cpu.
 * @time:  timestamp supplied by the caller.
 * @flags: SCHED_CPUFREQ_* update-reason flags.
 *
 * Samples util/max for this CPU, records them together with @flags under
 * the policy's update_lock, refreshes the iowait boost state and, if
 * sugov_should_update_freq() agrees, picks and commits the next frequency.
 */
static void sugov_update_shared(struct update_util_data *hook, u64 time,
                unsigned int flags)
{
    struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
    struct sugov_policy *sg_policy = sg_cpu->sg_policy;
    unsigned long cur_util, cur_max;
    unsigned int target;

    /* Sampling is done before the lock is taken. */
    sugov_get_util(&cur_util, &cur_max, time, sg_cpu->cpu);

    raw_spin_lock(&sg_policy->update_lock);

    sg_cpu->util = cur_util;
    sg_cpu->max = cur_max;
    sg_cpu->flags = flags;

    sugov_set_iowait_boost(sg_cpu, time, flags);
    sg_cpu->last_update = time;

    if (sugov_should_update_freq(sg_policy, time)) {
        /* A deadline-class request goes straight to the maximum frequency. */
        target = (flags & SCHED_CPUFREQ_DL) ?
                 sg_policy->policy->cpuinfo.max_freq :
                 sugov_next_freq_shared(sg_cpu, time);

        sugov_update_commit(sg_policy, time, target);
    }

    raw_spin_unlock(&sg_policy->update_lock);
}

分別來講解各個重要的函數
3.1 sugov_get_util(&util, &max, time, sg_cpu->cpu)怎麼獲取util/max的數值的。
函數實現如下:

/*
 * Compute the utilization and capacity of @cpu at @time.
 * @util: out - boosted CPU utilization, plus the RT contribution when
 *        use_pelt() is true, clamped to the CPU capacity.
 * @max:  out - the CPU capacity from arch_scale_cpu_capacity().
 */
static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time, int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long capacity, rt_util, total;
    s64 age;

    /*
     * The capacity differs per cluster (the original author quotes 782 for
     * cluster0 and 1024 for cluster1 on their platform).
     */
    capacity = arch_scale_cpu_capacity(NULL, cpu);

    /* Age rt_avg first, then average it over the period plus elapsed time. */
    sched_avg_update(rq);
    age = time - rq->age_stamp;
    if (unlikely(age < 0))
        age = 0;
    rt_util = div64_u64(rq->rt_avg, sched_avg_period() + age);
    rt_util = (rt_util * capacity) >> SCHED_CAPACITY_SHIFT;

    total = boosted_cpu_util(cpu);
    if (likely(use_pelt()))
        total += rt_util;

    *util = min(total, capacity);
    *max = capacity;
}

sched_avg_update(rq),是一個update sched avg負載使用的:

/* Averaging interval in milliseconds; initialised to MSEC_PER_SEC. */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;  
/* Return half of sysctl_sched_time_avg, converted to nanoseconds. */
static inline u64 sched_avg_period(void)  
{  
    return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;  
}  
/*
 * Age the rt_avg of @rq: for every full period that has elapsed since
 * rq->age_stamp, advance age_stamp by one period and halve rt_avg.
 */
void sched_avg_update(struct rq *rq)  
{       /* with the initial setting, one period is 500ms */  
    s64 period = sched_avg_period();  
        /*
         * age_stamp is the start time of the current CPU rq's window. The
         * loop below serves two purposes:
         * 1. decay the RT load by half for every period (the aging period);
         * 2. move the age_stamp window forward until it is close to
         *    rq_clock, so that only the load of the current period is
         *    accounted each time.
         */  
    while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {  
        /* 
         * Inline assembly required to prevent the compiler 
         * optimising this loop into a divmod call. 
         * See __iter_div_u64_rem() for another example of this. 
         */  
        asm("" : "+rm" (rq->age_stamp));  
        rq->age_stamp += period;  
        rq->rt_avg /= 2;  
    }  
}  

下面這段代碼的意思是,計算一個週期內的rt負載並歸一化爲capacity數值:

delta = time - rq->age_stamp;  
if (unlikely(delta < 0))  
    delta = 0;  
rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);  
rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;  

boosted_cpu_util(cpu)怎麼得到util的,見下面的代碼。其中函數schedtune_cpu_margin的實現以後再仔細check(可以看這篇文章:https://blog.csdn.net/wukongmingjing/article/details/81739394),本文不講解。

/*
 * Return the utilization of @cpu from cpu_util_freq() plus the margin that
 * schedtune_cpu_margin() computes for it.  The inputs are also reported
 * through the sched_boost_cpu tracepoint.
 */
unsigned long
boosted_cpu_util(int cpu)
{
    unsigned long base = cpu_util_freq(cpu);
    /* How the margin is derived is not covered here; see schedtune_cpu_margin(). */
    long boost = schedtune_cpu_margin(base, cpu);

    trace_sched_boost_cpu(cpu, base, boost);

    return base + boost;
}
  
/*
 * Return the utilization of @cpu, capped at capacity_orig_of(@cpu).
 *
 * By default the value is cfs.avg.util_avg of the CPU's runqueue.  With
 * CONFIG_SCHED_WALT, when WALT is not disabled and
 * sysctl_sched_use_walt_cpu_util is set, it is taken from WALT instead:
 * the runnable time accumulated over the windows, normalised by
 * walt_ravg_window.  Either way the result lies in 0..capacity.
 */
static inline unsigned long cpu_util_freq(int cpu)
{
    unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
    /* Maximum capacity of the cluster this CPU belongs to. */
    unsigned long capacity = capacity_orig_of(cpu);

#ifdef CONFIG_SCHED_WALT
    if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
        util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
                         walt_ravg_window >> SCHED_LOAD_SHIFT);
#endif

    if (util >= capacity)
        return capacity;

    return util;
}

最後得到util和max數值。從sugov_get_util的代碼可以看到,只有在use_pelt()爲真(即使用PELT而不是WALT來計算cpu util)時,纔會把rt負載累加進來,即util = util(普通進程) + rt(實時進程);使用WALT時,util直接取boosted_cpu_util的結果。最後util = min(util,max_cap),max=max_cap;計算完畢。max就是各個cluster的每個core的capacity,是一個固定數值,可能在thermal起作用的情況下會變小,這個需要仔細check下。

3.2 sugov_set_iowait_boost(sg_cpu, time, flags)怎麼設置iowait_boost數值。

  • 繼續執行sugov_update_shared函數,更新sugov_cpu結構體元素;
  • 根據flags數值:如果flags帶有SCHED_CPUFREQ_IOWAIT(數值爲4),則是iowait boost情況,並且有一個iowait_boost_pending標誌位判斷當前是否已經是iowait狀態。如果已經是則直接return,否則根據iowait_boost是否有數值來設定iowait_boost的頻率數值。
  • 如果flags爲其他數值,並且iowait_boost存在數值,如果計算load的間隔超過一個tickless時間,則判斷是idle狀態,將iowait_boost和pending標誌位清零,等待下次計算週期再查看iowait狀態。
  • flags爲0,是沒有iowait的普通進程。
#define SCHED_CPUFREQ_RT        (1U << 0)  /* sched_class rt */
#define SCHED_CPUFREQ_DL        (1U << 1)  /* sched_class dl (deadline) */
#define SCHED_CPUFREQ_IOWAIT    (1U << 2)  /* sched_class fair and task->in_iowait != 0 */

3.3 sugov_should_update_freq(sg_policy, time)是否需要進行頻率update,判定若干個標誌位

  • dvfs_possible_from_any_cpu,即每個cpu可以單獨調節電壓並傳遞給其他cpu一起調節,默認爲true
  • fast_switch_enabled,快速頻率切換是否enable,默認false
  • work_in_progress:是否正在調節頻率,調節頻率之前置爲true,調節頻率之後置爲false,默認false
  • need_freq_update,默認false,只有在governor limit階段置爲true。
  • 最後判定rq_clock-last_freq_update_time的數值與min_rate_limit_ns比較得出是否需要update frequency。也就是頻率調節的最小間隔,小於此間隔不予調節。
static void update_min_rate_limit_us(struct sugov_policy *sg_policy)  
{  
    mutex_lock(&min_rate_lock);  
        /*min(500,1000),unit:us,也就是
           min(up_rate_limit_us,down_rate_limit_us)*/  
    sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,  
                       sg_policy->down_rate_delay_ns);  
    mutex_unlock(&min_rate_lock);  
}  

如果3.3函數返回true,則執行3.4/3.5,否則直接返回,不做頻率調整。

3.4 flags不同,如何選擇next_f,即下一個cpu frequency

  • flags==SCHED_CPUFREQ_DL,next_f = cpuinfo.max_freq
  • 其他flags走下面的,對所有cpu,根據sugov_cpu的util,max,iowait_boost,iowait_boost_max數值選擇所有cpu裏面的max*util最大的一對。每個cpu都有一個util,max,iowait_boost,iowait_boost_max=cpuinfo.max_freq,具體怎麼計算的看下code一目瞭然。比較簡單。在函數sugov_next_freq_shared裏面實現的。

3.4.1 在函數sugov_next_freq_shared裏面會遍歷所有的cpu,遍歷規則如下:

  • 在sugov_update_shared函數一開始,我們就獲取了當前cpu的util和max;
  • 每次遍歷一個cpu,比較(j_util *max > j_max *util),則util=j_util,max=j_max,目的挑選最大的。max一般都是固定數值,還是選擇cpu最大的util作爲調節頻率的依據,有點像ondemand governor,採集cpuloading,也是選擇比較各個cpuloading最大的作爲調節頻率的依據。
  • 這是cpu 的util和max的選擇,還需要根據iowait_boost和iowait_boost_max來確認最終選擇的util和max的數值。iowait boost與正常的util是兩個獨立的分支,需要互相參考挑選最大數值作爲最後的調節頻率的依據。

3.4.2 最後會根據util,max選擇next_f,具體實現在get_next_freq(sg_policy, util, max)

/*
 * get_next_freq - Compute the next frequency request from utilization.
 * @sg_policy: schedutil policy the request is computed for.
 * @util: utilization to translate into a frequency.
 * @max: capacity that @util is relative to.
 *
 * The reference frequency is policy->cpuinfo.max_freq when
 * arch_scale_freq_invariant() is true and policy->cur otherwise.  It is
 * adjusted by the freq_margin tunable, interpreted as a percentage when it
 * lies strictly between -100 and 100, or raised by a fixed 25% otherwise,
 * and then scaled by @util / @max.
 *
 * Returns sg_policy->next_freq directly when the raw result equals the
 * cached one and next_freq is valid (not UINT_MAX); otherwise the raw value
 * is cached and resolved through cpufreq_driver_resolve_freq().
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                  unsigned long util, unsigned long max)
{
    struct cpufreq_policy *policy = sg_policy->policy;
    unsigned int freq = arch_scale_freq_invariant() ?
                policy->cpuinfo.max_freq : policy->cur;
    /*
     * freq_margin is configured up front and, per the original author,
     * differs between big and little cores.
     */
    int freq_margin = sg_policy->tunables->freq_margin;

    /*
     * CPUs in min_cap_cpu_mask get their utilization rescaled by
     * capacity_margin.  NOTE(review): the original author quotes
     * capacity_margin = 1138, which would enlarge util - confirm.
     */
    if (cpumask_test_cpu(policy->cpu, &min_cap_cpu_mask))
        util = util * capacity_margin / SCHED_CAPACITY_SCALE;

    if (freq_margin > -100 && freq_margin < 100) {
        /*
         * Turn the percentage into a frequency delta with signed
         * arithmetic.  Multiplying the unsigned freq by a negative
         * freq_margin would convert the margin to unsigned and wrap to a
         * huge value, so negative margins would be ignored in effect.
         */
        freq_margin = ((int)freq * freq_margin) / 100;
        freq = ((int)freq + freq_margin) * util / max;
    } else {
        freq = (freq + (freq >> 2)) * util / max;  /* 1.25 * freq */
    }

    /* Same raw value as last time: reuse the already resolved frequency. */
    if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
        return sg_policy->next_freq;
    sg_policy->cached_raw_freq = freq;
    return cpufreq_driver_resolve_freq(policy, freq);  /* pick the target frequency */
}

cached_raw_freq保存的是上一次計算出的原始頻率值(尚未經過cpufreq_driver_resolve_freq轉換)。如果本次計算結果與它一致,並且next_freq有效(不等於UINT_MAX),就直接返回上次的next_freq,不用再次選擇target_freq。

3.5 sugov_update_commit(sg_policy, time, next_f)觸發變頻需求

  • sugov_up_down_rate_limit這個函數用來作爲頻率調整的判斷依據,比如是否符合升頻的時間限制,降頻的時間限制。
  • 根據選擇的next freq數值來修訂slack_timer是否執行
  • 如果選擇的next freq==sg_policy->next_freq頻率不做調整
  • 更新sg_policy->next_freq=next_freq,sg_policy->last_freq_update_time=time
  • 最後設置work_in_progress標誌位爲true,同時執行worker裏面函數,執行sugov_irq_work---->sugov_work—> __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,CPUFREQ_RELATION_L);基本上頻率調節結束了。

4.kernel在什麼時候觸發governor去做頻率的調整

以前我們知道interactive/ondemand governor都自帶timer去主動收集cpu loading來做決策是否需要頻率的調整,但是從schedutil governor看,並沒有看到什麼時候主動去計算負載,然後做頻率的調整。
從第二章可以看到,集中點都在這個函數上:cpufreq_update_util,下面是系統調用它的地方:

kernel/sched/fair.c:3163:       cpufreq_update_util(rq, 0);  
kernel/sched/fair.c:4847:       cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);  
kernel/sched/rt.c:1007: cpufreq_update_util(rq, SCHED_CPUFREQ_RT);  
kernel/sched/deadline.c:759:    cpufreq_update_util(rq, SCHED_CPUFREQ_DL);

可以看到flags參數按sched_class分爲三類:RT(flags=1),DL(flags=2),FAIR(iowait(flags=4) or not iowait(flags=0))
下面說明在什麼時機調用cpufreq_update_util函數:
這裏寫圖片描述

對於sched class怎麼去調用,從何處去調用,後面再研究。如有錯誤請及時告知,謝謝。
最後,對於現存的cpufreq governor全部分析完畢:

  1. powersave
  2. performance
  3. userspace
  4. ondemand
  5. conservative
  6. interactive
  7. schedutil
  8. sched(schedfreq) https://blog.csdn.net/wukongmingjing/article/details/81949693
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章