[cpufreq governor] schedutil governor解析

1.schedutil governor相關的結構體說明

/* Per-policy state of the schedutil governor, as quoted by this article. */
struct sugov_policy {  
    struct cpufreq_policy *policy;  /* the cpufreq policy being governed */
  
    struct sugov_tunables *tunables;  /* tunables structure; changed according to user needs */
    struct list_head tunables_hook;/* list hook for the tunables structure (per the article) */  
  
    raw_spinlock_t update_lock;  /* For shared policies */
    /*
     * Four time values: the first is the time of the last frequency
     * change; the other three set the granularity (rate limits) of
     * frequency changes.
     */
    u64 last_freq_update_time;  
    s64 min_rate_limit_ns;  
    s64 up_rate_delay_ns;  
    s64 down_rate_delay_ns;  
    /*
     * next_freq is the selected next frequency; cached_raw_freq keeps the
     * last raw frequency computed before it was resolved to a target.
     */
    unsigned int next_freq;  
    unsigned int cached_raw_freq;  
    /* Slack timer, aimed at idle CPUs (per the article). */
    struct timer_list slack_timer;  
    /* The next fields are only needed if fast switch cannot be used. */  
    /* The four work-related members below all end up on the same call path. */
    struct irq_work irq_work;  
    struct kthread_work work;  
    struct mutex work_lock;  
    struct kthread_worker worker;  
    /* Governor thread. */
    struct task_struct *thread;  
    /* True while a frequency change is in progress; cleared when it is done. */
    bool work_in_progress;  
    /* Set when the frequency limits change; cleared on the next frequency update. */
      bool need_freq_update;  
};  
 /*
  * One such structure exists per CPU. If the frequency is shared, changing
  * the frequency of any one CPU also affects the other CPUs; in general they
  * all use the same policy.
  */
struct sugov_cpu {  
    struct update_util_data update_util;  
   /* CPUs of one policy all point to the same sugov_policy, hence the same cpufreq_policy. */
    struct sugov_policy *sg_policy;    
    unsigned int cpu;  /* id of the CPU this entry belongs to */
    /* Whether an iowait boost is pending, the iowait boost frequency, and its upper bound. */
    bool iowait_boost_pending;  
    unsigned int iowait_boost;  
    unsigned int iowait_boost_max;  
    u64 last_update;  /* time util/max were last updated */
  
    /* The fields below are only needed when sharing a policy. */  
    unsigned long util;  
    unsigned long max;  
    unsigned int flags;  
  
    /* The field below is for single-CPU policies only. */  
#ifdef CONFIG_NO_HZ_COMMON  
    unsigned long saved_idle_calls;  
#endif  
};  
/* Tunables, i.e. the values adjustable from user space. */
struct sugov_tunables {  
    struct gov_attr_set attr_set;/* sysfs interface attributes */  
    unsigned int up_rate_limit_us;  /* minimum interval between frequency increases */
    unsigned int down_rate_limit_us;/* minimum interval between frequency decreases */  
    unsigned int timer_slack_val_us;  /* per the article: while in cpuidle, a timer is
      started to adjust the idle CPU's frequency -- NOTE(review): original wording was garbled, confirm */
    int freq_margin;  /* frequency margin; tunable; differs between big and little cores */
};

2.schedutil governor如何調節cpu頻率

/*
 * Notify cpufreq that the utilization of @cfs_rq changed.
 *
 * Only the runqueue's own top-level cfs_rq triggers an update. There are a
 * few boundary cases this might miss, but it should get called often enough
 * that this is (hopefully) not a real problem. It only calls on the local
 * CPU, so a remote enqueue misses an update, but the next tick/schedule
 * should update.
 *
 * It will not get called when we go idle, because the idle thread is a
 * different class (!fair), nor will the utilization number include things
 * like RT tasks.
 *
 * As is, the util number is not freq-invariant (we'd have to implement
 * arch_scale_freq_capacity() for that).
 *
 * See cpu_util().
 */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
    struct rq *rq = rq_of(cfs_rq);

    /* Anything other than the runqueue's embedded cfs_rq is ignored. */
    if (&rq->cfs != cfs_rq)
        return;

    cpufreq_update_util(rq, 0);
}

繼續

#ifdef CONFIG_CPU_FREQ  
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);  
  
/** 
 * cpufreq_update_util - Take a note about CPU utilization changes. 
 * @rq: Runqueue to carry out the update for. 
 * @flags: Update reason flags. 
 * 
 * This function is called by the scheduler on the CPU whose utilization is 
 * being updated. 
 * 
 * It can only be called from RCU-sched read-side critical sections. 
 * 
 * The way cpufreq is currently arranged requires it to evaluate the CPU 
 * performance state (frequency/voltage) on a regular basis to prevent it from 
 * being stuck in a completely inadequate performance level for too long. 
 * That is not guaranteed to happen if the updates are only triggered from CFS, 
 * though, because they may not be coming in if RT or deadline tasks are active 
 * all the time (or there are RT and DL tasks only). 
 * 
 * As a workaround for that issue, this function is called by the RT and DL 
 * sched classes to trigger extra cpufreq updates to prevent it from stalling, 
 * but that really is a band-aid.  Going forward it should be replaced with 
 * solutions targeted more specifically at RT and DL tasks. 
 */  
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
    /* Fetch the hook published for the CPU that owns @rq. */
    struct update_util_data *hook =
        rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
                           cpu_of(rq)));

    /* No hook installed for this CPU: nothing to notify. */
    if (!hook)
        return;

    /* Hand the hook itself, the runqueue clock and the reason flags to the callback. */
    hook->func(hook, rq_clock(rq), flags);
}
#else  
/* CONFIG_CPU_FREQ is not set: utilization updates compile to a no-op. */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}  
#endif /* CONFIG_CPU_FREQ */

關鍵點是struct update_util_data這個結構體，僅僅是一個callback函數：

#ifdef CONFIG_CPU_FREQ  
/* Hook object: it holds nothing but the callback invoked by cpufreq_update_util(). */
struct update_util_data {  
    /* Called with the hook itself, a timestamp and the update-reason flags. */
    void (*func)(struct update_util_data *data, u64 time, unsigned int flags);  
};  
  
/* Install @func as the utilization-update callback of @cpu, published through @data. */
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,  
                       void (*func)(struct update_util_data *data, u64 time,  
                                    unsigned int flags));  
/* Presumably the counterpart that clears @cpu's hook; its definition is not shown in this article. */
void cpufreq_remove_update_util_hook(int cpu);  
#endif /* CONFIG_CPU_FREQ */

接下來看下這個結構體與函數cpufreq_add_update_util_hook的關係是什麼：

DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);  
  
/** 
 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 
 * @cpu: The CPU to set the pointer for. 
 * @data: New pointer value. 
 * @func: Callback function to set for the CPU. 
 * 
 * Set and publish the update_util_data pointer for the given CPU. 
 * 
 * The update_util_data pointer of @cpu is set to @data and the callback 
 * function pointer in the target struct update_util_data is set to @func. 
 * That function will be called by cpufreq_update_util() from RCU-sched 
 * read-side critical sections, so it must not sleep.  @data will always be 
 * passed to it as the first argument which allows the function to get to the 
 * target update_util_data structure and its container. 
 * 
 * The update_util_data pointer of @cpu must be NULL when this function is 
 * called or it will WARN() and return with no effect. 
 */  
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,  
            void (*func)(struct update_util_data *data, u64 time,  
                     unsigned int flags))  
{  
    /* Both the hook object and its callback are mandatory. */
    if (WARN_ON(!data || !func))  
        return;  
  
    /* Refuse to overwrite a hook that is already installed for this CPU. */
    if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))  
        return;  
  
    /*
     * Fill in the callback first, then publish the pointer with
     * rcu_assign_pointer(); cpufreq_update_util() reads it back with
     * rcu_dereference_sched().
     */
    data->func = func;  
    rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);  
}

可以看到結構體update_util_data的callback函數指向了函數cpufreq_add_update_util_hook鉤子函數的形參：

void (*func)(struct update_util_data *data, u64 time,  
                     unsigned int flags)

那麼這個函數在哪裏賦值呢？
我們看到在kernel/sched/cpufreq_schedutil.c文件，就是最新的cpu調節頻率的governor，不再是原先的interactive或者ondemand governor了。
作爲頻率調節的governor，編寫流程與其他governor類似，先註冊名字爲schedutil的governor：

/*
 * The schedutil governor object. It is file-local (static) unless
 * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is set.
 */
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL  
static  
#endif  
struct cpufreq_governor cpufreq_gov_schedutil = {  
    .name = "schedutil",  
    .governor = cpufreq_schedutil_cb,  /* event callback, see cpufreq_schedutil_cb() */
    .owner = THIS_MODULE,  
};  
  
/*
 * Register the schedutil governor via cpufreq_register_governor() and
 * return its result. Hooked up as an fs_initcall.
 */
static int __init sugov_register(void)  
{  
    return cpufreq_register_governor(&cpufreq_gov_schedutil);  
}  
fs_initcall(sugov_register);

註冊之後，governor開始走governor的callback函數cpufreq_schedutil_cb：

/*
 * Governor event callback: dispatch @event to the matching sugov_*()
 * handler for @policy and return that handler's result. An unknown event
 * is a BUG().
 */
static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
                unsigned int event)
{
    int ret;

    switch (event) {
    case CPUFREQ_GOV_POLICY_INIT:
        ret = sugov_init(policy);
        break;
    case CPUFREQ_GOV_POLICY_EXIT:
        ret = sugov_exit(policy);
        break;
    case CPUFREQ_GOV_START:
        ret = sugov_start(policy);
        break;
    case CPUFREQ_GOV_STOP:
        ret = sugov_stop(policy);
        break;
    case CPUFREQ_GOV_LIMITS:
        ret = sugov_limits(policy);
        break;
    default:
        BUG();
    }

    return ret;
}

開始執行init，然後執行start，根據event類型來執行。系統剛剛起來執行init和start動作，init是一些參數的初始化，而start纔是真正的governor開啓work了。

/*
 * Start governing @policy: derive the nanosecond rate limits from the
 * tunables, reset the per-policy state, then reset each CPU's sugov_cpu and
 * install its utilization-update hook. Always returns 0.
 */
static int sugov_start(struct cpufreq_policy *policy)  
{  
    struct sugov_policy *sg_policy = policy->governor_data;  
    unsigned int cpu;  
  
    /* Convert the microsecond tunables into nanosecond delays. */
    sg_policy->up_rate_delay_ns =  
        sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;  
    sg_policy->down_rate_delay_ns =  
        sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;  
    update_min_rate_limit_us(sg_policy);  
    sg_policy->last_freq_update_time = 0;  
    /* UINT_MAX marks "no frequency chosen yet"; get_next_freq() checks for it. */
    sg_policy->next_freq = UINT_MAX;  
    sg_policy->work_in_progress = false;  
    sg_policy->need_freq_update = false;  
    sg_policy->cached_raw_freq = UINT_MAX;  
  
    for_each_cpu(cpu, policy->cpus) {  
        struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);  
  
        /* Start from a clean per-CPU state before filling it in. */
        memset(sg_cpu, 0, sizeof(*sg_cpu));  
        sg_cpu->cpu = cpu;  
        sg_cpu->sg_policy = sg_policy;  
        sg_cpu->flags = SCHED_CPUFREQ_DL;  
        sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;  
                /* This is where the real callback of struct update_util_data is finally installed. */  
        cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,  
                         policy_is_shared(policy) ?  
                            sugov_update_shared :  
                            sugov_update_single);  
    }  
    return 0;  
}  
/*
 * Returns true when more than one CPU is set in policy->cpus.
 * NOTE(review): the article states this "always returns true"; that only
 * holds when every policy spans several CPUs -- confirm for the target platform.
 */  
static inline bool policy_is_shared(struct cpufreq_policy *policy)  
{  
    return cpumask_weight(policy->cpus) > 1;  
}

3.sugov_update_shared函數怎麼計算得到next_freq
可以看到這個函數的實現code如下：

/*
 * Utilization-update callback used when the policy is shared by several
 * CPUs (sugov_start() installs it when policy_is_shared() is true).
 * @hook:  the update_util member embedded in this CPU's struct sugov_cpu.
 * @time:  timestamp supplied by the caller.
 * @flags: SCHED_CPUFREQ_* update-reason flags.
 */
static void sugov_update_shared(struct update_util_data *hook, u64 time,  
                unsigned int flags)  
{  
    struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);  
    struct sugov_policy *sg_policy = sg_cpu->sg_policy;  
    unsigned long util, max;  
    unsigned int next_f;  
  
    /* Sample this CPU's util/max before taking the policy lock. */
    sugov_get_util(&util, &max, time, sg_cpu->cpu);  
  
    raw_spin_lock(&sg_policy->update_lock);  
  
    /* Record the fresh sample in the per-CPU state. */
    sg_cpu->util = util;  
    sg_cpu->max = max;  
    sg_cpu->flags = flags;  
  
    sugov_set_iowait_boost(sg_cpu, time, flags);  
    sg_cpu->last_update = time;  
  
    if (sugov_should_update_freq(sg_policy, time)) {  
        /* A deadline-class update asks for the maximum frequency outright. */
        if (flags & SCHED_CPUFREQ_DL)  
            next_f = sg_policy->policy->cpuinfo.max_freq;  
        else  
            next_f = sugov_next_freq_shared(sg_cpu, time);  
  
        sugov_update_commit(sg_policy, time, next_f);  
    }  
  
    raw_spin_unlock(&sg_policy->update_lock);  
}

分別來講解各個重要的函數
3.1 sugov_get_util(&util, &max, time, sg_cpu->cpu)怎麼獲取util/max的數值的。
函數實現如下：

/*
 * Compute the utilization (*util) and capacity (*max) of @cpu at @time.
 * *util is clamped to the CPU's capacity; *max is that capacity.
 */
static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time, int cpu)  
{  
    struct rq *rq = cpu_rq(cpu);  
    unsigned long max_cap, rt;  
    s64 delta;  
    /* max_cap differs per cluster; on the author's platform cluster0 is 782 and cluster1 is 1024. */
    max_cap = arch_scale_cpu_capacity(NULL, cpu);  
  
    /* Age rt_avg first so that rq->age_stamp is up to date. */
    sched_avg_update(rq);  
    /* Time elapsed since age_stamp, never negative. */
    delta = time - rq->age_stamp;  
    if (unlikely(delta < 0))  
        delta = 0;  
    /* RT load averaged over one period plus delta, then scaled to capacity units. */
    rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);  
    rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;  
  
    *util = boosted_cpu_util(cpu);  
    /* The RT contribution is added only when use_pelt() is true. */
    if (likely(use_pelt()))  
        *util = *util + rt;  
  
    *util = min(*util, max_cap);  
    *max = max_cap;  
}

sched_avg_update(rq)，是一個update sched avg負載使用的：

const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;  
static inline u64 sched_avg_period(void)  
{  
    return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;  
}  
/*
 * Age rq->rt_avg: for every full period that has elapsed since
 * rq->age_stamp, advance age_stamp by one period and halve rt_avg.
 */
void sched_avg_update(struct rq *rq)  
{       /* With the default sysctl value the period is 500ms. */  
    s64 period = sched_avg_period();  
        /*
         * age_stamp is the start time of the current CPU rq's window.
         * The loop serves two purposes:
         * 1. decay the RT load by half for every period (the aging period);
         * 2. advance the age_stamp window until it is close to rq_clock, so
         *    that each computation only covers the load of the current period.
         */  
    while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {  
        /* 
         * Inline assembly required to prevent the compiler 
         * optimising this loop into a divmod call. 
         * See __iter_div_u64_rem() for another example of this. 
         */  
        asm("" : "+rm" (rq->age_stamp));  
        rq->age_stamp += period;  
        rq->rt_avg /= 2;  
    }  
}

下面這段代碼的意思是，計算一個週期內的rt負載並歸一化爲capacity數值：

/* Time elapsed since age_stamp, clamped so it is never negative. */
delta = time - rq->age_stamp;  
if (unlikely(delta < 0))  
    delta = 0;  
/* Average rt_avg over one period plus delta ... */
rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);  
/* ... and scale the result into capacity units. */
rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;

boosted_cpu_util(cpu)怎麼得到util的，對於函數schedtune_cpu_margin的實現以後再仔細check（看這篇文章：https://blog.csdn.net/wukongmingjing/article/details/81739394），本文不講解。

/*
 * Return the utilization of @cpu plus the margin computed by
 * schedtune_cpu_margin(); the values are also reported through
 * trace_sched_boost_cpu().
 */
unsigned long  
boosted_cpu_util(int cpu)  
{  
    unsigned long util = cpu_util_freq(cpu);  
    /* TODO (article author): check carefully how the margin is computed. */
    long margin = schedtune_cpu_margin(util, cpu);  
  
    trace_sched_boost_cpu(cpu, util, margin);  
  
    return util + margin;  
}  
  
/*
 * Utilization of @cpu used for frequency selection, clamped to
 * capacity_orig_of(@cpu).
 */
static inline unsigned long cpu_util_freq(int cpu)
{
    /* Upper bound; per the article this is each cluster's max capacity. */
    unsigned long cap = capacity_orig_of(cpu);
    /* Default source: util_avg of the CPU's CFS runqueue. */
    unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;

#ifdef CONFIG_SCHED_WALT
    /*
     * When WALT is in use, take the utilization from WALT instead: the
     * runnable time accumulated over the windows, normalized by
     * walt_ravg_window. The result is kept within 0..capacity below.
     */
    if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
        util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
                 walt_ravg_window >> SCHED_LOAD_SHIFT);
#endif
    if (util >= cap)
        return cap;

    return util;
}

最後得到util和max數值。注意代碼中只有在use_pelt()爲真（即使用PELT而不是WALT計算cpu util）時，才會把rt累加進去：util = util(普通進程) + rt(實時進程)；否則util僅爲boosted_cpu_util()的返回值。最後util = min(util,max_cap),max=max_cap；計算完畢。max就是各個cluster的每個core的capacity，是一個固定數值，可能在thermal起作用的情況下會變小，這個需要仔細check下。

3.2 sugov_set_iowait_boost(sg_cpu, time, flags)怎麼設置iowait_boost數值。

繼續執行sugov_update_shared函數，更新sugov_cpu結構體元素；
根據flags數值：如果flags置了SCHED_CPUFREQ_IOWAIT位（1U << 2，即數值4），則是iowait boost情況，並且有一個iowait_boost_pending標誌位判斷當前是否已經是iowait狀態。如果已經是則直接return，否則根據iowait_boost是否有數值來設定iowait_boost的頻率數值。
如果flags爲其他數值，並且iowait_boost存在數值，如果計算load的間隔超過一個tickless時間，則判斷是idle狀態，將iowait_boost和pending標誌位清零。等待下次計算週期再查看iowait狀態。
flags爲0，是沒有iowait的普通進程。

#define SCHED_CPUFREQ_RT        (1U << 0)  /* update from sched_class rt */
#define SCHED_CPUFREQ_DL        (1U << 1)  /* update from sched_class deadline */
#define SCHED_CPUFREQ_IOWAIT    (1U << 2)  /* sched_class fair && task->in_iowait != 0 */

3.3 sugov_should_update_freq(sg_policy, time)是否需要進行頻率update，判定若干個標誌位

dvfs_possible_from_any_cpu，即每個cpu可以單獨調節電壓並傳遞給其他cpu一起調節，默認爲true
fast_switch_enabled，快速頻率切換是否enable，默認false
work_in_progress：是否正在調節頻率，調節頻率之前置爲true，調節頻率之後置爲false，默認false
need_freq_update，默認false，只有在governor limit階段置爲true。
最後判定rq_clock-last_freq_update_time的數值與min_rate_limit_ns比較得出是否需要update frequency。也就是頻率調節的最小間隔，小於此間隔不予調節。

/*
 * Recompute sg_policy->min_rate_limit_ns as the smaller of the up and down
 * rate delays, under min_rate_lock.
 */
static void update_min_rate_limit_us(struct sugov_policy *sg_policy)  
{  
    mutex_lock(&min_rate_lock);  
        /*
         * I.e. min(up_rate_limit_us, down_rate_limit_us), stored in ns;
         * the article quotes min(500, 1000) us for the author's setup.
         */  
    sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,  
                       sg_policy->down_rate_delay_ns);  
    mutex_unlock(&min_rate_lock);  
}

如果3.3函數返回true，則執行3.4/3.5，否則直接返回，不做頻率調整。

3.4 flags不同，如何選擇next_f，即下一個cpu frequency

flags==SCHED_CPUFREQ_DL，next_f = cpuinfo.max_freq
其他flags走下面的，對所有cpu，根據sugov_cpu的util，max，iowait_boost，iowait_boost_max數值選擇所有cpu裏面的max*util最大的一對。每個cpu都有一個util，max，iowait_boost,iowait_boost_max=cpuinfo.max_freq，具體怎麼計算的看下code一目瞭然。比較簡單。在函數sugov_next_freq_shared裏面實現的。

3.4.1 在函數sugov_next_freq_shared裏面會遍歷所有的cpu，遍歷規則如下：

在sugov_update_shared函數一開始，我們就獲取了當前cpu的util和max；
每次遍歷一個cpu，比較(j_util *max > j_max *util),則util=j_util,max=j_max，目的挑選最大的。max一般都是固定數值，還是選擇cpu最大的util作爲調節頻率的依據，有點像ondemand governor，採集cpuloading，也是選擇比較各個cpuloading最大的作爲調節頻率的依據。
這是cpu 的util和max的選擇，還需要根據iowait_boost和iowait_boost_max來確認最終選擇的util和max的數值。iowait boost與正常的util是兩個獨立的分支，需要互相參考挑選最大數值作爲最後的調節頻率的依據。

3.4.2 最後會根據util,max選擇next_f，具體實現在get_next_freq(sg_policy, util, max)

/*
 * get_next_freq - Compute the next frequency for a policy from util/max.
 * @sg_policy: schedutil policy being evaluated.
 * @util: utilization selected by the caller.
 * @max: capacity that @util is measured against.
 *
 * The raw frequency is the reference frequency plus a margin, scaled by
 * util / max. If the raw value equals the cached one and a next_freq has
 * already been chosen, the previous next_freq is returned; otherwise the raw
 * value is cached and resolved through cpufreq_driver_resolve_freq().
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                  unsigned long util, unsigned long max)
{
    struct cpufreq_policy *policy = sg_policy->policy;
    /* Reference frequency: max_freq when frequency-invariant, else the current frequency. */
    unsigned int freq = arch_scale_freq_invariant() ?
                policy->cpuinfo.max_freq : policy->cur;
    /* Margin in percent; per the article it is preset and differs for big/little cores (min_cap_cpu_mask). */
    int freq_margin = sg_policy->tunables->freq_margin;

    /* Inflate util for CPUs in min_cap_cpu_mask (the article quotes capacity_margin = 1138). */
    if (cpumask_test_cpu(policy->cpu, &min_cap_cpu_mask))
        util = util * capacity_margin / SCHED_CAPACITY_SCALE;

    /* Apply the configured margin when it is within (-100, 100) percent. */
    if (freq_margin > -100 && freq_margin < 100) {
        /*
         * Compute the margin in signed arithmetic. With the unsigned form
         * "(freq * freq_margin) / 100" a negative freq_margin is converted
         * to unsigned int before the multiplication, so a negative margin
         * could never be applied correctly.
         */
        freq_margin = ((int)freq * freq_margin) / 100;
        freq = ((int)freq + freq_margin) * util / max;
    } else {
        freq = (freq + (freq >> 2)) * util / max;  /* default margin: 1.25 * freq */
    }

    /* Same raw frequency as last time and a valid next_freq: reuse it. */
    if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
        return sg_policy->next_freq;
    sg_policy->cached_raw_freq = freq;
    return cpufreq_driver_resolve_freq(policy, freq);  /* pick the target frequency */
}

cached_raw_freq是保存的上次計算出的原始頻率值，如果本次計算結果與之一致（並且next_freq不是UINT_MAX），就直接返回上次的next_freq，不用再次選擇target_freq

3.5 sugov_update_commit(sg_policy, time, next_f)觸發變頻需求

sugov_up_down_rate_limit這個函數用來作爲頻率調整的判斷依據，比如是否符合升頻的時間限制，降頻的時間限制。
根據選擇的next freq數值來修訂slack_timer是否執行
如果選擇的next freq==sg_policy->next_freq頻率不做調整
更新sg_policy->next_freq=next_freq，sg_policy->last_freq_update_time=time
最後設置work_in_progress標誌位爲true，同時執行worker裏面函數，執行sugov_irq_work---->sugov_work—> __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,CPUFREQ_RELATION_L);基本上頻率調節結束了。

4.kernel在什麼時候觸發governor去做頻率的調整

以前我們知道interactive/ondemand governor都自帶timer去主動收集cpu loading來做決策是否需要頻率的調整，但是從schedutil governor看，並沒有看到什麼時候主動去計算負載，然後做頻率的調整。
從第一章，看到，集中點都在這個函數上：cpufreq_update_util，下面是系統調用的地方

kernel/sched/fair.c:3163:       cpufreq_update_util(rq, 0);  
kernel/sched/fair.c:4847:       cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);  
kernel/sched/rt.c:1007: cpufreq_update_util(rq, SCHED_CPUFREQ_RT);  
kernel/sched/deadline.c:759:    cpufreq_update_util(rq, SCHED_CPUFREQ_DL);

可以看到flags參數分爲三類sched_class，RT(flags=1)，DL(flags=2)，FAIR(iowait(flags=4) or not iowait(flags=0))
目的是弄清在什麼時機調用cpufreq_update_util函數：

對於sched class怎麼去調用，從何處去調用，後面再研究。如有錯誤請及時告知，謝謝。
最後，對於現存的cpufreq governor全部分析完畢：

powersave
performance
userspace
ondemand
conservative
interactive
schedutil
sched(schedfreq) https://blog.csdn.net/wukongmingjing/article/details/81949693

[cpufreq governor] schedutil governor解析

《Python進階》學習筆記

Leetcode 3161. 物塊放置查詢

一個docker容器暴露多個端口

leetcode 60 排列序列

微服務實踐之使用 Visual Studio 2022 調試Dapr 應用程序

wpf附加屬性理解 WPF附加屬性

Analysis and Solution for Cpuidle Power Nightmare

[power]二. Dynamic voltage and frequency scaling(DVFS)簡單概述

Kernel space lock contention配置及其使用

[Python解析systrace.html]chrome打開systrace分析，圖形顯示時間點與文本時間點一一對應，方便debug使用

pr_emerg耗時，影響性能原理排查

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結