1.schedutil governor相關的結構體說明
/*
 * Per-policy state of the schedutil governor.
 * Field notes are translated from the article's original annotations.
 */
struct sugov_policy {
struct cpufreq_policy *policy; /* the cpufreq policy this governor instance drives */
struct sugov_tunables *tunables; /* tunables structure, changed according to user needs */
struct list_head tunables_hook;/* list hook for the tunables structure */
raw_spinlock_t update_lock; /* For shared policies */
/*
 * Four time values: the first is the time of the last frequency change,
 * the other three define the granularity (minimum spacing) of changes.
 * sugov_start() derives the up/down delays from the tunables and
 * update_min_rate_limit_us() stores the smaller of the two.
 */
u64 last_freq_update_time;
s64 min_rate_limit_ns;
s64 up_rate_delay_ns;
s64 down_rate_delay_ns;
/*
 * next_freq is the frequency selected next; cached_raw_freq is the last
 * raw frequency value remembered by get_next_freq().
 */
unsigned int next_freq;
unsigned int cached_raw_freq;
/* slack timer, intended for idle CPUs (per the original note) */
struct timer_list slack_timer;
/* The next fields are only needed if fast switch cannot be used. */
/* The four work-related members below all end up on the same call path (per the original note). */
struct irq_work irq_work;
struct kthread_work work;
struct mutex work_lock;
struct kthread_worker worker;
/* governor thread */
struct task_struct *thread;
/* true while a frequency change is in progress; cleared when it completes */
bool work_in_progress;
/* set when the frequency limits change, cleared on the next frequency update */
bool need_freq_update;
};
/*
 * Per-CPU governor state; one instance exists for every CPU. When the
 * frequency is shared, changing the frequency of any one CPU also affects
 * the other CPUs; such CPUs generally have the same policy.
 */
struct sugov_cpu {
struct update_util_data update_util;
/* sugov_start() points every CPU in policy->cpus at the same sugov_policy (and thus the same cpufreq_policy) */
struct sugov_policy *sg_policy;
unsigned int cpu; /* id of the associated CPU */
/*
 * iowait_boost_pending: whether the CPU is currently in the iowait state
 * (per the original note); iowait_boost: the boost frequency;
 * iowait_boost_max: its upper bound (set to cpuinfo.max_freq in sugov_start()).
 */
bool iowait_boost_pending;
unsigned int iowait_boost;
unsigned int iowait_boost_max;
u64 last_update; /* time at which util/max were last updated */
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
unsigned int flags;
/* The field below is for single-CPU policies only. */
#ifdef CONFIG_NO_HZ_COMMON
unsigned long saved_idle_calls;
#endif
};
/* Tunables, i.e. the values that can be adjusted from user space. */
struct sugov_tunables {
struct gov_attr_set attr_set;/* sysfs interface attributes */
unsigned int up_rate_limit_us; /* minimum interval (us) between frequency increases */
unsigned int down_rate_limit_us;/* minimum interval (us) between frequency decreases */
unsigned int timer_slack_val_us; /* during cpuidle, a timer is started to adjust the idle CPU's frequency (per the original note) */
int freq_margin; /* frequency margin; adjustable, distinguishes big/little cores */
};
2.schedutil governor如何調節cpu頻率
/*
 * Report a utilization change of @cfs_rq to cpufreq.
 *
 * Only the cfs_rq embedded in the runqueue itself (rq->cfs) triggers a
 * call to cpufreq_update_util(); any other cfs_rq is ignored.
 */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
	struct rq *owner = rq_of(cfs_rq);

	if (cfs_rq != &owner->cfs)
		return;

	/*
	 * Known limitations of this notification:
	 *
	 * - A few boundary cases may be missed, but the hook should fire
	 *   often enough for that not to be a real problem. It only runs
	 *   on the local CPU, so a remote enqueue misses an update; the
	 *   next tick/schedule should catch up.
	 *
	 * - It is not called when going idle, because the idle thread is
	 *   a different class (!fair), and the utilization number does
	 *   not include things like RT tasks.
	 *
	 * - As is, the util number is not freq-invariant (that would need
	 *   arch_scale_freq_capacity() to be implemented).
	 *
	 * See cpu_util().
	 */
	cpufreq_update_util(owner, 0);
}
繼續
#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

/**
 * cpufreq_update_util - Take a note about CPU utilization changes.
 * @rq: Runqueue to carry out the update for.
 * @flags: Update reason flags.
 *
 * Called by the scheduler on the CPU whose utilization is being updated,
 * and only from RCU-sched read-side critical sections. If a hook has been
 * installed for that CPU, its callback is invoked with the runqueue clock
 * and @flags; otherwise nothing happens.
 *
 * cpufreq, as currently arranged, has to re-evaluate the CPU performance
 * state (frequency/voltage) regularly so that it does not stay stuck at a
 * completely inadequate level for too long. Updates triggered from CFS
 * alone cannot guarantee that, since they may stop arriving while RT or
 * deadline tasks are active all the time (or when only RT and DL tasks
 * exist).
 *
 * As a workaround, the RT and DL sched classes also call this function to
 * trigger extra cpufreq updates. That really is a band-aid; going forward
 * it should be replaced with solutions targeted more specifically at RT
 * and DL tasks.
 */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
	struct update_util_data *hook =
		rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
						   cpu_of(rq)));

	/* No governor hook installed for this CPU. */
	if (!hook)
		return;

	hook->func(hook, rq_clock(rq), flags);
}
#else
/* Without CONFIG_CPU_FREQ there is nothing to notify. */
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
關鍵點是struct update_util_data這個結構體,僅僅是一個callback函數:
#ifdef CONFIG_CPU_FREQ
/*
 * Utilization-update hook: just a callback pointer. The callback receives
 * the hook itself, a timestamp and the update reason flags.
 */
struct update_util_data {
void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
};
/* Install @data with callback @func as the utilization-update hook of @cpu. */
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
unsigned int flags));
/* Counterpart of cpufreq_add_update_util_hook() for @cpu (definition not shown here). */
void cpufreq_remove_update_util_hook(int cpu);
#endif /* CONFIG_CPU_FREQ */
接下來看下這個結構體與函數cpufreq_add_update_util_hook的關係是什麼:
/* Per-CPU pointer to the currently installed utilization-update hook. */
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
 * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
 * @cpu: The CPU to set the pointer for.
 * @data: New pointer value.
 * @func: Callback function to set for the CPU.
 *
 * Set and publish the update_util_data pointer for the given CPU.
 *
 * The update_util_data pointer of @cpu is set to @data and the callback
 * function pointer in the target struct update_util_data is set to @func.
 * That function will be called by cpufreq_update_util() from RCU-sched
 * read-side critical sections, so it must not sleep. @data will always be
 * passed to it as the first argument which allows the function to get to the
 * target update_util_data structure and its container.
 *
 * The update_util_data pointer of @cpu must be NULL when this function is
 * called or it will WARN() and return with no effect.
 */
void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
unsigned int flags))
{
/* Both the hook structure and its callback are mandatory. */
if (WARN_ON(!data || !func))
return;
/* Refuse to overwrite a hook that is already installed for this CPU. */
if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
return;
/* Store the callback first, then publish the pointer that readers dereference. */
data->func = func;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}
可以看到結構體update_util_data的callback函數指向了函數cpufreq_add_update_util_hook鉤子函數的形參:
void (*func)(struct update_util_data *data, u64 time,
unsigned int flags)
那麼這個函數在哪裏賦值呢?
我們看到在kernel/sched/cpufreq_schedutil.c文件,就是最新的cpu調節頻率的governor,不再是原先的interactive或者ondemand governor了。
作爲頻率調節的governor,編寫流程與其他governor類似,先註冊名字爲schedutil的governor:
/*
 * The schedutil governor object. It has internal linkage (static) unless
 * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is defined.
 */
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
static
#endif
struct cpufreq_governor cpufreq_gov_schedutil = {
.name = "schedutil",
.governor = cpufreq_schedutil_cb, /* event dispatcher: init/exit/start/stop/limits */
.owner = THIS_MODULE,
};
/*
 * Register the schedutil governor with the cpufreq core.
 * Runs as an fs_initcall and returns whatever
 * cpufreq_register_governor() reports.
 */
static int __init sugov_register(void)
{
	int ret = cpufreq_register_governor(&cpufreq_gov_schedutil);

	return ret;
}
fs_initcall(sugov_register);
註冊之後,governor開始走governor的callback函數cpufreq_schedutil_cb:
/*
 * Governor event dispatcher.
 *
 * Routes each cpufreq governor @event for @policy to the matching sugov_*
 * handler and returns that handler's result. Any event other than the
 * five handled below is treated as a bug.
 */
static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
				unsigned int event)
{
	if (event == CPUFREQ_GOV_POLICY_INIT)
		return sugov_init(policy);

	if (event == CPUFREQ_GOV_POLICY_EXIT)
		return sugov_exit(policy);

	if (event == CPUFREQ_GOV_START)
		return sugov_start(policy);

	if (event == CPUFREQ_GOV_STOP)
		return sugov_stop(policy);

	if (event == CPUFREQ_GOV_LIMITS)
		return sugov_limits(policy);

	/* Unknown governor event. */
	BUG();
}
開始執行init,然後執行start,根據event類型來執行。系統剛剛起來執行init和start動作,init是一些參數的初始化,而start纔是真正的governor開啓work了。
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
sg_policy->up_rate_delay_ns =
sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
sg_policy->down_rate_delay_ns =
sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_us(sg_policy);
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = UINT_MAX;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_DL;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
/*OK,真正的struct update_util_data的元素的callback函數現真身了。*/
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
sugov_update_single);
}
return 0;
}
/*
 * Tell whether @policy covers more than one CPU.
 *
 * NOTE(review): the original annotation says this always returns true;
 * that presumably holds only on the author's platform, where every policy
 * spans a multi-CPU cluster - confirm before relying on it.
 */
static inline bool policy_is_shared(struct cpufreq_policy *policy)
{
	unsigned int nr_cpus = cpumask_weight(policy->cpus);

	return nr_cpus > 1;
}
3.sugov_update_shared函數怎麼計算得到next_freq
可以看到這個函數的實現code如下:
/*
 * Utilization-update hook used when the policy is shared by several CPUs.
 *
 * @hook:  the update_util member embedded in this CPU's struct sugov_cpu.
 * @time:  timestamp passed in by cpufreq_update_util().
 * @flags: update reason flags (SCHED_CPUFREQ_*).
 *
 * Records this CPU's util/max/flags under the policy's update_lock and,
 * if sugov_should_update_freq() allows it, selects and commits the next
 * frequency for the policy.
 */
static void sugov_update_shared(struct update_util_data *hook, u64 time,
unsigned int flags)
{
/* Recover the per-CPU state that embeds the hook. */
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long util, max;
unsigned int next_f;
/* Sample util/max for this CPU before taking the policy lock. */
sugov_get_util(&util, &max, time, sg_cpu->cpu);
raw_spin_lock(&sg_policy->update_lock);
sg_cpu->util = util;
sg_cpu->max = max;
sg_cpu->flags = flags;
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
/* A deadline-class update requests the maximum frequency outright. */
if (flags & SCHED_CPUFREQ_DL)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu, time);
sugov_update_commit(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
分別來講解各個重要的函數
3.1 sugov_get_util(&util, &max, time, sg_cpu->cpu)怎麼獲取util/max的數值的。
函數實現如下:
/*
 * Compute the utilization and capacity of @cpu for frequency selection.
 *
 * @util: out - boosted CPU utilization (plus the scaled RT contribution
 *        when use_pelt() is true), clamped to the CPU capacity.
 * @max:  out - the CPU capacity from arch_scale_cpu_capacity().
 * @time: timestamp used to age the RT average.
 * @cpu:  CPU to sample.
 */
static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	/*
	 * Capacity differs between clusters; on the author's platform it is
	 * 782 for cluster0 and 1024 for cluster1.
	 */
	unsigned long capacity = arch_scale_cpu_capacity(NULL, cpu);
	unsigned long rt_util, cpu_util;
	s64 age;

	sched_avg_update(rq);

	/* Time elapsed since the runqueue's age stamp, never negative. */
	age = time - rq->age_stamp;
	if (unlikely(age < 0))
		age = 0;

	/* RT load over the averaging period, scaled to capacity units. */
	rt_util = div64_u64(rq->rt_avg, sched_avg_period() + age);
	rt_util = (rt_util * capacity) >> SCHED_CAPACITY_SHIFT;

	cpu_util = boosted_cpu_util(cpu);
	if (likely(use_pelt()))
		cpu_util += rt_util;

	*util = min(cpu_util, capacity);
	*max = capacity;
}
sched_avg_update(rq),是一個update sched avg負載使用的:
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
static inline u64 sched_avg_period(void)
{
return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}
/*
 * Age the runqueue's RT average: for every full period that has elapsed
 * since rq->age_stamp, advance the stamp by one period and halve rt_avg.
 */
void sched_avg_update(struct rq *rq)
{ /* with the default sysctl_sched_time_avg the period is 500 ms */
s64 period = sched_avg_period();
/*
 * age_stamp marks the start of the current window of this CPU's rq
 * (per the original note). The loop serves two purposes:
 * 1. decay the RT load by half for each elapsed period (the ageing
 *    period);
 * 2. advance age_stamp window by window until it is close to
 *    rq_clock(), so that only load within the current period is
 *    accounted each time.
 */
while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
/*
 * Inline assembly required to prevent the compiler
 * optimising this loop into a divmod call.
 * See __iter_div_u64_rem() for another example of this.
 */
asm("" : "+rm" (rq->age_stamp));
rq->age_stamp += period;
rq->rt_avg /= 2;
}
}
下面這段代碼的意思是,計算一個週期內的rt負載並歸一化爲capacity數值:
delta = time - rq->age_stamp;
if (unlikely(delta < 0))
delta = 0;
rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
boosted_cpu_util(cpu)怎麼得到util的,對於函數schedtune_cpu_margin的實現以後再仔細check(看這篇文章:https://blog.csdn.net/wukongmingjing/article/details/81739394),本文不講解。
/*
 * Utilization of @cpu including the schedtune margin.
 *
 * Takes the value from cpu_util_freq(), asks schedtune_cpu_margin() for
 * the margin on top of it (its computation is not covered here), emits
 * the sched_boost_cpu trace event and returns the sum.
 */
unsigned long boosted_cpu_util(int cpu)
{
	unsigned long base = cpu_util_freq(cpu);
	long boost = schedtune_cpu_margin(base, cpu);

	trace_sched_boost_cpu(cpu, base, boost);

	return base + boost;
}
static inline unsigned long cpu_util_freq(int cpu)
{
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
/*各個cluster的max_capacity*/
unsigned long capacity = capacity_orig_of(cpu);
/*按照walt 在各個窗口累加的runnable time/walt_ravg_window歸一化
*load作爲cpu的util數值
* util範圍在0~capacity之間。 util從walt獲取。
*/
#ifdef CONFIG_SCHED_WALT
if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
walt_ravg_window >> SCHED_LOAD_SHIFT);
#endif
return (util >= capacity) ? capacity : util;
}
最後得到util和max數值。從代碼看,只有在use_pelt()爲真(即使用PELT計算cpu util)時,纔會把rt負載加進來:util = util(普通進程) + rt(實時進程);否則(例如使用WALT時)不再額外累加rt。最後util = min(util,max_cap),max=max_cap;計算完畢。max就是各個cluster的每個core的capacity,是一個固定數值,可能在thermal起作用的情況下會變小,這個需要仔細check下。
3.2 sugov_set_iowait_boost(sg_cpu, time, flags)怎麼設置iowait_boost數值。
- 繼續執行sugov_update_shared函數,更新sugov_cpu結構體元素;
- 根據flags數值:如果flags帶有SCHED_CPUFREQ_IOWAIT(即1U << 2,數值爲4),則是iowait boost情況,並且有一個iowait_boost_pending標誌位判斷當前是否已經是iowait狀態。如果已經是則直接return,否則根據iowait_boost是否有數值來設定iowait_boost的頻率數值。
- 如果flags爲其他數值,並且iowait_boost存在數值,如果計算load的間隔超過一個tickless時間,則判斷是idle狀態,將iowait_boost和pending標誌位清零。等待下次計算週期再查看iowait狀態。
- flags爲0,是沒有iowait的普通進程。
/* Update reason flags passed to cpufreq_update_util(). */
#define SCHED_CPUFREQ_RT (1U << 0) /* update comes from sched_class rt */
#define SCHED_CPUFREQ_DL (1U << 1) /* update comes from sched_class deadline */
#define SCHED_CPUFREQ_IOWAIT (1U << 2) /* sched_class fair && task->in_iowait != 0 */
3.3 sugov_should_update_freq(sg_policy, time)是否需要進行頻率update,判定若干個標誌位
- dvfs_possible_from_any_cpu,即每個cpu可以單獨調節電壓並傳遞給其他cpu一起調節,默認爲true
- fast_switch_enabled,快速頻率切換是否enable,默認false
- work_in_progress:是否正在調節頻率,調節頻率之前置爲true,調節頻率之後置爲false,默認false
- need_freq_update,默認false,只有在governor limit階段置爲true。
- 最後判定rq_clock-last_freq_update_time的數值與min_rate_limit_ns比較得出是否需要update frequency。也就是頻率調節的最小間隔,小於此間隔不予調節。
static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
{
mutex_lock(&min_rate_lock);
/*min(500,1000),unit:us,也就是
min(up_rate_limit_us,down_rate_limit_us)*/
sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
sg_policy->down_rate_delay_ns);
mutex_unlock(&min_rate_lock);
}
如果3.3函數返回true,則執行3.4/3.5,否則直接返回,不做頻率調整。
3.4 flags不同,如何選擇next_f,即下一個cpu frequency
- flags==SCHED_CPUFREQ_DL,next_f = cpuinfo.max_freq
- 其他flags走下面的,對所有cpu,根據sugov_cpu的util,max,iowait_boost,iowait_boost_max數值選擇所有cpu裏面的max*util最大的一對。每個cpu都有一個util,max,iowait_boost,iowait_boost_max=cpuinfo.max_freq,具體怎麼計算的看下code一目瞭然。比較簡單。在函數sugov_next_freq_shared裏面實現的。
3.4.1 在函數sugov_next_freq_shared裏面會遍歷所有的cpu,遍歷規則如下:
- 在sugov_update_shared函數一開始,我們就獲取了當前cpu的util和max;
- 每次遍歷一個cpu,比較(j_util *max > j_max *util),則util=j_util,max=j_max,目的挑選最大的。max一般都是固定數值,還是選擇cpu最大的util作爲調節頻率的依據,有點像ondemand governor,採集cpuloading,也是選擇比較各個cpuloading最大的作爲調節頻率的依據。
- 這是cpu 的util和max的選擇,還需要根據iowait_boost和iowait_boost_max來確認最終選擇的util和max的數值。iowait boost與正常的util是兩個獨立的分支,需要互相參考挑選最大數值作爲最後的調節頻率的依據。
3.4.2 最後會根據util,max選擇next_f,具體實現在get_next_freq(sg_policy, util, max)
/*
 * get_next_freq - compute the next frequency to request for a policy.
 * @sg_policy: schedutil policy the request is for.
 * @util: utilization selected for the policy.
 * @max: capacity that @util is relative to.
 *
 * Starts from a reference frequency (cpuinfo.max_freq when the
 * architecture is frequency-invariant, the current frequency otherwise),
 * applies the freq_margin tunable and scales the result by @util / @max.
 *
 * If the raw result equals the cached one and a next_freq has already
 * been chosen, the previous next_freq is returned without resolving
 * again. Otherwise the raw value is cached and passed to
 * cpufreq_driver_resolve_freq(), whose result is returned.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = sg_policy->policy;
	unsigned int freq = arch_scale_freq_invariant() ?
				policy->cpuinfo.max_freq : policy->cur;
	/* Set up front; distinguishes big/little cores via min_cap_cpu_mask. */
	int freq_margin = sg_policy->tunables->freq_margin;

	/*
	 * Inflate util for CPUs of the smallest-capacity cluster
	 * (capacity_margin is 1138 on the author's platform).
	 */
	if (cpumask_test_cpu(policy->cpu, &min_cap_cpu_mask))
		util = util * capacity_margin / SCHED_CAPACITY_SCALE;

	if (freq_margin > -100 && freq_margin < 100) {
		/*
		 * Apply the margin as a percentage of freq.
		 *
		 * The original computed "freq * freq_margin" directly; since
		 * freq is unsigned, a negative freq_margin was converted to a
		 * huge unsigned value and the margin came out wrong. Handle
		 * the sign explicitly so that all arithmetic stays unsigned
		 * and truncates toward zero for both signs.
		 */
		unsigned int delta;

		if (freq_margin < 0) {
			delta = freq * (unsigned int)(-freq_margin) / 100;
			freq -= delta;
		} else {
			delta = freq * (unsigned int)freq_margin / 100;
			freq += delta;
		}
		freq = freq * util / max;
	} else {
		/* Margin out of range: fall back to 1.25 * freq. */
		freq = (freq + (freq >> 2)) * util / max;
	}

	/* Same raw frequency as last time: reuse the resolved next_freq. */
	if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
		return sg_policy->next_freq;

	sg_policy->cached_raw_freq = freq;
	/* Let the driver pick the actual target frequency. */
	return cpufreq_driver_resolve_freq(policy, freq);
}
cached_raw_freq保存的是上次計算出的原始頻率值,如果本次計算結果與其一致(且next_freq不是UINT_MAX),就直接返回上次的next_freq,不用再次調用cpufreq_driver_resolve_freq選擇target_freq
3.5 sugov_update_commit(sg_policy, time, next_f)觸發變頻需求
- sugov_up_down_rate_limit這個函數用來作爲頻率調整的判斷依據,比如是否符合升頻的時間限制,降頻的時間限制。
- 根據選擇的next freq數值來修訂slack_timer是否執行
- 如果選擇的next freq==sg_policy->next_freq頻率不做調整
- 更新sg_policy->next_freq=next_freq,sg_policy->last_freq_update_time=time
- 最後設置work_in_progress標誌位爲true,同時執行worker裏面的函數:sugov_irq_work --> sugov_work --> __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,CPUFREQ_RELATION_L);基本上頻率調節結束了。
4.kernel在什麼時候觸發governor去做頻率的調整
以前我們知道interactive/ondemand governor都自帶timer去主動收集cpu loading來做決策是否需要頻率的調整,但是從schedutil governor看,並沒有看到什麼時候主動去計算負載,然後做頻率的調整。
從第二章可以看到,集中點都在這個函數上:cpufreq_update_util,下面是內核中調用它的地方
kernel/sched/fair.c:3163: cpufreq_update_util(rq, 0);
kernel/sched/fair.c:4847: cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
kernel/sched/rt.c:1007: cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
kernel/sched/deadline.c:759: cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
可以看到flags參數分爲三類sched_class,RT(flags=1),DL(flags=2),FAIR(iowait(flags=4) or not iowait(flags=0))
至於在什麼時機調用cpufreq_update_util函數:
對於sched class怎麼去調用,從何處去調用,後面再研究。如有錯誤請及時告知,謝謝。
最後,對於現存的cpufreq governor全部分析完畢:
- powersave
- performance
- userspace
- ondemand
- conservative
- interactive
- schedutil
- sched(schedfreq) https://blog.csdn.net/wukongmingjing/article/details/81949693