linux调度器_第三代cfs(4)_总手稿_完结篇

这是我之前自己写的手稿，在我这里用UE打开时格式有点不一样，所以贴在这里也许有点乱，但大概还是可以看的，有兴趣的朋友可以看看。

前段时间看了那么久的调度器，感觉又忘了差不多了，还是来整理下。
1.先理理还能记下什么：
a.goto在这里很合适，可以生成最优的汇编代码；
b.虚拟时间是个很牛B的东西。
c.第一代，从第一个找到最后一个看哪个优先级高；
第二代，把优先级分成四十个等级，然后从最高的开始找，然而，进程多了以后，粒度不够细，把副教授跟正教授分到了一块；
第三代，很牛。据史料记载，从Linux2.6.23（kernel/sched.c）起一直在用；到现在的4.0.1是否仍在用？已考证：是的（linux-4.0.1\kernel\sched\fair.c）。作者是Ingo Molnar？
d，睡眠的时候，虚拟时间不变，睡醒后为避免一直占用，所以重新定虚拟运行时间，利用红黑树结构。
e，第三代是从系统角度考虑，根据进程对系统的渴望程度来选择进程，而不是像之前的从进程角度考虑，哪个进程优先级高就选择哪个。

好吧，开始跟书跟源码理理先。
wait_runtime ?公平的理论研究
激活调度的两种方法：1，直接的，比如进程打算睡眠或出于其他原因放弃CPU；
2，周期性机制，以固定的频率运行，不时检测是否有必要进行进程切换。
内核支持不同的调度策略：
1，完全公平调度；
2，实时调度；
3，在无事可做时，调度空闲进程。
各个进程的task_struct有几个成员与调度有关：
task_struct()
{
...
int prio, static_prio, normal_prio;
//static_prio静态优先级在进程启动时分配，可以用nice(),sched_setscheduler()修改，否则一直不变
//normal_prio是基于静态优先级和调度策略计算出的优先级。子进程会继承父进程的这个优先级
//prio是调度器考虑的优先级,（前面都是浮云）。
unsigned int rt_priority;//实时进程优先级，最低为0，最大为99。
struct list_head run_list;//循环实时调度器使用，完全公平调度器不用。表头
const struct sched_class *sched_class;//表示所属的调度器类，调度器不限于调度进程还可以调度更大的实体，如组调度。
struct sched_entity se;
unsigned int policy;
//调度策略 5种
/*
 * Scheduling policies
 *
 * Each macro name must be separated from its value by whitespace;
 * otherwise the preprocessor defines an empty macro named e.g.
 * SCHED_NORMAL0 and SCHED_NORMAL itself stays undefined.
 */
#define SCHED_NORMAL	0	/* ordinary time-sharing tasks; the main subject of these notes */
#define SCHED_FIFO	1	/* soft real-time, first-in first-out */
#define SCHED_RR	2	/* soft real-time, round-robin */
#define SCHED_BATCH	3	/* non-interactive, CPU-intensive batch processes */
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE	5	/* rarely used, low importance; relative weight is always minimal */

cpumask_t cpus_allowed;//位域，在多处理器上使用，用来限制进程可以在哪些CPU上运行
unsigned int time_slice;//循环实时调度器使用，完全公平调度器不用。所剩CPU时间段
...
}

调度器类
提供调度器和各个调度方法之间的关联。名字基本都叫出了作用。（不赘述，无非入队，出队等等）一些函数指针等等。
/*
 * Per-class scheduler operations (excerpt).
 *
 * Every scheduling class provides its own set of hooks through this table;
 * classes are linked to one another through ->next.
 */
struct sched_class {
	const struct sched_class *next;	/* next scheduling class in the chain */

	/* Put @p on / take @p off run queue @rq. */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);

	/* ... remaining hooks omitted in this excerpt ... */
};

二解优先级
这里应该算是一个重头戏，怕太美，不忍心看，像件艺术品，像只昂贵的机械手表，从一个齿轮，一根表针，一点点拼装起来，
这个过程本身就是一种享受，但当花了很长时间组装起来后，你看着秒针跳动，齿轮旋转，擦了擦额头的汗水，油污并没有影响笑容。
然后发现唾液不由自主的分泌加快。忍不住咽了两口口水。我还没有开始分解就已经饥渴难耐了。算算时间，听会听力，明天晚上拆分linux内核优先级。

静态优先级
内核使用0-139（包含）来表示内部优先级，值越低优先级越高。0-99给实时进程用，100-139刚好映射进程40个nice值（（-20）-19）给普通进程用。
显然实时进程总是比普通进程优先级高。
//Priority of a process goes from 0..MAX_PRIO-1 使用0-139来表示优先级
//RT priority is 0..MAX_RT_PRIO-1实时进程从0..（100-1）
//This allows kernel threads to set their priority to a value higher than any user task 内核线程可以把优先级设置的比用户进程高
/*
 * Priority range limits. Name and value must be whitespace-separated:
 * "MAX_PRIO(MAX_RT_PRIO + 40)" would otherwise be read as a (malformed)
 * function-like macro.
 */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)	/* 100 + 40 = 140 */
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)	/* 100 + 20 = 120 */

只用静态优先级是不够的，还必须考虑动态优先级（task_struct->prio），普通优先级task_struct->normal_prio，静态优先级task_struct->static_prio。静态优先级是起点。

计算函数 p->prio = effective_prio(p);

/*
 * effective_prio - refresh p->normal_prio and choose the dynamic priority.
 *
 * If p->prio already lies in the real-time range (an RT task, or a task
 * that was boosted to RT priority) it is returned unchanged; otherwise the
 * freshly computed normal priority is returned. The caller stores the
 * result in p->prio.
 */
static int effective_prio(struct task_struct *p)
{
	int base = normal_prio(p);

	p->normal_prio = base;

	/* rt_prio() tests the priority value itself, so boosted tasks count too. */
	return rt_prio(p->prio) ? p->prio : base;
}
//计算普通优先级
/*
 * normal_prio - priority derived from the static priority and the policy.
 *
 * Tasks with a real-time policy get MAX_RT_PRIO-1 - rt_priority, i.e. the
 * rt_priority value inverted onto the kernel's "lower is more important"
 * scale. All other tasks get whatever __normal_prio() returns.
 */
static inline int normal_prio(struct task_struct *p)
{
	if (!task_has_rt_policy(p))
		return __normal_prio(p);

	return MAX_RT_PRIO-1 - p->rt_priority;
}

/*
 * __normal_prio - normal priority of a task without a real-time policy.
 *
 * This is just the static priority. Per the author's notes, the separate
 * helper exists for historical reasons: the old O(1) scheduler did a lot of
 * heuristic work here (boosting interactive tasks, penalising
 * non-interactive ones).
 */
static inline int __normal_prio(struct task_struct *p)
{
	int prio = p->static_prio;

	return prio;
}

判断是否为实时进程的两种不同方法
/*
 * rt_prio - is @prio below MAX_RT_PRIO, i.e. in the real-time range?
 *
 * Used by effective_prio(). It tests the priority value rather than the
 * policy, so a task whose priority was temporarily raised into the
 * real-time range is also reported. Returns 1 or 0.
 */
static inline int rt_prio(int prio)
{
	return unlikely(prio < MAX_RT_PRIO) ? 1 : 0;
}

/*
 * task_has_rt_policy - does @p use a real-time scheduling policy?
 *
 * Used by normal_prio(). Unlike rt_prio(), this looks at the task's own
 * policy field, not at its current priority value.
 */
static inline int task_has_rt_policy(struct task_struct *p)
{
	int is_rt = rt_policy(p->policy);

	return is_rt;
}
/*
 * rt_policy - is @policy one of the real-time policies?
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for everything else.
 */
static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO))
		return 1;
	if (unlikely(policy == SCHED_RR))
		return 1;

	return 0;
}

优先级基本上是都算完了：
                      static_prio    normal_prio                       prio
非实时进程            static_prio    static_prio                       static_prio
优先级提高的非实时    static_prio    static_prio                       prio不变
实时进程              static_prio    MAX_RT_PRIO-1 - p->rt_priority    prio不变

进程的重要性，不仅要考虑优先级，还要考虑task_struct->se.load的负荷权重

权重
/*
 * Load weight of a schedulable entity.
 */
struct load_weight {
	/* The load weight itself. */
	unsigned long weight;
	/*
	 * Companion value used to compute "divide by weight" results;
	 * kept separately because 1/weight cannot be stored directly in
	 * an integer of type long.
	 */
	unsigned long inv_weight;
};
进程每降低一个nice值，则多获得10%的CPU时间，每升高一个nice值则放弃10%的时间。
此处说道nice值，估计仅仅在非实时进程中有用吧？

/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
/*
 * Load weight per nice level. Index = nice + 20, so entry 0 is nice -20,
 * entry 20 (nice 0) is 1024 and entry 39 is nice 19. Neighbouring entries
 * differ by a factor of roughly 1.25 (e.g. 1024 / 820).
 */
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
/*
 * Precomputed 2^32 / weight per nice level, indexed like prio_to_weight[]
 * (index = nice + 20). Example: the nice-0 entry is 2^32 / 1024 = 4194304.
 */
static const u32 prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

//此段大意，进程的nice值下降一级，那么将多获得10%的CPU时间，而这个10%中有一个相对(relative)的概念
//来举个例子吧，只有AB两个进程在运行，nice值原本都是0，可知权值load都为1024，
//则A进程占1024/（1024+1024）=50%的CPU，想象下若要拉开10%的差距，此消彼长,则A=55%，B=45%；
//若此时A的nice值不变，即权值不变，B的nice值上升一级，权值该变多少呢？这就是我们这个权值表的由来了。
//1024/(1024+B的权值) ≈ 55%；计算出来B的权值 = 1024/0.55 - 1024 ≈ 838，可见与820相差并不大。
//而内核直接取1.25这个基数：1/（1+1.25）≈ 0.4444，即B约占44.4%、A约占55.6%，近似就是我们上面的情况了。
//应该是从0这个nice值对应1024分别向两边扩展的

转换代码要考虑实时进程：实时进程的权重是普通进程最大权重（nice值-20对应的权重）的2倍；SCHED_IDLE进程权重总是非常小，前面也提到过

/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
* each task makes to its run queue's load is weighted according to its
* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
* scaled version of the new time slice allocation that they receive on time
* slice expiry etc.
*/
//最后一句说：对于SCHED_NORMAL（普通进程）来说，这个权值只是它们在时间片到期等情况下所获得的新时间片分配量按比例缩放（scaled）后的版本。

/*
 * Weight and precomputed inverse (2^32 / weight) used for SCHED_IDLE tasks.
 *
 * The names must be whitespace-separated from their values. The shift is
 * done on an unsigned long because 1 << 31 overflows a 32-bit signed int.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1UL << 31)

/*
 * set_load_weight - fill in p->se.load from the task's policy and priority.
 *
 *  - real-time policy: twice the largest table weight (the nice -20 entry),
 *    with the precomputed inverse halved accordingly;
 *  - SCHED_IDLE: the fixed minimal weight;
 *  - anything else: table lookup indexed by static_prio - MAX_RT_PRIO.
 */
static void set_load_weight(struct task_struct *p)
{
	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
	} else if (p->policy == SCHED_IDLE) {
		/* SCHED_IDLE tasks get minimal weight. */
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
	} else {
		/* Ordinary task: both tables share the same index. */
		int idx = p->static_prio - MAX_RT_PRIO;

		p->se.load.weight = prio_to_weight[idx];
		p->se.load.inv_weight = prio_to_wmult[idx];
	}
}
/*
 * inc_nr_running - account for a task being added to a run queue.
 *
 * Called whenever a process is put on the run queue: bumps the count of
 * runnable tasks on @rq and, through inc_load(), adds the task's weight to
 * the run queue's load.
 */
static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
	rq->nr_running += 1;
	inc_load(rq, p);
}

核心调度器
1.周期性调度器
如果当前进程应该被重新调度，那么会在task_struct中设置TIF_NEED_RESCHED标志
2.核心调度器
__sched schedule（）{}； //__sched这个函数前缀用于可能调用schedule（）函数的函数

完全公平类调度（重要）

核心调度器知道有关完全公平调度器的所有信息
/*
* All the scheduling class methods:
*
* Operations table of the completely fair scheduling class. This is an
* excerpt: the "..." below stands for the hooks left out of these notes.
*/
static const struct sched_class fair_sched_class = {
.next = &idle_sched_class, /* next class in the chain: the idle class */
.enqueue_task = enqueue_task_fair, /* fair-class enqueue hook */
.dequeue_task = dequeue_task_fair, /* fair-class dequeue hook */
...
}

CFS的数据结构
/* CFS-related fields in a runqueue (excerpt; "..." marks omitted fields) */
struct cfs_rq
{
struct load_weight load;
unsigned long nr_running;// number of runnable processes on this queue

u64 min_vruntime;// tracks the minimum virtual runtime of all processes on the queue; may be larger than the vruntime of the leftmost node of the red-black tree

struct rb_root tasks_timeline;// red-black tree, ordered by time, that manages all processes
...// some group-scheduling fields omitted

}
完全公平类算法依赖于虚拟时钟，但在数据结构中并没有这个变量，是因为虚拟时钟可以根据实际时钟跟负荷权重算出来。所以命名为虚拟时钟。
计算虚拟时钟的函数是update_curr()

/*
 * update_curr - charge the running entity for the CPU time it has used.
 *
 * Measures the time elapsed on the run queue clock since curr->exec_start,
 * passes it to __update_curr() (which accounts both the real and the
 * virtual runtime), then restarts the measurement at the current clock.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *running = cfs_rq->curr;
	u64 clock_now = rq_of(cfs_rq)->clock;
	unsigned long elapsed;

	/* Nothing is executing on this queue: nothing to account. */
	if (unlikely(!running))
		return;

	/*
	 * Amount of time the current task was running since the last
	 * update (per the original comment, this cannot overflow on
	 * 32 bits).
	 */
	elapsed = (unsigned long)(clock_now - running->exec_start);

	__update_curr(cfs_rq, running, elapsed);
	running->exec_start = clock_now;
}

__update_curr(cfs_rq, curr, delta_exec)
__update_curr()
{
unsigned long delta_exec_weighted;
u64 vruntime;

schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

curr->sum_exec_runtime += delta_exec;//物理时间比较好算，直解把时间差加进来就可以了
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = delta_exec;// 对于运行在nice级别0的进程来说，定义虚拟时间权重和物理时间是的等等
if (unlikely(curr->load.weight != NICE_0_LOAD)) {
delta_exec_weighted = calc_delta_fair(delta_exec_weighted,// 计算其他nice值得，小块执行权值
&curr->load);
}
curr->vruntime += delta_exec_weighted;

/*
* maintain cfs_rq->min_vruntime to be a monotonic increasing
* value tracking the leftmost vruntime in the tree.
*/
if (first_fair(cfs_rq)) {
vruntime = min_vruntime(curr->vruntime,
__pick_next_entity(cfs_rq)->vruntime);
} else
vruntime = curr->vruntime;

cfs_rq->min_vruntime =
max_vruntime(cfs_rq->min_vruntime, vruntime);//确保min_vruntime 只会增加不会减少
}

参考了《深入理解linux内核架构》
calc_delta_fair(delta_exec_weighted, &curr->load);
delta_exec
calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
{
有点绕哦；
}
delta_exec_weighted = delta_exec * NICE_0_LOAD/curr->load.weight;
//需指定越重要的进程权值越大，那么 delta_exec_weighted 就小，即虚拟运行时间curr->vruntime += delta_exec_weighted;就增加慢。
//注意这都是针对非实时进程的，以上算是证明了越重要的进程，虚拟运行时间增加的越慢，那么就越靠近左边，下次运行机会大。

//那么原始的curr->exec_start 在哪里设置呢？
delta_exec = (unsigned long)(now - curr->exec_start);
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock;//rq_of是一个辅助函数，用去确定与CFS就绪队列相关的struct rq实例，看样子这个clock设置进程开始的时间
}
//然后delta_exec = (unsigned long)(now - curr->exec_start);
//更新一下后curr->exec_start = now;

//内核设置min_vruntime必须保证该值是单调递增的。

核心思路来了：红黑树的排序过程是根据下列键进行排序的
/*
 * entity_key - sort key of @se in the red-black tree.
 *
 * The key is the entity's virtual runtime relative to the queue's
 * min_vruntime, returned as a signed 64-bit value.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	s64 key = se->vruntime - cfs_rq->min_vruntime;

	return key;
}
//键值较小的结点，排序的位置越靠左，因此会被更快地调度。
1.在进程运行时，其vruntime稳定地增加，它在红黑树中总是向右移动。
2.如果进程进入睡眠，则vruntime保持不变。因为每个队列的min_vruntime保持增加，所以睡醒后，它在红黑树中的位置会靠左，因为键值减小了。

//周期性调度器
//电量不足的时候，可关闭
主要作用：
1.管理调度相关的统计量
2.激活负责当前进程的调度类的周期性调度方法。

完全公平调度器中不再存在所谓时间片概念，这个运行时间是变化的，跟权值，可运行进程数目都有关系
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.

/*
 * Periodic scheduler tick. As written below: takes rq->lock, refreshes the
 * run queue clock and forces it forward to at least the previous tick
 * timestamp plus TICK_NSEC, updates the CPU load statistics, and calls the
 * task_tick hook of the current task's scheduling class unless the idle
 * task is running.
 */
void scheduler_tick(void)// overall much shorter than in the second-generation scheduler
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
u64 next_tick = rq->tick_timestamp + TICK_NSEC;

spin_lock(&rq->lock);
__update_rq_clock(rq);
/*
* Let rq->clock advance by at least TICK_NSEC:
*/
if (unlikely(rq->clock < next_tick))
rq->clock = next_tick;
rq->tick_timestamp = rq->clock;// update the tick timestamp
update_cpu_load(rq);
if (curr != rq->idle) /* FIXME: needed? */
curr->sched_class->task_tick(rq, curr);// implementation depends on the scheduling class: task_tick_fair, task_tick_idle or task_tick_rt
spin_unlock(&rq->lock);// task_tick_fair is examined first

}

//先看task_tick_fair，形式上俺负责
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se);//实际上交由本函数负责
}
}
/*
 * entity_tick - the function that does the actual per-tick work.
 *
 * Brings the current entity's statistics up to date, then runs the tick
 * preemption check when more than one entity is runnable or the
 * WAKEUP_PREEMPT feature is off; otherwise nothing more is done.
 */
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/* Update run-time statistics of the 'current'. */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running <= 1 && sched_feat(WAKEUP_PREEMPT))
		return;

	check_preempt_tick(cfs_rq, curr);
}
/*
 * check_preempt_tick - make sure no task runs longer than its share.
 *
 * Compares the time the entity has run since prev_sum_exec_runtime was
 * recorded against the ideal runtime returned by sched_slice(); if the
 * share is exceeded, the run queue's current task is marked for
 * rescheduling.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long slice = sched_slice(cfs_rq, curr);
	unsigned long ran = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

	/* Still within the ideal runtime: leave the task alone. */
	if (ran <= slice)
		return;

	resched_task(rq_of(cfs_rq)->curr);
}

/*
 * __sched_period - length of the period in which each task runs once.
 *
 * The period starts at sysctl_sched_latency. When there are more runnable
 * tasks than sched_nr_latency it is stretched proportionally, because the
 * individual slices would otherwise get too small:
 *
 *   p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	/* Common case: few enough tasks, the base latency is the period. */
	if (!unlikely(nr_running > nr_latency))
		return period;

	period *= nr_running;
	do_div(period, nr_latency);	/* divides period in place */

	return period;
}

实时调度类
实时进程与普通进程有一个根本不同之处：如果系统有一个实时进程且可运行，那么调度器总会选择它运行，除非有一个优先级更高的实时进程。
循环进程（SCHED_RR）。
先进先出（SCHED_FIFO）.

比较简单，就是选优先级比较高的运行。

前段时间看了那么久的调度器，感觉又忘了差不多了，还是来整理下。

1.先理理还能记下什么：
a.goto在这里很合适，可以生成最优的汇编代码；
b.虚拟时间是个很牛B的东西。
c.第一代，从第一个找到最后一个看哪个优先级高；
第二代，把优先级分成四十个等级，然后从最高的开始找，然而，进程多了以后，粒度不够细，把副教授跟正教授分到了一块；
第三代，很牛。据史料记载，从Linux2.6.23（kernel/sched.c）起一直在用；到现在的4.0.1是否仍在用？已考证：是的（linux-4.0.1\kernel\sched\fair.c）。作者是Ingo Molnar？
d，睡眠的时候，虚拟时间不变，睡醒后为避免一直占用，所以重新定虚拟运行时间，利用红黑树结构。
e，第三代是从系统角度考虑，根据进程对系统的渴望程度来选择进程，而不是像之前的从进程角度考虑，哪个进程优先级高就选择哪个。

好吧，开始跟书跟源码理理先。
wait_runtime ?公平的理论研究
激活调度的两种方法：1，直接的，比如进程打算睡眠或出于其他原因放弃CPU；
2，周期性机制，以固定的频率运行，不时检测是否有必要进行进程切换。
内核支持不同的调度策略：
1，完全公平调度；
2，实时调度；
3，在无事可做时，调度空闲进程。
各个进程的task_struct有几个成员与调度有关：
task_struct()
{
...
int prio, static_prio, normal_prio;
//static_prio静态优先级在进程启动时分配，可以用nice(),sched_setscheduler()修改，否则一直不变
//normal_prio是基于静态优先级和调度策略计算出的优先级。子进程会继承父进程的这个优先级
//prio是调度器考虑的优先级,（前面都是浮云）。
unsigned int rt_priority;//实时进程优先级，最低为0，最大为99。
struct list_head run_list;//循环实时调度器使用，完全公平调度器不用。表头
const struct sched_class *sched_class;//表示所属的调度器类，调度器不限于调度进程还可以调度更大的实体，如组调度。
struct sched_entity se;
unsigned int policy;
//调度策略 5种
/*
 * Scheduling policies
 *
 * Each macro name must be separated from its value by whitespace;
 * otherwise the preprocessor defines an empty macro named e.g.
 * SCHED_NORMAL0 and SCHED_NORMAL itself stays undefined.
 */
#define SCHED_NORMAL	0	/* ordinary time-sharing tasks; the main subject of these notes */
#define SCHED_FIFO	1	/* soft real-time, first-in first-out */
#define SCHED_RR	2	/* soft real-time, round-robin */
#define SCHED_BATCH	3	/* non-interactive, CPU-intensive batch processes */
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE	5	/* rarely used, low importance; relative weight is always minimal */

cpumask_t cpus_allowed;//位域，在多处理器上使用，用来限制进程可以在哪些CPU上运行
unsigned int time_slice;//循环实时调度器使用，完全公平调度器不用。所剩CPU时间段
...
}

调度器类
提供调度器和各个调度方法之间的关联。名字基本都叫出了作用。（不赘述，无非入队，出队等等）一些函数指针等等。
/*
 * Per-class scheduler operations (excerpt).
 *
 * Every scheduling class provides its own set of hooks through this table;
 * classes are linked to one another through ->next.
 */
struct sched_class {
	const struct sched_class *next;	/* next scheduling class in the chain */

	/* Put @p on / take @p off run queue @rq. */
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);

	/* ... remaining hooks omitted in this excerpt ... */
};

二解优先级
这里应该算是一个重头戏，怕太美，不忍心看，像件艺术品，像只昂贵的机械手表，从一个齿轮，一根表针，一点点拼装起来，
这个过程本身就是一种享受，但当花了很长时间组装起来后，你看着秒针跳动，齿轮旋转，擦了擦额头的汗水，油污并没有影响笑容。
然后发现唾液不由自主的分泌加快。忍不住咽了两口口水。我还没有开始分解就已经饥渴难耐了。算算时间，听会听力，明天晚上拆分linux内核优先级。

静态优先级
内核使用0-139（包含）来表示内部优先级，值越低优先级越高。0-99给实时进程用，100-139刚好映射进程40个nice值（（-20）-19）给普通进程用。
显然实时进程总是比普通进程优先级高。
//Priority of a process goes from 0..MAX_PRIO-1 使用0-139来表示优先级
//RT priority is 0..MAX_RT_PRIO-1实时进程从0..（100-1）
//This allows kernel threads to set their priority to a value higher than any user task 内核线程可以把优先级设置的比用户进程高
/*
 * Priority range limits. Name and value must be whitespace-separated:
 * "MAX_PRIO(MAX_RT_PRIO + 40)" would otherwise be read as a (malformed)
 * function-like macro.
 */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)	/* 100 + 40 = 140 */
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)	/* 100 + 20 = 120 */

只用静态优先级是不够的，还必须考虑动态优先级（task_struct->prio），普通优先级task_struct->normal_prio，静态优先级task_struct->static_prio。静态优先级是起点。

计算函数 p->prio = effective_prio(p);

/*
 * effective_prio - refresh p->normal_prio and choose the dynamic priority.
 *
 * If p->prio already lies in the real-time range (an RT task, or a task
 * that was boosted to RT priority) it is returned unchanged; otherwise the
 * freshly computed normal priority is returned. The caller stores the
 * result in p->prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);

	/* Not in the real-time range: dynamic priority = normal priority. */
	if (!rt_prio(p->prio))
		return p->normal_prio;

	/* RT task or boosted task: keep the current dynamic priority. */
	return p->prio;
}
//计算普通优先级
/*
 * normal_prio - priority derived from the static priority and the policy.
 *
 * Tasks with a real-time policy get MAX_RT_PRIO-1 - rt_priority, i.e. the
 * rt_priority value inverted onto the kernel's "lower is more important"
 * scale. All other tasks get whatever __normal_prio() returns.
 */
static inline int normal_prio(struct task_struct *p)
{
	if (!task_has_rt_policy(p))
		return __normal_prio(p);

	return MAX_RT_PRIO-1 - p->rt_priority;
}

/*
 * __normal_prio - normal priority of a task without a real-time policy.
 *
 * This is just the static priority. Per the author's notes, the separate
 * helper exists for historical reasons: the old O(1) scheduler did a lot of
 * heuristic work here (boosting interactive tasks, penalising
 * non-interactive ones).
 */
static inline int __normal_prio(struct task_struct *p)
{
	int prio = p->static_prio;

	return prio;
}

判断是否为实时进程的两种不同方法
/*
 * rt_prio - is @prio below MAX_RT_PRIO, i.e. in the real-time range?
 *
 * Used by effective_prio(). It tests the priority value rather than the
 * policy, so a task whose priority was temporarily raised into the
 * real-time range is also reported. Returns 1 or 0.
 */
static inline int rt_prio(int prio)
{
	return unlikely(prio < MAX_RT_PRIO) ? 1 : 0;
}

/*
 * task_has_rt_policy - does @p use a real-time scheduling policy?
 *
 * Used by normal_prio(). Unlike rt_prio(), this looks at the task's own
 * policy field, not at its current priority value.
 */
static inline int task_has_rt_policy(struct task_struct *p)
{
	int is_rt = rt_policy(p->policy);

	return is_rt;
}
/*
 * rt_policy - is @policy one of the real-time policies?
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for everything else.
 */
static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO))
		return 1;
	if (unlikely(policy == SCHED_RR))
		return 1;

	return 0;
}

优先级基本上是都算完了：
                      static_prio    normal_prio                       prio
非实时进程            static_prio    static_prio                       static_prio
优先级提高的非实时    static_prio    static_prio                       prio不变
实时进程              static_prio    MAX_RT_PRIO-1 - p->rt_priority    prio不变

进程的重要性，不仅要考虑优先级，还要考虑task_struct->se.load的负荷权重

权重
/*
 * Load weight of a schedulable entity.
 */
struct load_weight {
	/* The load weight itself. */
	unsigned long weight;
	/*
	 * Companion value used to compute "divide by weight" results;
	 * kept separately because 1/weight cannot be stored directly in
	 * an integer of type long.
	 */
	unsigned long inv_weight;
};
进程每降低一个nice值，则多获得10%的CPU时间，每升高一个nice值则放弃10%的时间。
此处说道nice值，估计仅仅在非实时进程中有用吧？

/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
* nice 1, it will get ~10% less CPU time than another CPU-bound task
* that remained on nice 0.
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
* If a task goes up by ~10% and another task goes down by ~10% then
* the relative distance between them is ~25%.)
*/
/*
 * Load weight per nice level. Index = nice + 20, so entry 0 is nice -20,
 * entry 20 (nice 0) is 1024 and entry 39 is nice 19. Neighbouring entries
 * differ by a factor of roughly 1.25 (e.g. 1024 / 820).
 */
static const int prio_to_weight[40] = {
/* -20 */ 88761, 71755, 56483, 46273, 36291,
/* -15 */ 29154, 23254, 18705, 14949, 11916,
/* -10 */ 9548, 7620, 6100, 4904, 3906,
/* -5 */ 3121, 2501, 1991, 1586, 1277,
/* 0 */ 1024, 820, 655, 526, 423,
/* 5 */ 335, 272, 215, 172, 137,
/* 10 */ 110, 87, 70, 56, 45,
/* 15 */ 36, 29, 23, 18, 15,
};
/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
* In cases where the weight does not change often, we can use the
* precalculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
/*
 * Precomputed 2^32 / weight per nice level, indexed like prio_to_weight[]
 * (index = nice + 20). Example: the nice-0 entry is 2^32 / 1024 = 4194304.
 */
static const u32 prio_to_wmult[40] = {
/* -20 */ 48388, 59856, 76040, 92818, 118348,
/* -15 */ 147320, 184698, 229616, 287308, 360437,
/* -10 */ 449829, 563644, 704093, 875809, 1099582,
/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

//此段大意，进程的nice值下降一级，那么将多获得10%的CPU时间，而这个10%中有一个相对(relative)的概念
//来举个例子吧，只有AB两个进程在运行，nice值原本都是0，可知权值load都为1024，
//则A进程占1024/（1024+1024）=50%的CPU，想象下若要拉开10%的差距，此消彼长,则A=55%，B=45%；
//若此时A的nice值不变，即权值不变，B的nice值上升一级，权值该变多少呢？这就是我们这个权值表的由来了。
//1024/(1024+B的权值) ≈ 55%；计算出来B的权值 = 1024/0.55 - 1024 ≈ 838，可见与820相差并不大。
//而内核直接取1.25这个基数：1/（1+1.25）≈ 0.4444，即B约占44.4%、A约占55.6%，近似就是我们上面的情况了。
//应该是从0这个nice值对应1024分别向两边扩展的

转换代码要考虑实时进程：实时进程的权重是普通进程最大权重（nice值-20对应的权重）的2倍；SCHED_IDLE进程权重总是非常小，前面也提到过

/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
* each task makes to its run queue's load is weighted according to its
* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
* scaled version of the new time slice allocation that they receive on time
* slice expiry etc.
*/
//最后一句说：对于SCHED_NORMAL（普通进程）来说，这个权值只是它们在时间片到期等情况下所获得的新时间片分配量按比例缩放（scaled）后的版本。

/*
 * Weight and precomputed inverse (2^32 / weight) used for SCHED_IDLE tasks.
 *
 * The names must be whitespace-separated from their values. The shift is
 * done on an unsigned long because 1 << 31 overflows a 32-bit signed int.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1UL << 31)

/*
 * set_load_weight - fill in p->se.load from the task's policy and priority.
 *
 *  - real-time policy: twice the largest table weight (the nice -20 entry),
 *    with the precomputed inverse halved accordingly;
 *  - SCHED_IDLE: the fixed minimal weight;
 *  - anything else: table lookup indexed by static_prio - MAX_RT_PRIO.
 */
static void set_load_weight(struct task_struct *p)
{
	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
	} else if (p->policy == SCHED_IDLE) {
		/* SCHED_IDLE tasks get minimal weight. */
		p->se.load.weight = WEIGHT_IDLEPRIO;
		p->se.load.inv_weight = WMULT_IDLEPRIO;
	} else {
		/* Ordinary task: both tables share the same index. */
		int idx = p->static_prio - MAX_RT_PRIO;

		p->se.load.weight = prio_to_weight[idx];
		p->se.load.inv_weight = prio_to_wmult[idx];
	}
}
/*
 * inc_nr_running - account for a task being added to a run queue.
 *
 * Called whenever a process is put on the run queue: bumps the count of
 * runnable tasks on @rq and, through inc_load(), adds the task's weight to
 * the run queue's load.
 */
static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
	rq->nr_running += 1;
	inc_load(rq, p);
}

核心调度器
1.周期性调度器
如果当前进程应该被重新调度，那么会在task_struct中设置TIF_NEED_RESCHED标志
2.核心调度器
__sched schedule（）{}； //__sched这个函数前缀用于可能调用schedule（）函数的函数

完全公平类调度（重要）

核心调度器知道有关完全公平调度器的所有信息
/*
* All the scheduling class methods:
*
* Operations table of the completely fair scheduling class. This is an
* excerpt: the "..." below stands for the hooks left out of these notes.
*/
static const struct sched_class fair_sched_class = {
.next = &idle_sched_class, /* next class in the chain: the idle class */
.enqueue_task = enqueue_task_fair, /* fair-class enqueue hook */
.dequeue_task = dequeue_task_fair, /* fair-class dequeue hook */
...
}

CFS的数据结构
/* CFS-related fields in a runqueue (excerpt; "..." marks omitted fields) */
struct cfs_rq
{
struct load_weight load;
unsigned long nr_running;// number of runnable processes on this queue

u64 min_vruntime;// tracks the minimum virtual runtime of all processes on the queue; may be larger than the vruntime of the leftmost node of the red-black tree

struct rb_root tasks_timeline;// red-black tree, ordered by time, that manages all processes
...// some group-scheduling fields omitted

}
完全公平类算法依赖于虚拟时钟，但在数据结构中并没有这个变量，是因为虚拟时钟可以根据实际时钟跟负荷权重算出来。所以命名为虚拟时钟。
计算虚拟时钟的函数是update_curr()

/*
 * update_curr - charge the running entity for the CPU time it has used.
 *
 * Measures the time elapsed on the run queue clock since curr->exec_start,
 * passes it to __update_curr() (which accounts both the real and the
 * virtual runtime), then restarts the measurement at the current clock.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *running = cfs_rq->curr;
	u64 clock_now = rq_of(cfs_rq)->clock;
	unsigned long elapsed;

	/* Nothing is executing on this queue: nothing to account. */
	if (unlikely(!running))
		return;

	/*
	 * Amount of time the current task was running since the last
	 * update (per the original comment, this cannot overflow on
	 * 32 bits).
	 */
	elapsed = (unsigned long)(clock_now - running->exec_start);

	__update_curr(cfs_rq, running, elapsed);
	running->exec_start = clock_now;
}

__update_curr(cfs_rq, curr, delta_exec)
__update_curr()
{
unsigned long delta_exec_weighted;
u64 vruntime;

schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

curr->sum_exec_runtime += delta_exec;//物理时间比较好算，直解把时间差加进来就可以了
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = delta_exec;// 对于运行在nice级别0的进程来说，定义虚拟时间权重和物理时间是的等等
if (unlikely(curr->load.weight != NICE_0_LOAD)) {
delta_exec_weighted = calc_delta_fair(delta_exec_weighted,// 计算其他nice值得，小块执行权值
&curr->load);
}
curr->vruntime += delta_exec_weighted;

/*
* maintain cfs_rq->min_vruntime to be a monotonic increasing
* value tracking the leftmost vruntime in the tree.
*/
if (first_fair(cfs_rq)) {
vruntime = min_vruntime(curr->vruntime,
__pick_next_entity(cfs_rq)->vruntime);
} else
vruntime = curr->vruntime;

cfs_rq->min_vruntime =
max_vruntime(cfs_rq->min_vruntime, vruntime);//确保min_vruntime 只会增加不会减少
}

参考了《深入理解linux内核架构》
calc_delta_fair(delta_exec_weighted, &curr->load);
delta_exec
calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
{
有点绕哦；
}
delta_exec_weighted = delta_exec * NICE_0_LOAD/curr->load.weight;
//需指定越重要的进程权值越大，那么 delta_exec_weighted 就小，即虚拟运行时间curr->vruntime += delta_exec_weighted;就增加慢。
//注意这都是针对非实时进程的，以上算是证明了越重要的进程，虚拟运行时间增加的越慢，那么就越靠近左边，下次运行机会大。

//那么原始的curr->exec_start 在哪里设置呢？
delta_exec = (unsigned long)(now - curr->exec_start);
/*
* We are picking a new current task - update its stats:
*/
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* We are starting a new run period:
*/
se->exec_start = rq_of(cfs_rq)->clock;//rq_of是一个辅助函数，用去确定与CFS就绪队列相关的struct rq实例，看样子这个clock设置进程开始的时间
}
//然后delta_exec = (unsigned long)(now - curr->exec_start);
//更新一下后curr->exec_start = now;

//内核设置min_vruntime必须保证该值是单调递增的。

核心思路来了：红黑树的排序过程是根据下列键进行排序的
/*
 * entity_key - sort key of @se in the red-black tree.
 *
 * The key is the entity's virtual runtime relative to the queue's
 * min_vruntime, returned as a signed 64-bit value.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	s64 key = se->vruntime - cfs_rq->min_vruntime;

	return key;
}
//键值较小的结点，排序的位置越靠左，因此会被更快地调度。
1.在进程运行时，其vruntime稳定地增加，它在红黑树中总是向右移动。
2.如果进程进入睡眠，则vruntime保持不变。因为每个队列的min_vruntime保持增加，所以睡醒后，它在红黑树中的位置会靠左，因为键值减小了。

//周期性调度器
//电量不足的时候，可关闭
主要作用：
1.管理调度相关的统计量
2.激活负责当前进程的调度类的周期性调度方法。

完全公平调度器中不再存在所谓时间片概念，这个运行时间是变化的，跟权值，可运行进程数目都有关系
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.

/*
 * Periodic scheduler tick. As written below: takes rq->lock, refreshes the
 * run queue clock and forces it forward to at least the previous tick
 * timestamp plus TICK_NSEC, updates the CPU load statistics, and calls the
 * task_tick hook of the current task's scheduling class unless the idle
 * task is running.
 */
void scheduler_tick(void)// overall much shorter than in the second-generation scheduler
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
u64 next_tick = rq->tick_timestamp + TICK_NSEC;

spin_lock(&rq->lock);
__update_rq_clock(rq);
/*
* Let rq->clock advance by at least TICK_NSEC:
*/
if (unlikely(rq->clock < next_tick))
rq->clock = next_tick;
rq->tick_timestamp = rq->clock;// update the tick timestamp
update_cpu_load(rq);
if (curr != rq->idle) /* FIXME: needed? */
curr->sched_class->task_tick(rq, curr);// implementation depends on the scheduling class: task_tick_fair, task_tick_idle or task_tick_rt
spin_unlock(&rq->lock);// task_tick_fair is examined first

}

//先看task_tick_fair，形式上俺负责
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;

for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se);//实际上交由本函数负责
}
}
/*
 * entity_tick - the function that does the actual per-tick work.
 *
 * Brings the current entity's statistics up to date, then runs the tick
 * preemption check when more than one entity is runnable or the
 * WAKEUP_PREEMPT feature is off; otherwise nothing more is done.
 */
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/* Update run-time statistics of the 'current'. */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running <= 1 && sched_feat(WAKEUP_PREEMPT))
		return;

	check_preempt_tick(cfs_rq, curr);
}
/*
 * check_preempt_tick - make sure no task runs longer than its share.
 *
 * Compares the time the entity has run since prev_sum_exec_runtime was
 * recorded against the ideal runtime returned by sched_slice(); if the
 * share is exceeded, the run queue's current task is marked for
 * rescheduling.
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long slice = sched_slice(cfs_rq, curr);
	unsigned long ran = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

	/* Still within the ideal runtime: leave the task alone. */
	if (ran <= slice)
		return;

	resched_task(rq_of(cfs_rq)->curr);
}

/*
 * __sched_period - length of the period in which each task runs once.
 *
 * The period starts at sysctl_sched_latency. When there are more runnable
 * tasks than sched_nr_latency it is stretched proportionally, because the
 * individual slices would otherwise get too small:
 *
 *   p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	/* Common case: few enough tasks, the base latency is the period. */
	if (!unlikely(nr_running > nr_latency))
		return period;

	period *= nr_running;
	do_div(period, nr_latency);	/* divides period in place */

	return period;
}

实时调度类
实时进程与普通进程有一个根本不同之处：如果系统有一个实时进程且可运行，那么调度器总会选择它运行，除非有一个优先级更高的实时进程。
循环进程（SCHED_RR）。
先进先出（SCHED_FIFO）.

比较简单，就是选优先级比较高的运行。

linux调度器_第三代cfs(4)_总手稿_完结篇

Python实现大麦网抢票的四大关键技术点解析

salesforce零基础学习（一百三十八）零碎知识点小总结（十）

條款25:考慮寫一個不拋一場的swap函數

k8s學習記錄1_組件說明

k8s學習記錄3_daemonSet, job, 服務發現

c++知識點_lambda的好處

Xgboost的優點分析

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結