The previous posts mainly looked at what the kernel does, scheduling-wise, when a new process is created. Now let's look at what happens on a timer interrupt. When the timer interrupt fires, the interrupt-handling path eventually calls scheduler_tick():
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();          /* id of the current cpu */
    struct rq *rq = cpu_rq(cpu);           /* the runqueue owned by that cpu; both come from per-cpu data (a separate post on per-cpu variables is planned) */
    struct task_struct *curr = rq->curr;   /* the currently running task */

    sched_clock_tick();   /* a no-op unless CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, in which case it keeps the per-cpu scheduler clock in sync; not central to scheduling policy, so not analyzed here */

    spin_lock(&rq->lock); /* more than one clock may be ticking in the system; an idle cpu may "pull" tasks from this runqueue, so take the lock to avoid races */
    update_rq_clock(rq);  /* update the runqueue clock; the implementation is very low-level, roughly reading a cycle counter from a cpu register */
    update_cpu_load(rq);  /* update the cpu load statistics; the implementation is more heuristic than rigorous, revisit it later if needed */
    curr->sched_class->task_tick(rq, curr, 0); /* invoke the tick handler of the scheduling class curr belongs to */
    spin_unlock(&rq->lock);

#ifdef CONFIG_SMP         /* related to load balancing, analyzed in a later post dedicated to it */
    rq->idle_at_tick = idle_cpu(cpu);
    trigger_load_balance(rq, cpu);
#endif
}
For a process that belongs to the CFS scheduling class, the task_tick function is task_tick_fair():
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &curr->se;

    for_each_sched_entity(se) {       /* related to group scheduling; ignoring group scheduling, this loop runs exactly once */
        cfs_rq = cfs_rq_of(se);
        entity_tick(cfs_rq, se, queued); /* the real work happens here */
    }
}
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);   /* update the current task's virtual runtime, analyzed in an earlier post */

#ifdef CONFIG_SCHED_HRTICK /* ignored here */
    /*
     * queued ticks are scheduled to match the slice, so don't bother
     * validating it and just reschedule.
     */
    if (queued) {
        resched_task(rq_of(cfs_rq)->curr);
        return;
    }
    /*
     * don't let the period tick interfere with the hrtick preemption
     */
    if (!sched_feat(DOUBLE_TICK) &&
            hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
        return;
#endif

    if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) /* check whether the current task should be preempted */
        check_preempt_tick(cfs_rq, curr);
}
Now we get to the key part.
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;

    ideal_runtime = sched_slice(cfs_rq, curr); /* how much wall-time the current task is entitled to run */
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) /* if it has already run longer than its allotment, set the resched flag; the switch happens when the tick returns to user space */
        resched_task(rq_of(cfs_rq)->curr);
}
Timer interrupts fire all the time, so couldn't we just store each process's entitled runtime in its task structure and save the recomputation? Older kernels did exactly that, but it has a drawback: a process's timeslice is essentially fixed at creation time (the O(1) scheduler could adjust the timeslice heuristically based on process behaviour, but that does not solve the underlying problem). Recomputing the slice every time distributes it dynamically according to how many processes are on the queue, which is more reasonable: if the number of runnable processes keeps growing, each process's share of the scheduling period clearly has to shrink. The calculation itself is simple, so the code is given without further commentary. But why does it use wall-clock time here? Wouldn't virtual time work?
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    unsigned long nr_running = cfs_rq->nr_running;

    if (unlikely(!se->on_rq))
        nr_running++;

    return calc_delta_weight(__sched_period(nr_running), se);
}
/*
 * delta *= P[w / rw]
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
{
    for_each_sched_entity(se) {
        delta = calc_delta_mine(delta,
                se->load.weight, &cfs_rq_of(se)->load);
    }

    return delta;
}
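To make the arithmetic concrete, here is a minimal standalone sketch of the same s = p*P[w/rw] idea. The constants, function names and weights below are made up for illustration (they only mimic sysctl_sched_latency, sysctl_sched_min_granularity and __sched_period); this is a toy model, not the kernel code.

#include <stdio.h>

/* Hypothetical stand-ins for the kernel tunables (values are made up). */
#define TARGET_LATENCY_NS   20000000ULL  /* "period" when few tasks are runnable */
#define MIN_GRANULARITY_NS   4000000ULL  /* lower bound on one task's slice      */
#define NR_LATENCY          (TARGET_LATENCY_NS / MIN_GRANULARITY_NS)

/* Stretch the period once there are too many tasks to fit the min granularity. */
static unsigned long long sched_period_toy(unsigned int nr_running)
{
    if (nr_running > NR_LATENCY)
        return nr_running * MIN_GRANULARITY_NS;
    return TARGET_LATENCY_NS;
}

/* s = p * w / rw : a task's wall-time slice is its weight share of the period. */
static unsigned long long wall_slice_toy(unsigned int nr_running,
                                         unsigned long weight,
                                         unsigned long rq_weight)
{
    return sched_period_toy(nr_running) * weight / rq_weight;
}

int main(void)
{
    /* Three runnable tasks with weights 1024 (nice 0), 1024 and 2048. */
    unsigned long rq_weight = 1024 + 1024 + 2048;

    printf("nice-0 task slice : %llu ns\n", wall_slice_toy(3, 1024, rq_weight));
    printf("heavier task slice: %llu ns\n", wall_slice_toy(3, 2048, rq_weight));
    return 0;
}

As more tasks are queued, the total weight rw grows (and eventually the period itself is stretched), so each task's wall-time slice shrinks, which is exactly why the slice is recomputed at every tick instead of being stored once at fork time.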
As you can see, tick handling is actually quite simple: update the current task's runtime accounting, compare it with the ideal slice, and if it has overrun, set the resched flag so that a reschedule happens on the return from kernel mode to user mode. Next, let's look at what happens when a process gives up the CPU voluntarily, i.e. calls schedule() explicitly.
/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq *rq;
    int cpu;

need_resched:
    preempt_disable();
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    rcu_qsctr_inc(cpu);   /* per-cpu RCU bookkeeping: records that this cpu has passed through a quiescent state; unrelated to the scheduling policy itself */
    prev = rq->curr;
    switch_count = &prev->nivcsw;

    release_kernel_lock(prev);
need_resched_nonpreemptible:

    schedule_debug(prev);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    spin_lock_irq(&rq->lock);
    update_rq_clock(rq);
    clear_tsk_need_resched(prev);

    if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
        if (unlikely(signal_pending_state(prev->state, prev))) /* a signal is pending for the task, so give it another chance and leave it on the runqueue */
            prev->state = TASK_RUNNING;
        else
            deactivate_task(rq, prev, 1); /* remove the task from the runqueue */
        switch_count = &prev->nvcsw;
    }

#ifdef CONFIG_SMP
    if (prev->sched_class->pre_schedule)  /* the cfs scheduling class does not provide this hook */
        prev->sched_class->pre_schedule(rq, prev);
#endif

    if (unlikely(!rq->nr_running))        /* if the runqueue is now empty, run a balancing pass */
        idle_balance(cpu, rq);

    prev->sched_class->put_prev_task(rq, prev);
    next = pick_next_task(rq, prev);

    if (likely(prev != next)) {
        sched_info_switch(prev, next);

        rq->nr_switches++;
        rq->curr = next;
        ++*switch_count;

        context_switch(rq, prev, next); /* unlocks the rq */
        /*
         * the context switch might have flipped the stack from under
         * us, hence refresh the local variables.
         */
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
    } else
        spin_unlock_irq(&rq->lock);

    if (unlikely(reacquire_kernel_lock(current) < 0))
        goto need_resched_nonpreemptible;

    preempt_enable_no_resched();
    if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
        goto need_resched;
}
EXPORT_SYMBOL(schedule);
The code of deactivate_task is as follows:
/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
{
    if (p->state == TASK_UNINTERRUPTIBLE)
        rq->nr_uninterruptible++;  /* update statistics */

    dequeue_task(rq, p, sleep);    /* call the corresponding scheduling class hook to remove the task */
    dec_nr_running(p, rq);         /* update the count of runnable tasks */
}
And dequeue_task:
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
{
    if (sleep && p->se.last_wakeup) {  /* revisit this when analyzing the wakeup path */
        update_avg(&p->se.avg_overlap,
               p->se.sum_exec_runtime - p->se.last_wakeup);
        p->se.last_wakeup = 0;
    }

    sched_info_dequeued(p);            /* conditionally compiled; does not affect scheduling behaviour */
    p->sched_class->dequeue_task(rq, p, sleep); /* the actual removal */
    p->se.on_rq = 0;                   /* mark the task as no longer on the runqueue */
}
For the CFS scheduling class, dequeue_task corresponds to dequeue_task_fair (the naming convention makes that easy to guess):
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    for_each_sched_entity(se) {        /* ignoring group scheduling, this loop runs exactly once */
        cfs_rq = cfs_rq_of(se);
        dequeue_entity(cfs_rq, se, sleep); /* finally, the actual removal */
        /* Don't dequeue parent if it has other entities besides us */
        if (cfs_rq->load.weight)
            break;
        sleep = 1;
    }

    hrtick_update(rq);
}
Going one level deeper, here is dequeue_entity:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);               /* update the runtime accounting */

    update_stats_dequeue(cfs_rq, se);  /* update some per-entity statistics; analyze later if needed */
    if (sleep) {
#ifdef CONFIG_SCHEDSTATS
        if (entity_is_task(se)) {
            struct task_struct *tsk = task_of(se);

            if (tsk->state & TASK_INTERRUPTIBLE)
                se->sleep_start = rq_of(cfs_rq)->clock;
            if (tsk->state & TASK_UNINTERRUPTIBLE)
                se->block_start = rq_of(cfs_rq)->clock;
        }
#endif
    }

    clear_buddies(cfs_rq, se);

    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
    account_entity_dequeue(cfs_rq, se); /* mainly updates load accounting */
    update_min_vruntime(cfs_rq);
}
clear_buddies ties into other concepts and will be analyzed separately. The key lines are these:
    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
Hold on: isn't se exactly the prev passed in from above, i.e. the task currently running on this CPU, and yet it is only removed from the rbtree if it is not cfs_rq->curr? The reason is that CFS never keeps the running entity in the rbtree: when an entity is picked it is taken out of the tree (see set_next_entity() below), so if se is the current entity there is nothing left to erase. After the removal, the cpu load and the cfs queue's load are updated, the queue's runnable count is decremented, and the removed entity is marked as no longer on the runqueue:
static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    update_load_sub(&cfs_rq->load, se->load.weight);
    if (!parent_entity(se))
        dec_cpu_load(rq_of(cfs_rq), se->load.weight);
    if (entity_is_task(se)) {
        add_cfs_task_weight(cfs_rq, -se->load.weight);
        list_del_init(&se->group_node);
    }
    cfs_rq->nr_running--;
    se->on_rq = 0;
}
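The se != cfs_rq->curr test becomes clearer with a toy model of the underlying invariant: the running entity leaves the tree the moment it is picked, so dequeueing the current entity has nothing to erase. Everything in the sketch below is invented for illustration and only mimics the shape of set_next_entity() and dequeue_entity(); it is not the kernel's implementation.

#include <stdio.h>
#include <stdbool.h>

/* Toy entity: we only track whether it currently sits in the "tree". */
struct toy_se { bool in_tree; };

struct toy_cfs_rq { struct toy_se *curr; };

/* Mirrors the effect of set_next_entity(): take se out of the tree. */
void toy_pick(struct toy_cfs_rq *rq, struct toy_se *se)
{
    se->in_tree = false;        /* __dequeue_entity() in the real code */
    rq->curr = se;
}

/* Mirrors the quoted check in dequeue_entity(). */
void toy_dequeue(struct toy_cfs_rq *rq, struct toy_se *se)
{
    if (se != rq->curr)
        se->in_tree = false;    /* only waiting entities live in the tree */
}

int main(void)
{
    struct toy_se a = { .in_tree = true }, b = { .in_tree = true };
    struct toy_cfs_rq rq = { .curr = NULL };

    toy_pick(&rq, &a);          /* a starts running and leaves the tree   */
    toy_dequeue(&rq, &a);       /* nothing to erase: a is not in the tree */
    toy_dequeue(&rq, &b);       /* b was waiting, so it gets erased       */
    printf("a in tree: %d, b in tree: %d\n", a.in_tree, b.in_tree);
    return 0;
}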
After that long detour, back to schedule(): once the current task has been removed from the runqueue, a load-balance pass between CPUs runs if the queue is now empty, and then the following statement executes:
prev->sched_class->put_prev_task(rq, prev);
Again, for the CFS scheduling class, put_prev_task corresponds to put_prev_task_fair:
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
    struct sched_entity *se = &prev->se;
    struct cfs_rq *cfs_rq;

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        put_prev_entity(cfs_rq, se);
    }
}
Ignoring group scheduling again, into put_prev_entity:
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
    /*
     * If still on the runqueue then deactivate_task()
     * was not called and update_curr() has to be done:
     */
    if (prev->on_rq)
        update_curr(cfs_rq);

    check_spread(cfs_rq, prev);
    if (prev->on_rq) {
        update_stats_wait_start(cfs_rq, prev);
        /* Put 'current' back into the tree. */
        __enqueue_entity(cfs_rq, prev);
    }
    cfs_rq->curr = NULL;
}
According to the analysis above, in this scenario prev->on_rq is already zero at this point, so the function effectively just sets cfs_rq->curr to NULL. Back on the main path of schedule(): the current task has been removed, so the next step is to pick a new task to run.
    prev->sched_class->put_prev_task(rq, prev);
    next = pick_next_task(rq, prev);
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: we know that if all tasks are in
     * the fair class we can call that function directly:
     */
    if (likely(rq->nr_running == rq->cfs.nr_running)) {
        p = fair_sched_class.pick_next_task(rq);
        if (likely(p))
            return p;
    }

    class = sched_class_highest;
    for ( ; ; ) {
        p = class->pick_next_task(rq);
        if (p)
            return p;
        /*
         * Will never be NULL as the idle class always
         * returns a non-NULL p:
         */
        class = class->next;
    }
}
The comment in the code is clear enough. Let's step into the CFS class to see how the pick is actually done (the function name is easy to guess by now):
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
    struct task_struct *p;
    struct cfs_rq *cfs_rq = &rq->cfs;
    struct sched_entity *se;

    if (unlikely(!cfs_rq->nr_running))
        return NULL;

    do {
        se = pick_next_entity(cfs_rq);
        set_next_entity(cfs_rq, se);
        cfs_rq = group_cfs_rq(se);
    } while (cfs_rq);

    p = task_of(se);
    hrtick_start_fair(rq, p);

    return p;
}
Now look at pick_next_entity:
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
    struct sched_entity *se = __pick_next_entity(cfs_rq);

    if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) /* the next/last buddies are left for later analysis */
        return cfs_rq->next;

    if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
        return cfs_rq->last;

    return se;
}
And there is still one level deeper:
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = cfs_rq->rb_leftmost;

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}
So it does indeed return the leftmost node of the red-black tree. Once the leftmost node has been chosen, set_next_entity runs:
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* 'current' is not kept within the tree. */
    if (se->on_rq) {
        /*
         * Any task has to be enqueued before it get to execute on
         * a CPU. So account for the time it spent waiting on the
         * runqueue.
         */
        update_stats_wait_end(cfs_rq, se);
        __dequeue_entity(cfs_rq, se);
    }

    update_stats_curr_start(cfs_rq, se);
    cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
    /*
     * Track our maximum slice length, if the CPU's load is at
     * least twice that of our own weight (i.e. dont track it
     * when there are only lesser-weight tasks around):
     */
    if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
        se->slice_max = max(se->slice_max,
            se->sum_exec_runtime - se->prev_sum_exec_runtime);
    }
#endif
    se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
So the chosen entity is taken out of the tree, and its prev_sum_exec_runtime is refreshed (recall that in the tick handler, the running task's elapsed runtime is computed as sum_exec_runtime minus this prev_sum_exec_runtime and compared with the ideal slice to decide whether it should be switched out). With that, the pick is done and the old task has already been removed from the queue; what remains is the hand-over that lets the new task run, which has nothing to do with scheduler policy, so we skip it for now. We have analyzed process creation and a process voluntarily giving up the CPU, which leaves only one scenario: wakeup. In that scenario, ignoring SMP load balancing, essentially only one thing happens: the woken task is added to the runqueue, the runnable count and the load are updated, the task state is set to TASK_RUNNING, and a check is made whether it can preempt the current task, setting the resched flag if so. That's all, quite simple, so only a fragment is shown:
    update_rq_clock(rq);
    activate_task(rq, p, 1);
    success = 1;

out_running:
    trace_sched_wakeup(rq, p);
    check_preempt_curr(rq, p, sync);

    p->state = TASK_RUNNING;
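Since the wakeup path is not traced line by line here, a rough sketch of what activate_task() plus check_preempt_curr() amount to may help. All names and numbers below are made up; it is a toy model of the behaviour described above, not the kernel code.

#include <stdio.h>
#include <stdbool.h>

/* Toy entity and runqueue: the "runqueue" is just counters plus the
 * currently running entity.                                          */
struct toy_se { unsigned long long vruntime; unsigned long weight; bool on_rq; };

struct toy_rq {
    unsigned int  nr_running;
    unsigned long load_weight;
    struct toy_se *curr;
    bool          need_resched;
};

/* Roughly what activate_task()/enqueue_task_fair() accomplish. */
void toy_activate(struct toy_rq *rq, struct toy_se *se)
{
    se->on_rq = true;              /* the task is runnable again        */
    rq->nr_running++;              /* inc_nr_running() in the real code */
    rq->load_weight += se->weight; /* account_entity_enqueue()          */
}

/* Roughly the decision made by check_preempt_curr(): preempt if the
 * woken entity sits far enough to the "left" (smaller vruntime).      */
void toy_check_preempt(struct toy_rq *rq, struct toy_se *woken,
                       unsigned long long granularity)
{
    if (rq->curr && rq->curr->vruntime > woken->vruntime + granularity)
        rq->need_resched = true;   /* resched_task() would be called    */
}

int main(void)
{
    struct toy_se running = { .vruntime = 5000000, .weight = 1024, .on_rq = true };
    struct toy_se woken   = { .vruntime = 1000000, .weight = 1024 };
    struct toy_rq rq      = { .nr_running = 1, .load_weight = 1024, .curr = &running };

    toy_activate(&rq, &woken);
    toy_check_preempt(&rq, &woken, 1000000);
    printf("nr_running=%u need_resched=%d\n", rq.nr_running, rq.need_resched);
    return 0;
}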