[Process Management] Process Scheduling

This section examines the problems surrounding process scheduling.


In a multi-process OS, process scheduling is a global and critical issue: it has a decisive influence on the overall design, the implementation, the feature set, and every aspect of the system's performance. The speed of the process switch made on the basis of a scheduling decision is also an important measure of an operating system's performance. A good process-scheduling mechanism must balance the needs of three different kinds of applications:

(1) Interactive applications: these emphasize the system's response time, so that every user (or application) feels as if it has the OS to itself; once the response latency exceeds roughly 150 ms, the user notices it.

(2) Batch applications: batch jobs usually run in the background, so response time hardly matters, but the total time needed to complete a job is an important factor.

(3) Real-time applications: these must consider not only response time (the time between an event occurring and the system starting to execute the corresponding handler) but also execution time (whether the handler can finish within its deadline); most important of all, program execution must be predictable.


Basics

(1) Voluntary scheduling can happen at any time. A process can trigger a round of scheduling by calling schedule(); it can also set its own state to TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE before calling schedule(), temporarily giving up the CPU and going to sleep. In user space the same effect can be achieved through the pause() system call. A process can also give up the CPU for a bounded time: the kernel has schedule_timeout(), and user space has the nanosleep() system call (sleep() is a library function that ultimately calls this system call). Giving up the CPU inside the kernel is invisible to the user, whereas giving it up from user space is visible.

(2) Scheduling can also be involuntary. User preemption takes place on the eve of every return from a system call, and on the eve of every return from an interrupt or exception handler (these returns go back to user space, which means that only interrupts or exceptions taken while the CPU was running in user space can force a reschedule). Kernel preemption (some parts of the kernel must be executed by only one CPU at a time, and kernel preemption must be disabled there; these non-preemptible regions have already been identified by the SMP implementation; preemption is allowed when preempt_count in thread_info is 0, otherwise the current process holds a lock and must not be preempted) takes place when an interrupt handler finishes and is about to return to kernel space, at the point where kernel code becomes preemptible again. This restriction also means that if several more interrupts arrive while the kernel is in system space with preempt_count non-zero, control does not return to user space, which can hurt Linux's real-time behavior. When the need_resched flag (TIF_NEED_RESCHED in thread_info in the kernels shown here) is non-zero, a reschedule takes place (it can be set through a system call, when the process blocks inside a call for some reason, when the process has run too long, or when it wakes up another process).

(3) Linux kernel scheduling can be described as conditional preemption. While a process is in user space, whether it likes it or not, the kernel will take the CPU away from it whenever necessary and let another process run; but once it enters system space, it will not lose the CPU until just before it returns to user space. The default policy is priority-based scheduling: the scheduler picks the process with the highest priority to run, and that process loses eligibility as it accumulates run time, so by the next scheduling decision a process that originally had lower priority may get to run; when the eligibility of all processes has dropped to 0, the eligibility of every process is recomputed. Other policies can be selected through sched_setscheduler(): SCHED_FIFO suits time-critical, short-running, real-time-leaning processes; SCHED_RR is round-robin, suited to longer-running real-time processes; SCHED_OTHER is the traditional policy, suited to interactive applications. A minimal user-space sketch follows.
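As a concrete illustration, here is a minimal user-space sketch of choosing a policy with sched_setscheduler() and voluntarily yielding with nanosleep(); the priority value 10 and the 100 ms sleep are arbitrary choices, and requesting SCHED_FIFO normally requires root privileges:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	/* Request the SCHED_FIFO real-time policy at rt priority 10
	 * (valid priorities for FIFO/RR run from 1 to 99). */
	struct sched_param sp = { .sched_priority = 10 };
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");	/* usually needs root */

	/* Voluntarily give up the CPU for about 100 ms: the user-space
	 * counterpart of the kernel's schedule_timeout(). */
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	nanosleep(&ts, NULL);
	return 0;
}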


Scheduling-related fields in task_struct

	int prio, static_prio, normal_prio; // the three priorities:
	// dynamic priority (the one the scheduler actually considers)
	// static priority (assigned when the process starts; changeable with nice() and sched_setscheduler())
	// normal priority (computed from the static priority and the scheduling policy)

	unsigned int rt_priority;		// real-time priority, from 0 (lowest) to 99 (highest); larger means higher priority

	const struct sched_class *sched_class;
	struct sched_entity se;			// the schedulable entity
	struct sched_rt_entity rt;


	unsigned int policy;	// SCHED_NORMAL is for ordinary processes, handled by the completely fair scheduler (CFS);
							// SCHED_BATCH is for CPU-bound batch processing and never preempts the CF scheduler;
							// SCHED_RR and SCHED_FIFO are for soft real-time processes, handled by the real-time scheduler rather than CFS

	// task_has_rt_policy() tests whether policy is one of the real-time ones (SCHED_FIFO or SCHED_RR)
	cpumask_t cpus_allowed;	// the set of CPUs this process is allowed to run on (see the affinity sketch below)
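From user space, cpus_allowed is normally changed through the affinity system calls. A minimal sketch, assuming at least one CPU numbered 0; the pid argument of 0 means the calling process:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	CPU_ZERO(&set);
	CPU_SET(0, &set);	/* allow only CPU 0 */

	/* This ends up updating the task's cpus_allowed mask in the kernel. */
	if (sched_setaffinity(0, sizeof(set), &set) == -1)
		perror("sched_setaffinity");
	return 0;
}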


Periodic scheduling

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
// the periodic scheduler: the timer code invokes this function HZ times per second
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);		// the runqueue (rq) of the current CPU
	struct task_struct *curr = rq->curr;

	sched_clock_tick();

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);		// update the runqueue clock
	update_cpu_load_active(rq);	// maintain cpu_load[] in the runqueue

	curr->sched_class->task_tick(rq, curr, 0);		// task_tick is supplied by the underlying scheduler class
	// For CFS, this is where the scheduler checks whether the current process has been running too long,
	// to avoid excessive latencies; if the process should be rescheduled, its TIF_NEED_RESCHED flag is set
	// to record the request, and the kernel completes it at the next suitable opportunity.


	raw_spin_unlock(&rq->lock);

	perf_event_task_tick();

#ifdef CONFIG_SMP
	rq->idle_at_tick = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);
#endif
}
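What "setting TIF_NEED_RESCHED" amounts to can be sketched as follows; demo_task_tick() and ran_too_long() are hypothetical names invented here for illustration, but set_tsk_need_resched() is the real helper the scheduler classes use:

/* Hypothetical task_tick() body, for illustration only. */
static void demo_task_tick(struct rq *rq, struct task_struct *curr, int queued)
{
	if (ran_too_long(curr))			/* hypothetical policy check */
		set_tsk_need_resched(curr);	/* sets TIF_NEED_RESCHED in curr's thread_info */
}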

Deciding between user space and kernel space on return

    # userspace resumption stub bypassing syscall exit tracing  
    ALIGN  
    RING0_PTREGS_FRAME  
ret_from_exception:  
    preempt_stop(CLBR_ANY)  
ret_from_intr:      # return to the interrupted context
    GET_THREAD_INFO(%ebp)  
check_userspace:  
    movl PT_EFLAGS(%esp), %eax  # mix EFLAGS and CS  
    movb PT_CS(%esp), %al  
    andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax  
    cmpl $USER_RPL, %eax        # did the event occur in user space (syscall, or interrupt taken in user mode)?
    jb resume_kernel        # not returning to v8086 or userspace: it occurred in kernel space
  
ENTRY(resume_userspace)  
    LOCKDEP_SYS_EXIT  
    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt  
                    # setting need_resched or sigpending  
                    # between sampling and the iret  
    TRACE_IRQS_OFF  
    movl TI_flags(%ebp), %ecx  
    andl $_TIF_WORK_MASK, %ecx  # is there any work to be done on  
                    # int/exception return?  
    jne work_pending  
    jmp restore_all  
END(ret_from_exception)  
  
#ifdef CONFIG_PREEMPT  
ENTRY(resume_kernel)  
    DISABLE_INTERRUPTS(CLBR_ANY)  
    cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count? i.e. is preemption currently allowed?
    jnz restore_all     # not allowed: restore the context and let the interrupted kernel code continue
need_resched:           # kernel preemption point
    movl TI_flags(%ebp), %ecx   # need_resched set ?
    testb $_TIF_NEED_RESCHED, %cl   # if _TIF_NEED_RESCHED is clear there is no process to switch to
    jz restore_all      # nothing to do: restore the context
    testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
    jz restore_all
    call preempt_schedule_irq   # reschedule
    jmp need_resched
END(resume_kernel)  


The work done when returning to user space is as follows

work_pending:  
    testb $_TIF_NEED_RESCHED, %cl  
    jz work_notifysig   # no reschedule requested: go handle pending signals; otherwise reschedule first, signals after
work_resched:  
    call schedule            # reschedule
    LOCKDEP_SYS_EXIT  
    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt  
                    # setting need_resched or sigpending  
                    # between sampling and the iret  
    TRACE_IRQS_OFF  
    movl TI_flags(%ebp), %ecx  
    andl $_TIF_WORK_MASK, %ecx  # is there any work to be done other  
                    # than syscall tracing?  
    jz restore_all  
    testb $_TIF_NEED_RESCHED, %cl  
    jnz work_resched  
  
 # handle signals
work_notifysig:             # deal with pending signals and  
                    # notify-resume requests  
#ifdef CONFIG_VM86  
    testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)  
    movl %esp, %eax  
    jne work_notifysig_v86      # returning to kernel-space or  
                    # vm86-space  
    xorl %edx, %edx  
    call do_notify_resume  
    jmp resume_userspace_sig  

Scheduling on kernel preemption

/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	do {
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		schedule();		// reschedule
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());	// loop while the current process's reschedule flag is still set
}
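The loop condition need_resched() is just a test of the TIF_NEED_RESCHED flag; in kernels of this vintage it is essentially:

static inline int need_resched(void)
{
	/* TIF_NEED_RESCHED lives in the flags of the current task's thread_info. */
	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
}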

Voluntary scheduling

/*
 * schedule() is the main scheduler function.
 */
// voluntary (direct) invocation of the scheduler
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();  // disable kernel preemption by incrementing preempt_count
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);		// the local runqueue
	rcu_note_context_switch(cpu);
	prev = rq->curr;			// current

	release_kernel_lock(prev); // make sure we are not holding the big kernel lock
need_resched_nonpreemptible:

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	raw_spin_lock_irq(&rq->lock);
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { // prev is not runnable and this is not a mid-kernel preemption: it really wants to sleep
		if (unlikely(signal_pending_state(prev->state, prev))) { // a signal is pending for this interruptible sleep
			prev->state = TASK_RUNNING;	// give it another chance to run and handle the signal
		} else {
			/*
			 * If a worker is going to sleep, notify and
			 * ask workqueue whether it wants to wake up a
			 * task to maintain concurrency.  If so, wake
			 * up the task.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
			deactivate_task(rq, prev, DEQUEUE_SLEEP);	// prev is not runnable, so remove it from the runqueue
		}
		switch_count = &prev->nvcsw;
	}

	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))	// no runnable process left on this runqueue: try to pull one from another runqueue
		idle_balance(cpu, rq);

	put_prev_task(rq, prev);    // tell the scheduler class that the running process is about to be replaced
	next = pick_next_task(rq);	// select the next process to run
	clear_tsk_need_resched(prev);  // clear prev's TIF_NEED_RESCHED reschedule flag
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		sched_info_switch(prev, next);
		perf_event_task_sched_out(prev, next);

		rq->nr_switches++;
		rq->curr = next;	// the process about to run
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */	// architecture-specific: performs the actual low-level switch
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */

		// The previous process stopped running exactly here, and the new process has taken over the CPU;
		// when the previous process is run again later, it resumes execution at precisely this point:

		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	post_schedule(rq);

	if (unlikely(reacquire_kernel_lock(prev)))
		goto need_resched_nonpreemptible;

	preempt_enable_no_resched();
	if (need_resched())			// check whether TIF_NEED_RESCHED has been set again; prev no longer points at the right task here, so the flag is reached through current
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
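Tying this back to voluntary scheduling: the canonical pattern by which kernel code sleeps until some condition becomes true is built directly on schedule(). A minimal sketch, with the condition and the wait-queue bookkeeping elided:

	/* Sketch of the classic voluntary-sleep loop built on schedule(). */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);	/* mark ourselves not runnable */
		if (condition)				/* hypothetical wake-up condition */
			break;
		schedule();				/* give up the CPU until woken */
	}
	set_current_state(TASK_RUNNING);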


Context switch

// context switch
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;		// the previously active mm
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {						// kernel threads have no address space of their own
		next->active_mm = oldmm;	// borrow the previous task's mm
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next); // tell the architecture code that no virtual-address-space switch is needed
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;		// detach the borrowed mm
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);  // the actual register/stack switch

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);	// finish the cleanup and release the lock properly
}
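The mm_struct borrowed by a kernel thread is handed back in finish_task_switch(), which drops the reference taken above; in outline (only the relevant lines, not the full function):

	/* Inside finish_task_switch(): release the mm a kernel thread borrowed. */
	struct mm_struct *mm = rq->prev_mm;	/* stashed there by context_switch() */
	rq->prev_mm = NULL;
	if (mm)
		mmdrop(mm);	/* balances the atomic_inc(&oldmm->mm_count) above */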

/* Save restore flags to clear handle leaking NT */
// last identifies the task we actually switched back from: when prev later resumes here, the task that
// handed the CPU back may no longer be next, and last records which one it was, so prev's execution flow
// can be restored correctly
#define switch_to(prev, next, last) \
	asm volatile(SAVE_CONTEXT					  \
	     "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP *,先将esp的内容保存pre的thread.sp/	  \
	     "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP,然后将next的thread.sp替换到现在的esp中 */	  \
	     "call __switch_to\n\t"					  \
	     "movq "__percpu_arg([current_task])",%%rsi\n\t"		  \
	     __switch_canary						  \
	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
	     "movq %%rax,%%rdi\n\t" 					  \
	     "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"		  \
	     "jnz   ret_from_fork\n\t"					  \
	     RESTORE_CONTEXT						  \
	     : "=a" (last)					  	  \
	       __switch_canary_oparam					  \
	     : [next] "S" (next), [prev] "D" (prev),	//输入部,分别将next, pre放入到esi和edi寄存器			  \
	       [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \	
	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \	//
	       [_tif_fork] "i" (_TIF_FORK),			  	  \
	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
	       [current_task] "m" (current_task)			  \//立即数保存对应的值
	       __switch_canary_iparam					  \
	     : "memory", "cc" __EXTRA_CLOBBER)
#endif