schedule()函数详解

 /*
  * schedule - scheduler entry point.
  *
  * Passes the current task to sched_submit_work() first, then calls
  * __schedule() and keeps calling it for as long as need_resched()
  * still reports true after it returns.
  */
 asmlinkage __visible void __sched schedule(void)
 {
     struct task_struct *tsk = current;
 
     sched_submit_work(tsk);
     do {
         __schedule();
     } while (need_resched());
 }

schedule主要完成的工作内容如下：
（1）sched_submit_work用于检测当前进程是否有plugged io需要处理。由于当前进程执行schedule后有可能进入休眠，所以在休眠之前需要把plugged io处理掉，防止死锁。
（2）执行__schedule()，这个函数是调度的核心处理函数，当前CPU会在其中选择下一个合适的进程去执行。
（3）need_resched()执行到这里时说明当前进程已经被调度器再次执行了，此时要判断是否需要再次执行调度。

 /*
  * __schedule - pick the next task for this CPU's runqueue and switch to it.
  *
  * Runs with preemption disabled (preempt_disable() on entry,
  * sched_preempt_enable_no_resched() on exit) and takes rq->lock with
  * interrupts disabled around the task selection. If the picked task is
  * the same as the previous one, no context switch is done and the lock
  * is simply released.
  */
 static void __sched __schedule(void)
 {
     struct task_struct *prev, *next;
     unsigned long *switch_count;
     struct rq *rq;
     int cpu;
 
     preempt_disable();
     cpu = smp_processor_id();
     rq = cpu_rq(cpu);
     rcu_note_context_switch();
     /* The task currently recorded as running on this runqueue. */
     prev = rq->curr;
 
     schedule_debug(prev);
 
     if (sched_feat(HRTICK))
         hrtick_clear(rq);
 
     /*
      * Make sure that signal_pending_state()->signal_pending() below
      * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
      * done by the caller to avoid the race with signal_wake_up().
      */
     smp_mb__before_spinlock();
     raw_spin_lock_irq(&rq->lock);
 
     rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
     /*
      * Default to the nivcsw counter; it is replaced by nvcsw below when
      * prev is in a non-zero state and PREEMPT_ACTIVE is not set.
      * (Presumably involuntary vs. voluntary switch counts - confirm
      * against the task_struct definition.)
      */
     switch_count = &prev->nivcsw;
     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
         if (unlikely(signal_pending_state(prev->state, prev))) {
             /* A signal is pending for this state: keep prev runnable. */
             prev->state = TASK_RUNNING;
         } else {
             /* Take prev off the runqueue and mark it as not queued. */
             deactivate_task(rq, prev, DEQUEUE_SLEEP);
             prev->on_rq = 0;
 
             /*
              * If a worker went to sleep, notify and ask workqueue
              * whether it wants to wake up a task to maintain
              * concurrency.
              */
             if (prev->flags & PF_WQ_WORKER) {
                 struct task_struct *to_wakeup;
 
                 to_wakeup = wq_worker_sleeping(prev, cpu);
                 if (to_wakeup)
                     try_to_wake_up_local(to_wakeup);
             }
         }
         switch_count = &prev->nvcsw;
     }
 
     /* Only refresh the rq clock here when prev is still queued. */
     if (task_on_rq_queued(prev))
         update_rq_clock(rq);
 
     next = pick_next_task(rq, prev);
     /* The reschedule request is being served: clear both resched flags. */
     clear_tsk_need_resched(prev);
     clear_preempt_need_resched();
     rq->clock_skip_update = 0;
 
     if (likely(prev != next)) {
         /* Account the switch on the rq and on the counter chosen above. */
         rq->nr_switches++;
         rq->curr = next;
         ++*switch_count;
 
         /* Continue with the rq returned by context_switch() and its cpu. */
         rq = context_switch(rq, prev, next); /* unlocks the rq */
         cpu = cpu_of(rq);
     } else
         raw_spin_unlock_irq(&rq->lock);
 
     post_schedule(rq);
 
     sched_preempt_enable_no_resched();
 }

（1）内核中定义了一个每CPU变量（percpu）runqueue，用来管理当前CPU上处于running状态的进程。本函数会先获取当前CPU的runqueue，然后通过runqueue结构体获取当前正在运行进程的描述符，并把它赋值给prev。
（2）如果当前进程的state不是running，并且本次调度不是由内核抢占（PREEMPT_ACTIVE）触发的，则判断当前进程是否存在挂起的信号：如果有挂起的信号，那么保持当前进程为running状态，避免反复睡眠唤醒的资源消耗；否则对当前进程执行dequeue操作使其进入休眠状态，并且判断当前进程是否为一个worker，如果是则在workqueue中查看是否有需要被唤醒的worker。
（3）经过前面的逻辑后，判断当前进程prev是否依然存在于runqueue队列中，如果存在说明它并没有休眠下去，那么此时需要执行update_rq_clock。为什么休眠的进程可以跳过此步？因为在休眠dequeue的时候已经执行过此步骤。
（4）执行pick_next_task选择下一个要执行的进程next，这一步实际上会根据sched_class类型的不同调用不同的回调，而不同的调度器实现的目的只有一个就是根据自己的算法找到下一个需要调度的进程。
（5）清除prev进程的tif_need_resched标志
（6）调用clear_preempt_need_resched()清除抢占计数中的need_resched标志（注意：这里清除的是“需要重新调度”的标记，而不是解除禁止抢占）
（7）如果next与prev不是同一个进程，则更新runqueue中的相关数据：nr_switches加一，rq->curr指向next，并累加prev的上下文切换计数
（8）执行context_switch执行上下文切换
（9）执行post_schedule后续的一些操作，根据不同sched class调用不同的post回调

 /*
  * context_switch - switch from @prev to @next on runqueue @rq.
  *
  * Sequence: prepare_task_switch(), set up the address space for @next
  * (switch_mm() or reuse of prev's active_mm), switch_to() for the
  * register/stack state, then finish_task_switch().
  *
  * Returns the value of finish_task_switch(prev).
  */
 static inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
            struct task_struct *next)
 {
     struct mm_struct *mm, *oldmm;
 
     prepare_task_switch(rq, prev, next);
 
     mm = next->mm;
     oldmm = prev->active_mm;
     /*
      * For paravirt, this is coupled with an exit in switch_to to
      * combine the page table reload and the switch backend into
      * one hypercall.
      */
     arch_start_context_switch(prev);
 
     /*
      * next has no mm of its own: let it run on prev's active_mm, taking
      * an extra mm_count reference, and skip switch_mm(). Otherwise
      * switch to next's own mm.
      */
     if (!mm) {
         next->active_mm = oldmm;
         atomic_inc(&oldmm->mm_count);
         enter_lazy_tlb(oldmm, next);
     } else
         switch_mm(oldmm, mm, next);
 
     /*
      * prev had no mm of its own: detach its borrowed active_mm and park
      * it in rq->prev_mm (presumably released later in
      * finish_task_switch() - confirm there).
      */
     if (!prev->mm) {
         prev->active_mm = NULL;
         rq->prev_mm = oldmm;
     }
     /*
      * Since the runqueue lock will be released by the next
      * task (which is an invalid locking op but in the case
      * of the scheduler it's an obvious special-case), so we
      * do an early lockdep release here:
      */
     spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
     context_tracking_task_switch(prev, next);
     /* Here we just switch the register state and the stack. */
     switch_to(prev, next, prev);
     barrier();
 
     return finish_task_switch(prev);
 }

context_switch函数是用于执行进程切换的，它的主要任务有：
（1）切换两个进程的虚拟地址空间，切换内存页表，通过函数switch_mm实现
（2）保存prev进程的栈信息和寄存器信息，然后恢复next进程栈信息和寄存器信息，通过switch_to实现
（3）prepare_task_switch和finish_task_switch成对出现，用于准备和清理进程切换相关的数据结构

内存空间切换

	/* mm is next->mm; oldmm is prev->active_mm. */
	if (!mm) {
         /* next has no mm: reuse oldmm, take a reference, no switch_mm(). */
         next->active_mm = oldmm;
         atomic_inc(&oldmm->mm_count);
         enter_lazy_tlb(oldmm, next);
     } else
         switch_mm(oldmm, mm, next);
 
     /* prev had no mm: drop its borrowed active_mm, remember it on the rq. */
     if (!prev->mm) {
         prev->active_mm = NULL;
         rq->prev_mm = oldmm;
     }

这部分之所以区分mm是否为空，主要是为了区分处理内核线程：内核线程的mm为空，改用active_mm作为内存管理结构，其中包含了内核页表。对于next->mm为空的情况（next是内核线程），直接把next->active_mm设置为prev->active_mm，即借用上一个进程的地址空间作为该内核线程的地址空间。而对于用户进程，active_mm被设置为等于mm，这一步是在fork的时候做的。
针对内核线程不用执行switch_mm，因为内核页表都应该是一样的，从而减少了开销。

CPU寄存器和栈切换

/*
 * switch_to - call __switch_to(prev, next) and store its return value
 * in @last. Wrapped in do { } while (0) so it can be used as a single
 * statement.
 */
#define switch_to(prev, next, last)                 \
    do {                                \
        ((last) = __switch_to((prev), (next)));         \
    } while (0)

__switch_to是平台相关的实现，主要实现思想如下：
（1）先把当前进程的所有寄存器信息和栈信息（sp寄存器）存入当前进程的stack中
（2）从next进程中取出所有的寄存器信息以及栈信息（sp寄存器），并恢复到CPU中

current进程的更新

/* Register variable bound to "sp": reading it yields the current stack pointer. */
register unsigned long current_stack_pointer asm ("sp");
 
/*
 * current_thread_info - locate the thread_info from the stack pointer.
 *
 * Clears the low bits of current_stack_pointer with ~(THREAD_SIZE - 1),
 * i.e. rounds it down to a THREAD_SIZE boundary (THREAD_SIZE is assumed
 * to be a power of two), and returns that address as a thread_info pointer.
 */
static inline struct thread_info *current_thread_info(void)
{
    return (struct thread_info *)
        (current_stack_pointer & ~(THREAD_SIZE - 1));
}


/* current expands to the task pointer stored in the thread_info found via sp. */
#define get_current() (current_thread_info()->task)
#define current get_current()

每个进程都会有两个堆栈（用户栈和内核栈），这里我们只关注内核栈。内核预留了一部分空间作为每个进程的内核栈，其定义如下：

/*
 * Overlays struct thread_info on an array of THREAD_SIZE/sizeof(long)
 * longs. As union members, both start at the same (lowest) address of
 * the area.
 */
union thread_union {
    struct thread_info thread_info;
    unsigned long stack[THREAD_SIZE/sizeof(long)];
};

thread_info存放在预留空间stack的低地址位置。而stack高地址位置是内核栈底，sp寄存器是栈顶，内核栈是递减向下栈。

current进程的更新

猜你喜欢