[Bookmarked] The Timer Interrupt and Linux Kernel Scheduling

Repost: A Deep Dive into Linux Process Scheduling (schedule)

Reason for reposting: easy to follow and clearly reasoned.

6. The Timer Interrupt
The timer interrupt is what drives scheduling and preemption in the system: on each tick the kernel updates the running task's time accounting and sets the reschedule flag as needed, which later decides whether a context switch takes place. The code below uses the PowerPC FSL BookE chip ppce500 as the example; other architectures differ in detail but follow the same design.

6.1 Registering the Timer Interrupt
The interrupt handler is registered at the very beginning of boot, in the assembly initialization that runs before start_kernel; once system initialization is complete, the handler is invoked every time the timer interrupt fires.

The PowerPC kernel entry (head) files live under arch/powerpc/kernel/; for the e500 the entry file is head_fsl_booke.S, which defines the interrupt vector table:

interrupt_base:
    /* Critical Input Interrupt */
    CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
    ......
    
    /* Decrementer Interrupt */
    DECREMENTER_EXCEPTION
    ......


The timer interrupt is the DECREMENTER_EXCEPTION entry; the macro is expanded in the header arch/powerpc/kernel/head_booke.h:

#define DECREMENTER_EXCEPTION                              \
    START_EXCEPTION(Decrementer)                          \
    NORMAL_EXCEPTION_PROLOG(DECREMENTER);              \
    lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
    mtspr   SPRN_TSR,r0;        /* Clear the DEC interrupt */          \
    addi    r3,r1,STACK_FRAME_OVERHEAD;                      \
    EXC_XFER_LITE(0x0900, timer_interrupt)

DECREMENTER_EXCEPTION -> EXC_XFER_LITE

#define EXC_XFER_LITE(n, hdlr)        \
    EXC_XFER_TEMPLATE(hdlr, n+1, MSR_KERNEL, NOCOPY, transfer_to_handler, \
              ret_from_except)
 

EXC_XFER_LITE -> EXC_XFER_TEMPLATE

#define EXC_XFER_TEMPLATE(hdlr, trap, msr, copyee, tfer, ret)    \
    li    r10,trap;        /* trap number */            \
    stw    r10,_TRAP(r11);        /* record it in the exception frame */    \
    lis    r10,msr@h;        /* build the MSR value the handler */    \
    ori    r10,r10,msr@l;        /* will run with */            \
    copyee(r10, r9);        /* optionally copy MSR_EE from the saved MSR (NOCOPY here) */    \
    bl    tfer;            /* transfer_to_handler */        \
    .long    hdlr;            /* handler address, fetched by tfer */    \
    .long    ret            /* return path, e.g. ret_from_except */

Now look at the timer_interrupt function itself:

/*
 * timer_interrupt - gets called when the decrementer overflows,
 * with interrupts disabled.
 */
void timer_interrupt(struct pt_regs * regs)
{
    struct pt_regs *old_regs;
    u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
 
    /* Ensure a positive value is written to the decrementer, or else
     * some CPUs will continue to take decrementer exceptions.
     */
    set_dec(DECREMENTER_MAX);
 
    /* Some implementations of hotplug will get timer interrupts while
     * offline, just ignore these and we also need to set
     * decrementers_next_tb as MAX to make sure __check_irq_replay
     * don't replay timer interrupt when return, otherwise we'll trap
     * here infinitely :(
     */
    if (!cpu_online(smp_processor_id())) {
        *next_tb = ~(u64)0;
        return;
    }
 
    /* Conditionally hard-enable interrupts now that the DEC has been
     * bumped to its maximum value
     */
    may_hard_irq_enable();
 
 
#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
    if (atomic_read(&ppc_n_lost_interrupts) != 0)
        do_IRQ(regs);
#endif
 
    old_regs = set_irq_regs(regs);
    irq_enter();
 
    __timer_interrupt();
    irq_exit();
    set_irq_regs(old_regs);
}

timer_interrupt() -> __timer_interrupt():

static void __timer_interrupt(void)
{
    struct pt_regs *regs = get_irq_regs();
    u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
    struct clock_event_device *evt = this_cpu_ptr(&decrementers);
    u64 now;
 
    trace_timer_interrupt_entry(regs);
 
    if (test_irq_work_pending()) {
        clear_irq_work_pending();
        irq_work_run();
    }
 
    now = get_tb_or_rtc();
    if (now >= *next_tb) {
        *next_tb = ~(u64)0;
        if (evt->event_handler)
            evt->event_handler(evt);
        __this_cpu_inc(irq_stat.timer_irqs_event);
    } else {
        now = *next_tb - now;
        if (now <= DECREMENTER_MAX)
            set_dec((int)now);
        /* We may have raced with new irq work */
        if (test_irq_work_pending())
            set_dec(1);
        __this_cpu_inc(irq_stat.timer_irqs_others);
    }
 
#ifdef CONFIG_PPC64
    /* collect purr register values often, for accurate calculations */
    if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
        struct cpu_usage *cu = this_cpu_ptr(&cpu_usage_array);
        cu->current_tb = mfspr(SPRN_PURR);
    }
#endif
 
    trace_timer_interrupt_exit(regs);
}


__timer_interrupt ends up calling evt->event_handler. What is this event_handler, and how does it get registered?

Answer: it is tick_handle_periodic, whose registration and execution flow is:

start_kernel->time_init->init_decrementer_clockevent->register_decrementer_clockevent->clockevents_register_device->tick_check_new_device->tick_setup_periodic->tick_set_periodic_handler->tick_handle_periodic->tick_periodic->update_process_times->scheduler_tick
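
The last link in that chain is forged in tick_set_periodic_handler (kernel/time/tick-common.c), which in the non-broadcast case simply points the device's event_handler at tick_handle_periodic. A sketch from kernels of this vintage:

void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
    /* Set the periodic handler in non broadcast mode */
    if (!broadcast)
        dev->event_handler = tick_handle_periodic;
    else
        dev->event_handler = tick_handle_broadcast;
}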

tick_handle_periodic is where the interrupt event is actually processed. The interrupt handler shown earlier only prepares for it, saving registers and related state to set up the entry path, whereas the event_handler below does the work the interrupt actually exists for. Its definition:

/*
 * Event handler for periodic ticks
 */
void tick_handle_periodic(struct clock_event_device *dev)
{
    int cpu = smp_processor_id();
    ktime_t next = dev->next_event;
 
    tick_periodic(cpu);
 
#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
    /*
     * The cpu might have transitioned to HIGHRES or NOHZ mode via
     * update_process_times() -> run_local_timers() ->
     * hrtimer_run_queues().
     */
    if (dev->event_handler != tick_handle_periodic)
        return;
#endif
 
    if (!clockevent_state_oneshot(dev))
        return;
    for (;;) {
        /*
         * Setup the next period for devices, which do not have
         * periodic mode:
         */
        next = ktime_add(next, tick_period);
 
        if (!clockevents_program_event(dev, next, false))
            return;
        /*
         * Have to be careful here. If we're in oneshot mode,
         * before we call tick_periodic() in a loop, we need
         * to be sure we're using a real hardware clocksource.
         * Otherwise we could get trapped in an infinite
         * loop, as the tick_periodic() increments jiffies,
         * which then will increment time, possibly causing
         * the loop to trigger again and again.
         */
        if (timekeeping_valid_for_hres())
            tick_periodic(cpu);
    }
}
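
For reference, tick_periodic (a sketch based on kernel/time/tick-common.c of the same era) updates jiffies and wall time on the CPU that owns the tick and then charges the tick to the current task; this is where update_process_times enters the picture:

static void tick_periodic(int cpu)
{
    if (tick_do_timer_cpu == cpu) {
        write_seqlock(&jiffies_lock);

        /* Keep track of the next tick event */
        tick_next_period = ktime_add(tick_next_period, tick_period);

        do_timer(1);
        write_sequnlock(&jiffies_lock);
        update_wall_time();
    }

    update_process_times(user_mode(get_irq_regs()));
    profile_tick(CPU_PROFILING);
}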

Moving down the chain: tick_periodic calls update_process_times, which in turn calls scheduler_tick():

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;
 
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(p, user_tick);
        run_local_timers();
        rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif          
        scheduler_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                run_posix_cpu_timers(p);
} 

The next stop is scheduler_tick, which hands the tick over to the scheduling class through curr->sched_class->task_tick(rq, curr, 0):

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *curr = rq->curr;
        struct rq_flags rf;
        
        sched_clock_tick();

        rq_lock(rq, &rf);
                
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
        calc_global_load_tick(rq);
        psi_task_tick(rq);

        rq_unlock(rq, &rf);

        perf_event_task_tick();

#ifdef CONFIG_SMP
        rq->idle_balance = idle_cpu(cpu);
        trigger_load_balance(rq);
#endif
}

As the code shows, scheduler_tick calls into the scheduling class through its task_tick hook. If the current task is scheduled under CFS this resolves to fair_sched_class's task_tick; in rt_sched_class the hook is implemented by task_tick_rt:

static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
    struct sched_rt_entity *rt_se = &p->rt;
 
    update_curr_rt(rq);
 
    watchdog(rq, p);
 
    /*
     * RR tasks need a special form of timeslice management.
     * FIFO tasks have no timeslices.
     */
    if (p->policy != SCHED_RR)
        return;
 
    if (--p->rt.time_slice)
        return;
 
    p->rt.time_slice = sched_rr_timeslice;
 
    /*
     * Requeue to the end of queue if we (and all of our ancestors) are not
     * the only element on the queue
     */
    for_each_sched_rt_entity(rt_se) {
        if (rt_se->run_list.prev != rt_se->run_list.next) {
            requeue_task_rt(rq, p, 0);
            resched_curr(rq);
            return;
        }
    }
}


As the snippet below shows, if the current time slice is not yet used up, the function returns immediately:

    if (--p->rt.time_slice)
        return;

Otherwise the task's real-time slice is refilled to sched_rr_timeslice, the scheduling entity is moved to the tail of the run queue, and resched_curr is called to record the need for rescheduling before returning. This is exactly the RR (Round Robin) policy of real-time scheduling. (The CFS path is analogous: task_tick_fair -> entity_tick eventually calls resched_curr once the task has exceeded its fair share of runtime; a sketch of resched_curr follows.)
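
resched_curr is what actually raises the reschedule request: it sets TIF_NEED_RESCHED on the current task and, if the runqueue belongs to another CPU, kicks that CPU with an IPI. A sketch based on kernel/sched/core.c of comparable kernels:

void resched_curr(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    int cpu;

    lockdep_assert_held(&rq->lock);

    if (test_tsk_need_resched(curr))
        return;

    cpu = cpu_of(rq);

    if (cpu == smp_processor_id()) {
        set_tsk_need_resched(curr);    /* sets TIF_NEED_RESCHED */
        set_preempt_need_resched();
        return;
    }

    if (set_nr_and_not_polling(curr))
        smp_send_reschedule(cpu);      /* IPI the remote CPU */
    else
        trace_sched_wake_idle_without_ipi(cpu);
}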
 

This raises a new question: after TIF_NEED_RESCHED has been set on the task, when does the actual reschedule happen?
There are four usual entry points into the scheduler:

  • return from interrupt (to user mode or kernel mode);
  • return from a system call to user space;
  • the task voluntarily gives up the CPU and calls the scheduler;
  • return after signal handling completes.

Rescheduling driven by timer-interrupt return is case 1. Again taking ppce500 as the example, the various exception return paths all funnel through RET_FROM_EXC_LEVEL, which calls user_exc_return and enters do_work; do_work is the common entry point for this processing:

do_work:            /* r10 contains MSR_KERNEL here */
    andi.    r0,r9,_TIF_NEED_RESCHED    /* reschedule requested? */
    beq    do_user_signal        /* no: go check for signal work instead */


As shown, if the reschedule flag is not set, do_user_signal runs, after which the path falls through recheck to restore_user and returns to the previous context:

do_user_signal:            /* r10 contains MSR_KERNEL here */
        ori    r10,r10,MSR_EE
        SYNC
        MTMSRD(r10)        /* hard-enable interrupts */
        /* save r13-r31 in the exception frame, if not already done */
        lwz    r3,_TRAP(r1)
        andi.    r0,r3,1
        beq    2f
        SAVE_NVGPRS(r1)
        rlwinm    r3,r3,0,0,30
        stw    r3,_TRAP(r1)
    2:    addi    r3,r1,STACK_FRAME_OVERHEAD
        mr    r4,r9
        bl    do_notify_resume
        REST_NVGPRS(r1)
        b    recheck

If the reschedule flag is set, do_resched is called; in entry_32.S you can see that do_resched invokes schedule to perform the actual reschedule:

do_resched:            /* r10 contains MSR_KERNEL here */
    /* Note: We don't need to inform lockdep that we are enabling
     * interrupts here. As far as it knows, they are already enabled
     */
    ori    r10,r10,MSR_EE
    SYNC
    MTMSRD(r10)        /* hard-enable interrupts */
    bl    schedule

The recheck routine, also defined in entry_32.S:

recheck:
    /* Note: And we don't tell it we are disabling them again
    * neither. Those disable/enable cycles used to peek at
    * TI_FLAGS aren't advertised.
    */
    LOAD_MSR_KERNEL(r10,MSR_KERNEL)
    SYNC
    MTMSRD(r10)        /* disable interrupts */
    CURRENT_THREAD_INFO(r9, r1)
    lwz    r9,TI_FLAGS(r9)
    andi.    r0,r9,_TIF_NEED_RESCHED
    bne-    do_resched
    andi.    r0,r9,_TIF_USER_WORK_MASK
    beq    restore_user

6.2 Execution Path of the Timer Interrupt
In the vector macros above there is a bl tfer step. Here tfer is transfer_to_handler or transfer_to_handler_full; for the timer interrupt it is transfer_to_handler, which performs the preparation needed before the interrupt handler proper is invoked. Control then jumps to the handler hdlr, and finally to ret, which is ret_from_except or ret_from_except_full; for the timer interrupt it is ret_from_except. From there, on a preemptible kernel, resume_kernel is reached, which enters preempt_schedule_irq to carry out the reschedule:

/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
    enum ctx_state prev_state;
 
    /* Catch callers which need to be fixed */
    BUG_ON(preempt_count() || !irqs_disabled());
 
    prev_state = exception_enter();
 
    do {
        preempt_disable();
        local_irq_enable();
        __schedule(true);
        local_irq_disable();
        sched_preempt_enable_no_resched();
    } while (need_resched());
 
    exception_exit(prev_state);
}
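
The loop's exit condition ties back to the flag set earlier: need_resched() just tests TIF_NEED_RESCHED on the current task. A sketch from include/linux/sched.h and include/linux/thread_info.h of similar kernels:

static __always_inline bool need_resched(void)
{
    return unlikely(tif_need_resched());
}

#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)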


Next, look at preempt_disable and local_irq_disable. The preempt_count lives in the thread_info:

static __always_inline volatile int *preempt_count_ptr(void)
{
    return &current_thread_info()->preempt_count;
}


Disabling preemption really just increments the current task's preempt_count by 1; the increment is followed by a barrier(), which keeps the compiler from reordering memory accesses across it and thus provides the needed ordering.
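
The generic definition (from include/linux/preempt.h in kernels of this vintage) is exactly that increment-plus-barrier pair:

#define preempt_count_inc() preempt_count_add(1)

#define preempt_disable() \
do { \
    preempt_count_inc(); \
    barrier(); /* compiler barrier: no reordering across this point */ \
} while (0)

The local_irq_* helpers, in turn, are thin wrappers over architecture hooks: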

/*
 * Wrap the arch provided IRQ routines to provide appropriate checks.
 */
#define raw_local_irq_disable()        arch_local_irq_disable()
#define raw_local_irq_enable()        arch_local_irq_enable()
#define raw_local_irq_save(flags)            \
    do {                        \
        typecheck(unsigned long, flags);    \
        flags = arch_local_irq_save();        \
    } while (0)
#define raw_local_irq_restore(flags)            \
    do {                        \
        typecheck(unsigned long, flags);    \
        arch_local_irq_restore(flags);        \
    } while (0)
#define raw_local_save_flags(flags)            \
    do {                        \
        typecheck(unsigned long, flags);    \
        flags = arch_local_save_flags();    \
    } while (0)
#define raw_irqs_disabled_flags(flags)            \
    ({                        \
        typecheck(unsigned long, flags);    \
        arch_irqs_disabled_flags(flags);    \
    })
#define raw_irqs_disabled()        (arch_irqs_disabled())
#define raw_safe_halt()            arch_safe_halt()
 
#define local_irq_enable()    do { raw_local_irq_enable(); } while (0)
#define local_irq_disable()    do { raw_local_irq_disable(); } while (0)
#define local_irq_save(flags)                    \
    do {                            \
        raw_local_irq_save(flags);            \
    } while (0)
#define local_irq_restore(flags) do { raw_local_irq_restore(flags); } while (0)
#define safe_halt()        do { raw_safe_halt(); } while (0)


The architecture-specific IRQ operations are defined as follows:

static inline void arch_local_irq_restore(unsigned long flags)
{
#if defined(CONFIG_BOOKE)
    asm volatile("wrtee %0" : : "r" (flags) : "memory");
#else
    mtmsr(flags);
#endif
}

The timer interrupt is a hardware interrupt. Linux does not nest interrupt handlers, so when the interrupt is taken, local interrupts are disabled again (local_irq_disable) for the duration of the handler.

The arch_local_irq_xxx family of functions:

static inline unsigned long arch_local_irq_save(void)
{
    unsigned long flags = arch_local_save_flags();
#ifdef CONFIG_BOOKE
    asm volatile("wrteei 0" : : : "memory");
#else
    SET_MSR_EE(flags & ~MSR_EE);
#endif
    return flags;
}
 
static inline void arch_local_irq_disable(void)
{
#ifdef CONFIG_BOOKE
    asm volatile("wrteei 0" : : : "memory");
#else
    arch_local_irq_save();
#endif
}
 
static inline void arch_local_irq_enable(void)
{
#ifdef CONFIG_BOOKE
    asm volatile("wrteei 1" : : : "memory");
#else
    unsigned long msr = mfmsr();
    SET_MSR_EE(msr | MSR_EE);
#endif
}
 
static inline bool arch_irqs_disabled_flags(unsigned long flags)
{
    return (flags & MSR_EE) == 0;
}
 
static inline bool arch_irqs_disabled(void)
{
    return arch_irqs_disabled_flags(arch_local_save_flags());
}
 
#define hard_irq_disable()        arch_local_irq_disable()
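
As a usage note (an illustrative sketch, not code from this kernel tree), callers that need local interrupts off around a critical section normally use the save/restore pair rather than a bare disable/enable, so the code stays correct even when interrupts were already disabled on entry:

unsigned long flags;

local_irq_save(flags);      /* disable local interrupts, remembering the prior MSR[EE] */
/* ... access data shared with the interrupt handler ... */
local_irq_restore(flags);   /* put the interrupt state back the way we found it */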


Reposted from blog.csdn.net/don_chiang709/article/details/89181399