Linux内核学习011——进程管理(七)

进程终结

当一个进程终结时，内核必须释放其所占有的资源，并通知其父进程。

一般而言，进程的结束是由自身引起的。进程终结发生在进程调用exit()系统调用时，无论是显式调用，还是隐式地从某个程序的主函数返回。当进程终结时，主要是依靠do_exit()完成，该函数定义于Linux2.6.34/kernel/exit.c#L900。其主要完成以下工作：

将task_struct中的标志成员设置为PF_EXITING，以表示进程正在被删除
调用del_timer_sync()删除任一内核定时器，根据返回的结果，它确保没有定时器在排队，也没有定时器处理程序在运行
若BSD的进程记账功能开启，则调用acct_update_integrals()来输出记账信息
调用exit_mm()释放进程占用的mm_struct，若没有别的进程还在使用，则彻底释放
接着调用exit_sem()函数，若进程在排队等待IPC信号量，则离开队列
调用exit_files()和exit_fs()，以分别递减文件描述符、文件系统数据的引用计数。若其中的某个计数降为0，则可以释放该资源。
接着将task_struct中的exit_code设置为exit()提供的退出值，或者去完成任何其他由内核机制规定的退出动作，退出代码存放在此处供父进程检索
调用exit_notify()向父进程发送信号，为本进程的子进程(若存在)寻找新的父进程，新的父进程要么是线程组中的其他线程，要么是init进程，并把进程状态设置为EXIT_ZOMBIE
调用schedule()切换到新的进程。因为处于EXIT_ZOMBIE状态的进程不会再被调度，所以这是进程执行的最后一段代码

do_exit()函数如下：


/*
 * do_exit - tear down the current task; this function does not return.
 * @code: exit code; handed to the exit tracehook and the accounting
 *        hooks and stored in tsk->exit_code.
 *
 * Operates on `current`. In order, it: sanity-checks the calling
 * context, marks the task as exiting (via exit_signals()), gathers
 * accounting data, releases the per-task resources (mm, semaphores,
 * files, fs, thread state, cgroup, ...), notifies the parent through
 * exit_notify(), and finally sets the task state to TASK_DEAD and
 * calls schedule() for the last time.
 */
NORET_TYPE void do_exit(long code)
{
    struct task_struct *tsk = current;
    int group_dead;

    profile_task_exit(tsk);

    WARN_ON(atomic_read(&tsk->fs_excl));

    /* Exiting from interrupt context or as the idle task (pid 0) is fatal. */
    if (unlikely(in_interrupt()))
        panic("Aiee, killing interrupt handler!");
    if (unlikely(!tsk->pid))
        panic("Attempted to kill the idle task!");

    /* code is passed by address; presumably the hook may adjust it - TODO confirm */
    tracehook_report_exit(&code);

    validate_creds_for_do_exit(tsk);

    /*
     * We're taking recursive faults here in do_exit. Safest is to just
     * leave this task alone and wait for reboot.
     */
    if (unlikely(tsk->flags & PF_EXITING)) {
        printk(KERN_ALERT
            "Fixing recursive fault but reboot is needed!\n");
        /*
         * We can do this unlocked here. The futex code uses
         * this flag just to verify whether the pi state
         * cleanup has been done or not. In the worst case it
         * loops once more. We pretend that the cleanup was
         * done as there is no way to return. Either the
         * OWNER_DIED bit is set by now or we push the blocked
         * task into the wait for ever nirwana as well.
         */
        tsk->flags |= PF_EXITPIDONE;
        set_current_state(TASK_UNINTERRUPTIBLE);
        schedule();
    }

    exit_irq_thread();

    exit_signals(tsk);  /* sets PF_EXITING */
    /*
     * tsk->flags are checked in the futex code to protect against
     * an exiting task cleaning up the robust pi futexes.
     */
    smp_mb();
    raw_spin_unlock_wait(&tsk->pi_lock);

    /* Only logged, not fatal: the task is exiting while in atomic context. */
    if (unlikely(in_atomic()))
        printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
                current->comm, task_pid_nr(current),
                preempt_count());

    acct_update_integrals(tsk);
    /* sync mm's RSS info before statistics gathering */
    if (tsk->mm)
        sync_mm_rss(tsk, tsk->mm);
    /*
     * group_dead is nonzero when this decrement takes signal->live to
     * zero; it gates the group-wide cleanup steps below.
     */
    group_dead = atomic_dec_and_test(&tsk->signal->live);
    if (group_dead) {
        /* Cancel the group's real timer and its interval timers. */
        hrtimer_cancel(&tsk->signal->real_timer);
        exit_itimers(tsk->signal);
        if (tsk->mm)
            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
    }
    acct_collect(code, group_dead);
    if (group_dead)
        tty_audit_exit();
    if (unlikely(tsk->audit_context))
        audit_free(tsk);

    /* Record the exit code in the task_struct. */
    tsk->exit_code = code;
    taskstats_exit(tsk, group_dead);

    /* Give up the address space (mm_struct). */
    exit_mm(tsk);

    if (group_dead)
        acct_process();
    trace_sched_process_exit(tsk);

    /* Release IPC semaphore state, open files and filesystem data. */
    exit_sem(tsk);
    exit_files(tsk);
    exit_fs(tsk);
    check_stack_usage();
    exit_thread();
    cgroup_exit(tsk, 1);

    if (group_dead)
        disassociate_ctty(1);

    /* Drop the reference held on the exec domain's module. */
    module_put(task_thread_info(tsk)->exec_domain->module);

    proc_exit_connector(tsk);

    /*
     * FIXME: do that only when needed, using sched_exit tracepoint
     */
    flush_ptrace_hw_breakpoint(tsk);
    /*
     * Flush inherited counters to the parent - before the parent
     * gets woken up by child-exit notifications.
     */
    perf_event_exit_task(tsk);

    /* Child-exit notification; must come after the perf flush above. */
    exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
    mpol_put(tsk->mempolicy);
    tsk->mempolicy = NULL;
#endif
#ifdef CONFIG_FUTEX
    /* Free the cached pi_state, if one is present. */
    if (unlikely(current->pi_state_cache))
        kfree(current->pi_state_cache);
#endif
    /*
     * Make sure we are holding no locks:
     */
    debug_check_no_locks_held(tsk);
    /*
     * We can do this unlocked here. The futex code uses this flag
     * just to verify whether the pi state cleanup has been done
     * or not. In the worst case it loops once more.
     */
    tsk->flags |= PF_EXITPIDONE;

    if (tsk->io_context)
        exit_io_context(tsk);

    if (tsk->splice_pipe)
        __free_pipe_info(tsk->splice_pipe);

    validate_creds_for_do_exit(tsk);

    /*
     * With preemption disabled, mark the task dead and switch away.
     * schedule() is not expected to return here, hence the BUG().
     */
    preempt_disable();
    exit_rcu();
    /* causes final put_task_struct in finish_task_switch(). */
    tsk->state = TASK_DEAD;
    schedule();
    BUG();
    /* Avoid "noreturn function does return".  */
    for (;;)
        cpu_relax();    /* For when BUG is null */
}

至此，与进程相关联的所有资源都被释放掉了，此时进程不可运行，并处于EXIT_ZOMBIE退出状态。它此时所占用的内存仅有内核栈、thread_info结构体和task_struct结构体。此时进程存在的唯一目的就是向父进程提供信息。当父进程检索到信息或者通知内核那是无关信息后，由进程所持有的剩余内存被释放。

删除进程描述符

在调用do_exit()之后，进程已经僵死不能运行了，但是系统还保留了它的进程描述符。因此，进程终结时所需的清理工作和进程描述符的删除是被分开执行的。

wait一族函数都是通过唯一的一个系统调用wait4()来实现的。wait4()会挂起调用它的进程，直到其中的一个子进程退出，此时函数会返回该子进程的PID。此外，调用该函数时提供的指针所指向的位置会保存子进程退出时的退出代码。

当最终需要释放进程描述符时，会调用release_task()函数，该函数定义在Linux2.6.34/kernel/exit.c#L168，完成以下工作：

调用__exit_signal()函数，该函数会调用__unhash_process()，后者会调用detach_pid()从pidhash上删除该进程，同时也会在任务列表中删除该进程。
__exit_signal()会释放目前僵死进程所使用的所有剩余资源，并进行最终统计和记录
若该进程是线程组的最后一个进程，且领头进程已经死掉，那么release_task()需要通知僵死的领头进程的父进程
release_task()调用put_task_struct()释放进程内核栈和thread_info结构所占的页，并释放task_struct所占的高速缓存

通过上述操作，就完成了进程描述符和所有进程独享资源的释放。

具体代码如下：

/*
 * release_task - final teardown of a dead task's descriptor.
 * @p: the task to release.
 *
 * Decrements the owning user's process count, calls proc_flush_task(),
 * and then, with tasklist_lock held for writing, runs __exit_signal()
 * on the task. If @p is a non-leader thread whose group leader is a
 * zombie and whose thread group is otherwise empty, the leader's parent
 * is notified; if the leader turns out to be detached after that, it is
 * marked EXIT_DEAD and the whole sequence is repeated for the leader.
 * The final put of the task_struct is deferred through call_rcu().
 */
void release_task(struct task_struct * p)
{
    struct task_struct *leader;
    int zap_leader;
repeat:
    tracehook_prepare_release_task(p);
    /* don't need to get the RCU readlock here - the process is dead and
     * can't be modifying its own credentials. But shut RCU-lockdep up */
    rcu_read_lock();
    atomic_dec(&__task_cred(p)->user->processes);
    rcu_read_unlock();

    /* presumably drops the task's /proc entries - TODO confirm */
    proc_flush_task(p);

    /* tasklist_lock is write-held (IRQs off) until the unlock below. */
    write_lock_irq(&tasklist_lock);
    tracehook_finish_release_task(p);
    __exit_signal(p);

    /*
     * If we are the last non-leader member of the thread
     * group, and the leader is zombie, then notify the
     * group leader's parent process. (if it wants notification.)
     */
    zap_leader = 0;
    leader = p->group_leader;
    if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
        BUG_ON(task_detached(leader));
        do_notify_parent(leader, leader->exit_signal);
        /*
         * If we were the last child thread and the leader has
         * exited already, and the leader's parent ignores SIGCHLD,
         * then we are the one who should release the leader.
         *
         * do_notify_parent() will have marked it self-reaping in
         * that case.
         */
        zap_leader = task_detached(leader);

        /*
         * This maintains the invariant that release_task()
         * only runs on a task in EXIT_DEAD, just for sanity.
         */
        if (zap_leader)
            leader->exit_state = EXIT_DEAD;
    }

    write_unlock_irq(&tasklist_lock);
    release_thread(p);
    /* Defer the final put of the task_struct to an RCU callback. */
    call_rcu(&p->rcu, delayed_put_task_struct);

    /* If the leader must be reaped as well, run the sequence again on it. */
    p = leader;
    if (unlikely(zap_leader))
        goto repeat;
}