Linux内核学习009——进程管理(五)

进程创建

许多操作系统提供了产生(spawn)进程的机制：首先在新的地址空间创建进程，读入可执行文件，然后开始执行。Linux的进程创建有些不同，它将上述步骤分解到两个单独的函数中去执行：fork()和exec函数族。过程如下：

首先通过fork()函数拷贝当前进程创建一个子进程，子进程与父进程的区别在于PID、PPID(父进程PID)和某些资源和统计量。
接着，使用exec函数族中的一个函数读取并载入可执行文件到地址空间开始执行。

注：exec函数族指的是exec系列函数，其定义在unistd.h头文件中，包括：execve()、fexecve()、execv()、execle()、execl()、execvp()、execlp()、execvpe()。想要查看每个函数的区别和详细用法，可以查询man手册。

/* Prototypes of the exec family, as quoted from unistd.h.
 *
 * Reading the per-function comments below, the name suffixes follow a
 * pattern:
 *   l  - arguments are passed as a NULL-terminated variadic list
 *   v  - arguments are passed as a NULL-terminated ARGV array
 *   p  - FILE is looked up via the `PATH' environment variable when it
 *        contains no slashes
 *   e  - the environment is supplied by the caller instead of being
 *        taken from `environ'
 *
 * NOTE(review): return-value semantics are not shown in this excerpt;
 * consult the man pages before relying on them.
 */

/* Replace the current process, executing PATH with arguments ARGV and
   environment ENVP.  ARGV and ENVP are terminated by NULL pointers.  */
extern int execve (__const char *__path, char *__const __argv[],
                   char *__const __envp[]) __THROW __nonnull ((1, 2)); 

#ifdef __USE_XOPEN2K8
/* Execute the file FD refers to, overlaying the running program image.
   ARGV and ENVP are passed to the new program, as for `execve'.
   Only declared when __USE_XOPEN2K8 is defined.  */
extern int fexecve (int __fd, char *__const __argv[], char *__const __envp[])
     __THROW __nonnull ((2));
#endif


/* Execute PATH with arguments ARGV and environment from `environ'.  */
extern int execv (__const char *__path, char *__const __argv[])
     __THROW __nonnull ((1, 2)); 

/* Execute PATH with all arguments after PATH until a NULL pointer,
   and the argument after that for environment.  */
extern int execle (__const char *__path, __const char *__arg, ...) 
     __THROW __nonnull ((1, 2)); 

/* Execute PATH with all arguments after PATH until
   a NULL pointer and environment from `environ'.  */
extern int execl (__const char *__path, __const char *__arg, ...)
     __THROW __nonnull ((1, 2));

/* Execute FILE, searching in the `PATH' environment variable if it contains
   no slashes, with arguments ARGV and environment from `environ'.  */
extern int execvp (__const char *__file, char *__const __argv[])
     __THROW __nonnull ((1, 2));

/* Execute FILE, searching in the `PATH' environment variable if
   it contains no slashes, with all arguments after FILE until a
   NULL pointer and environment from `environ'.  */
extern int execlp (__const char *__file, __const char *__arg, ...)
     __THROW __nonnull ((1, 2));

#ifdef __USE_GNU
/* Execute FILE, searching in the `PATH' environment variable if it contains
   no slashes, with arguments ARGV and environment ENVP supplied by the
   caller.  Only declared when __USE_GNU is defined.  */
extern int execvpe (__const char *__file, char *__const __argv[],
                    char *__const __envp[])
     __THROW __nonnull ((1, 2));
#endif

写时复制

传统的fork()系统调用直接把所有的资源复制给新创建的进程，这种实现过于简单且效率低下(比如：新创建的进程立即执行一个新的可执行文件，那么所有的拷贝都将前功尽弃)。因此Linux的fork()采用了写时复制实现。写时复制并不复制整个进程的地址空间，而是让父进程和子进程共享同一份数据，只有到需要写入时才会复制数据。这样当数据不会写入的时候，就节省了复制的开销。因此fork()的实际开销就是复制父进程的页表以及给子进程创建唯一的进程描述符了。

fork()

Linux通过clone()系统调用实现了fork()函数(fork等这些库函数都是由glibc提供的)，该调用通过一系列的参数标志来指明父子进程需要共享的资源。fork()、vfork()、__clone()库函数都根据自身所需要的参数标志调用clone()，然后由clone()调用do_fork()函数。

 #define _GNU_SOURCE
#include <sched.h>

/* Library-level prototype of clone().
 *
 * flags carries the clone flag bits; per the surrounding article these
 * select which resources the parent and child share.
 *
 * NOTE(review): presumably fn is the function the child starts in, arg is
 * passed to it, and child_stack is the stack the child runs on -- confirm
 * against the clone(2) man page.  The trailing variadic arguments are the
 * optional ones named in the inline comment below.
 */
int clone(int (*fn)(void *), void *child_stack,
          int flags, void *arg, ...
          /* pid_t *ptid, struct user_desc *tls, pid_t *ctid */ );

do_fork完成了创建中的大部分工作，它的定义在Linux2.6.34/kernel/fork.c#L1351，代码如下：

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * Parameters:
 *   clone_flags   - CLONE_* bits controlling the behaviour below
 *   stack_start,
 *   regs,
 *   stack_size    - forwarded unchanged to copy_process(); regs is also
 *                   used to tell user-mode callers from kernel_thread
 *   parent_tidptr - user pointer that receives the child's id when
 *                   CLONE_PARENT_SETTID is set
 *   child_tidptr  - forwarded to copy_process()
 *
 * Returns the value of task_pid_vnr() for the new task on success,
 * PTR_ERR() of copy_process()'s result when it fails, or -EINVAL / -EPERM
 * from the early CLONE_NEWUSER checks.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          struct pt_regs *regs,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Do some preliminary argument and permissions checking before we
     * actually start allocating stuff
     */
    if (clone_flags & CLONE_NEWUSER) {
        /* CLONE_NEWUSER cannot be combined with CLONE_THREAD. */
        if (clone_flags & CLONE_THREAD)
            return -EINVAL;
        /* hopefully this check will go away when userns support is
         * complete
         */
        /* All three capabilities are required; missing any one fails. */
        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
                !capable(CAP_SETGID))
            return -EPERM;
    }

    /*
     * We hope to recycle these flags after 2.6.26
     */
    if (unlikely(clone_flags & CLONE_STOPPED)) {
        /* Budget of deprecation warnings; decremented per message printed. */
        static int __read_mostly count = 100;

        if (count > 0 && printk_ratelimit()) {
            char comm[TASK_COMM_LEN];

            count--;
            printk(KERN_INFO "fork(): process `%s' used deprecated "
                    "clone flags 0x%lx\n",
                get_task_comm(comm, current),
                clone_flags & CLONE_STOPPED);
        }
    }

    /*
     * When called from kernel_thread, don't do user tracing stuff.
     * trace therefore stays 0 unless regs indicate user mode.
     */
    if (likely(user_mode(regs)))
        trace = tracehook_prepare_clone(clone_flags);

    /*
     * Note that parent_tidptr is not passed down; it is written here,
     * after copy_process() succeeds.
     */
    p = copy_process(clone_flags, stack_start, regs, stack_size,
             child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        struct completion vfork;

        trace_sched_process_fork(current, p);

        /* Read the id now, while p is still guaranteed to be valid. */
        nr = task_pid_vnr(p);

        /* NOTE(review): the result of put_user() is not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        /*
         * The completion lives on this function's stack and is attached
         * to the child before the child is started, so it is ready by
         * the time wait_for_completion() is reached below.
         */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
        }

        audit_finish_fork(p);
        tracehook_report_clone(regs, clone_flags, nr, p);

        /*
         * We set PF_STARTING at creation in case tracing wants to
         * use this to distinguish a fully live task from one that
         * hasn't gotten to tracehook_report_clone() yet.  Now we
         * clear it and set the child going.
         */
        p->flags &= ~PF_STARTING;

        if (unlikely(clone_flags & CLONE_STOPPED)) {
            /*
             * We'll start up with an immediate SIGSTOP.
             * The child is left in TASK_STOPPED and is not woken here.
             */
            sigaddset(&p->pending.signal, SIGSTOP);
            set_tsk_thread_flag(p, TIF_SIGPENDING);
            __set_task_state(p, TASK_STOPPED);
        } else {
            wake_up_new_task(p, clone_flags);
        }

        tracehook_report_clone_complete(trace, regs,
                        clone_flags, nr, p);

        /*
         * For CLONE_VFORK the caller blocks here until the completion
         * attached to the child above is signalled (per the header
         * comment: until the child has finished using the VM).
         * NOTE(review): the freezer_* calls presumably exclude this
         * task from freezer accounting while it sleeps -- confirm.
         */
        if (clone_flags & CLONE_VFORK) {
            freezer_do_not_count();
            wait_for_completion(&vfork);
            freezer_count();
            tracehook_report_vfork_done(p, nr);
        }
    } else {
        /* copy_process() failed: turn the error pointer into an errno. */
        nr = PTR_ERR(p);
    }
    return nr;
}

do_fork()函数会调用copy_process()函数，然后再让进程开始运行。该函数定义在linux2.6.34/kernel/fork.c#L954.

copy_process()函数完成了以下工作：

调用dup_task_struct()为新进程创建一个内核栈、thread_info结构体和task_struct结构体，这些值与当前进程相同，也即父进程和子进程的进程描述符相同。
检查并确保新创建这个子进程后，当前用户所拥有的进程数未超出限制。
修改进程描述符，主要是统计信息，但是大多数成员依然未变。
子进程的状态被设置为TASK_UNINTERRUPTIBLE，以保证其不会运行。
copy_process()调用copy_flags()以更新task_struct的flags成员，表明进程是否拥有超级用户权限的PF_SUPERPRIV标志被清零，表明进程还未调用exec函数族的PF_FORKNOEXEC标志被设置。
调用alloc_pid()为新进程分配一个有效的PID。
根据传递给clone()的参数标志，copy_process()拷贝或共享打开的文件、文件系统信息、信号处理函数、进程地址空间和命名空间等。
最后，copy_process()做扫尾工作并返回一个指向子进程的指针。

再回到do_fork()函数，若copy_process()函数成功返回，则新创建的子进程被唤醒并投入运行。