一、Linux 内核架构图

在这里插入图片描述

二、进程基础知识

Linux 内核把进程称为任务(task)，进程的虚拟地址空间分为用户虚拟地址空间和内核虚拟地址空间，所有进程共享内核虚拟地址空间，每个进程有独立的用户虚拟地址空间。

进程有两种特殊形式：

没有用户虚拟地址空间的进程称为内核线程。
共享用户虚拟地址空间的进程称为用户线程。
通常在不会引起混淆的情况下，把用户线程简称为线程。共享同一个用户虚拟地址空间的所有用户线程组成一个线程组。

C 标准库进程术语和 Linux 内核进程术语对应关系如下：

C 标准库进程术语	Linux 内核进程术语
包含多个线程的进程	线程组
只有一个线程的进程	进程或任务
线程	共享用户虚拟地址空间的进程

三、Linux 进程四要素

有一段程序供其执行。
有进程专用的系统堆栈空间。
在内核有 task_struct 数据结构。
有独立的存储空间，拥有专有的用户空间。

四、task_struct 数据结构主要成员

(include/linux/sched.h)

/*
 * Excerpt of struct task_struct as quoted from include/linux/sched.h.
 * Only the leading members are reproduced here; the remaining members are
 * omitted at the end of the excerpt.
 */
struct task_struct {
    
    // Process descriptor
#ifdef CONFIG_THREAD_INFO_IN_TASK
	/*
	 * For reasons of header soup (see current_thread_info()), this
	 * must be the first element of task_struct.
	 */
	struct thread_info		thread_info;
#endif
	unsigned int			__state;	// Task state (an integer value, not a pointer)

#ifdef CONFIG_PREEMPT_RT
	/* saved state for "spinlock sleepers" */
	unsigned int			saved_state;
#endif

	/*
	 * This begins the randomizable portion of task_struct. Only
	 * scheduling-critical items should be added above here.
	 */
	randomized_struct_fields_start

	void				*stack;	// Points to the kernel stack
	refcount_t			usage;
	/* Per task flags (PF_*), defined further below: */
	unsigned int			flags;
	unsigned int			ptrace;

       // ... remaining members omitted from this excerpt ...
};

task_struct：进程描述符。
__state：进程状态（无符号整数，保存当前状态值，而不是指针）。
*stack：指向内核栈。
pid：全局的进程号。
tgid：全局的线程组标识符。
*real_parent：指向真实的父进程
*parent：指向当前的父进程。比如一个进程被另外的进程使用系统调用进行跟踪（ptrace），那么此时的父进程就是跟踪进程。
进程调度策略的优先级：prio、static_prio、normal_prio、rt_priority。
nr_cpus_allowed：允许进程运行的处理器数量；具体允许在哪些处理器上执行由 cpus_mask（cpus_ptr）指定。
*mm：指向内存描述符，内核线程此项为 NULL。
*active_mm：指向内存描述符，内核线程运行时从进程借用。
*fs：文件系统信息。

还有很多成员，这里就不一一列举。

五、创建新进程分析

在 Linux 内核中，新进程是从一个已经存在的进程复制出来的，内核使用静态数据结构造出 0 号内核线程，0 号内核线程分叉生成 1 号内核线程和 2 号内核线程（kthreadd 线程）。1 号内核线程完成初始化以后装载用户程序，变成 1 号进程，其他进程都是 1 号进程或者它的子孙进程分叉生成的；其他内核线程是 kthreadd 线程分叉生成的。

Linux 提供 3 个系统调用来创建新的进程：

fork(分叉)：子进程是父进程的一个副本，采用写时复制技术。
vfork：用于创建子进程，之后子进程立即调用 execve 以装载新程序的情况，为了避免复制物理页，父进程会睡眠等待子进程装载新程序。现在 fork 采用了写时复制技术，vfork 失去了速度优势，已经被废弃。
clone（克隆）：可以精确地控制子进程和父进程共享哪些资源。这个系统调用的主要用处是可供 pthread 库用来创建线程。

clone 是功能最齐全的函数，参数多、使用复杂，fork 是 clone 的简化函数。
（kernel/fork.c）

/*
 * fork() system call entry point. Compiled only when the architecture
 * defines __ARCH_WANT_SYS_FORK.
 *
 * With CONFIG_MMU: fills a kernel_clone_args with exit_signal = SIGCHLD
 * and no other fields set, then returns whatever _do_fork() returns.
 * Without CONFIG_MMU: always returns -EINVAL.
 */
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
    
    
#ifdef CONFIG_MMU
	struct kernel_clone_args args = {
    
    
		.exit_signal = SIGCHLD,
	};

	return _do_fork(&args);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

/*
 * vfork() system call entry point. Compiled only when the architecture
 * defines __ARCH_WANT_SYS_VFORK.
 *
 * Requests the new task with flags CLONE_VFORK | CLONE_VM and
 * exit_signal = SIGCHLD, then returns whatever _do_fork() returns.
 */
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
    
    
	struct kernel_clone_args args = {
    
    
		.flags		= CLONE_VFORK | CLONE_VM,
		.exit_signal	= SIGCHLD,
	};

	return _do_fork(&args);
}
#endif


/*
 * clone() system call entry point. Compiled only when the architecture
 * defines __ARCH_WANT_SYS_CLONE.
 *
 * The argument list has four variants selected at build time:
 *   - CONFIG_CLONE_BACKWARDS:  tls precedes child_tidptr.
 *   - CONFIG_CLONE_BACKWARDS2: newsp precedes clone_flags.
 *   - CONFIG_CLONE_BACKWARDS3: six arguments, with an extra stack_size
 *     after newsp (not referenced by the body below).
 *   - default: clone_flags, newsp, parent_tidptr, child_tidptr, tls.
 * All variants share the same body.
 *
 * Returns -EINVAL when legacy_clone_args_valid() rejects the arguments,
 * otherwise the result of _do_fork().
 */
#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 unsigned long, tls,
		 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
		int, stack_size,
		int __user *, parent_tidptr,
		int __user *, child_tidptr,
		unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
		 int __user *, parent_tidptr,
		 int __user *, child_tidptr,
		 unsigned long, tls)
#endif
{
    
    
	/*
	 * Split the low 32 bits of clone_flags: the bits under the CSIGNAL
	 * mask become exit_signal, the remaining bits become flags.
	 * parent_tidptr is stored in both pidfd and parent_tid.
	 */
	struct kernel_clone_args args = {
    
    
		.flags		= (lower_32_bits(clone_flags) & ~CSIGNAL),
		.pidfd		= parent_tidptr,
		.child_tid	= child_tidptr,
		.parent_tid	= parent_tidptr,
		.exit_signal	= (lower_32_bits(clone_flags) & CSIGNAL),
		.stack		= newsp,
		.tls		= tls,
	};

	/* Reject argument combinations the legacy interface does not accept. */
	if (!legacy_clone_args_valid(&args))
		return -EINVAL;

	return _do_fork(&args);
}
#endif

Linux 内核使用 SYSCALL_DEFINE 系列宏这种独特的方式来定义系统调用，这里以 fork 等系统调用为例：创建新进程的 3 个系统调用都定义在文件 kernel/fork.c 中，它们把工作委托给函数 _do_fork（从 5.10 版本开始，更名为 kernel_clone）。具体源码分析如下：

/*
 * _do_fork - common entry that the fork/vfork/clone system calls delegate to.
 * @args: creation parameters (flags, exit_signal, stack, tls, tid pointers)
 *        filled in by the caller.
 *
 * Returns a long. NOTE(review): the body is elided in this excerpt, so the
 * meaning of the return value and the use of the locals below cannot be
 * confirmed from here.
 */
long _do_fork(struct kernel_clone_args *args)
{
    
    
	u64 clone_flags = args->flags;	/* local copy of the requested clone flags */
	struct completion vfork;
	struct pid *pid;
	struct task_struct *p;
	int trace = 0;
	long nr;

// ... remainder of the function omitted from this excerpt ...

}

Linux 内核函数_do_fork()执行流程如下图所示：
在这里插入图片描述
具体核心处理函数为 copy_process()内核源码如下：

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * Parameters: @pid, @trace, @node and @args (the kernel_clone_args whose
 * flags field seeds the local clone_flags). Returns a pointer to a
 * struct task_struct.
 * NOTE(review): the body is elided in this excerpt; the exact role of
 * @pid, @trace and @node and the error-return convention should be
 * confirmed against the full source.
 */
static __latent_entropy struct task_struct *copy_process(
					struct pid *pid,
					int trace,
					int node,
					struct kernel_clone_args *args)
{
    
    
	int pidfd = -1, retval;		/* pidfd starts at -1; retval is set later */
	struct task_struct *p;
	struct multiprocess_signals delayed;
	struct file *pidfile = NULL;
	u64 clone_flags = args->flags;	/* local copy of the requested clone flags */
	struct nsproxy *nsp = current->nsproxy;	/* nsproxy of the calling task */

// ... remainder of the function omitted from this excerpt ...

}