Linux文件系统

Linux文件系统（三）虚拟文件系统

文章目录

Linux文件系统（三）虚拟文件系统

一、文件系统框架
二、挂载文件系统
三、打开文件
四、总结

一、文件系统框架

文件系统需要许多层的组件一起协作，具体怎么协作，下面来看一看

在这里插入图片描述

在应用层，进程操作文件可以通过 open、read、write 等系统调用
在内核，每个进程都需要对其打开的文件，维护一定的数据结构
在内核，整个系统打开的文件，也需要维护一定的数据结构
Linux可以支持多达数十种文件系统格式，每种文件系统的实现各不相同，为了统一操作，Linux提供了虚拟文件系统这一接口。它提供了常见的文件系统对象模型，例如 inode、directory entry、mount等，以及这些对象的操作方法，例如 inode operations、directory operations、file operations 等
然后对接的是真正的文件系统，例如 ext4
为了读取 ext4 文件系统，要通过块设备 I/O 层，也即 BIO 层。这是文件系统层和块设备驱动层的接口
为了加快读写速度，还有一个缓存层
最下面的就是块设备驱动程序了

通过系统调用分析内核架构是一种很好的方式，下面将分析这两个系统调用

mount：挂载文件系统
open：打开文件，如果文件不存在并且指定了 O_CREAT，那么就创建一个文件

二、挂载文件系统

想要操作文件系统，第一步就是挂载文件系统

内核是否支持某种文件系统，要看内核是否已经注册了这种文件系统。例如 ext4 文件系统，就需要通过 register_filesystem 进行注册，传入的参数是 ext4_fs_type，表示注册的是 ext4 文件系统类型。这里面有一个重要的成员 mount，它指向函数 ext4_mount，请记住它，后面会再谈到

register_filesystem(&ext4_fs_type);


static struct file_system_type ext4_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ext4",
	.mount		= ext4_mount,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};

挂载文件系统通过系统调用 sys_mount 来操作，定义如下

SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data)
{
......
	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
......
}

接下来的调用链为：do_mount -> do_new_mount -> vfs_kern_mount

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
......
	mnt = alloc_vfsmnt(name);
......
	root = mount_fs(type, flags, name, data);
......
	mnt->mnt.mnt_root = root;
	mnt->mnt.mnt_sb = root->d_sb;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	mnt->mnt_parent = mnt;
	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
	return &mnt->mnt;
}

首先创建一个 struct mount 结构，每一个被挂载的文件系统在内核中都对应这样一个结构，定义如下

struct mount {
	struct hlist_node mnt_hash;
	struct mount *mnt_parent;
	struct dentry *mnt_mountpoint;
	struct vfsmount mnt;
	union {
		struct rcu_head mnt_rcu;
		struct llist_node mnt_llist;
	};
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
......
} __randomize_layout;


struct vfsmount {
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	int mnt_flags;
} __randomize_layout;

mnt_parent：挂载点所在的文件系统对应的 struct mount
mnt_mountpoint：挂载点在父文件系统的 dentry

struct dentry 表示目录项对象，并与对应的目录或文件的 inode 关联
mnt_root：当前文件系统根目录的dentry
mnt_sb：指向超级块

接下来调用 mount_fs 挂载文件系统

struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
	struct dentry *root;
	struct super_block *sb;
......
	root = type->mount(type, flags, name, data);
......
	sb = root->d_sb;
......
}

这里调用了 type->mount，如果是ext4，那么就对应上面提到的 ext4_mount，通过这个函数，从文件系统中获得超级块。在文件系统的实现中，每个在磁盘中的结构，在内存中也对应相同形式的结构，当所有的数据结构读到内存中，内核就可以通过操作这些数据结构，来操作文件系统了

下面举一个例子，来解析 mount 之后，各数据结构之间的关系

假设根文件系统下面有一个目录 home，然后将另一个文件系统A挂载到根文件系统的 home 目录下面。在文件系统A的根目录下面，有一个文件夹 hello，所以就有了 /home/hello 目录。然后将文件系统B挂载到 /home/hello 目录下，而文件系统B根目录下面有一个文件夹 work，work 下面有一个文件夹 data，所以就有了目录 /home/hello/work/data

为了维护这些关系，内核创建并维护了这些数据结构，如下图

在这里插入图片描述

黄色部分为 struct mount，每个被挂载的文件系统在内核中都对应一个 struct mount

绿色部分为 struct file，每个打开的文件都有一个 struct file，里面有两个变量，一个指向对应的 struct mount，一个指向对应的 struct dentry

红色部分为 struct dentry，叫做目录项对象，每个目录或者文件都对应一个 struct dentry，用于与 inode 相关联

三、打开文件

打开文件通过系统调用 sys_open，定义如下

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
......
	return do_sys_open(AT_FDCWD, filename, flags, mode);
}


long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
......
	fd = get_unused_fd_flags(flags);
	if (fd >= 0) {
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fsnotify_open(f);
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}

要打开一个文件，首先要通过 get_unused_fd_flags 获取一个未被使用的文件描述符，如何获取未被使用的文件描述符呢？

每个进程 task_struct 都有一个 files_struct

struct files_struct		*files;

files_struct 里面有一个文件描述符数组

struct files_struct {
......
	struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

每打开一个文件，就会在这个数组中找到第一个未被使用的项，下标就是文件描述符

对于一个进程，默认的文件描述符0就是 stdin，文件描述符1就是 stdout，文件描述符2就是 stderr

文件描述符数组中的每一项都是一个 struct file 指针，也就是说，每一个打开的文件都对应一个 struct file

do_sys_open 中调用 do_filp_open，就是创建这个 struct file 结构，然后 fd_install(fd, f) 是将文件描述符和这个结构体关联起来

struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
......
	set_nameidata(&nd, dfd, pathname);
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
......
	restore_nameidata();
	return filp;
}

do_filp_open 首先调用 set_nameidata 初始化 struct nameidata。我们知道文件路径是由一串路径名组成的，需要逐级解析，而 struct nameidata 就是在解析和查找路径的过程中起辅助作用的结构

在 struct nameidata 中有一个关键的成员变量 struct path

struct path {
	struct vfsmount *mnt;
	struct dentry *dentry;
} __randomize_layout;

其中的 struct vfsmount 与文件系统的挂载有关。另一个是 struct dentry，可以用来表示目录和文件，建立文件名和 inode 之间的关联

接下来是 path_openat，主要做以下几件事

static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
......
	file = get_empty_filp();
......
	s = path_init(nd, flags);
......
	while (!(error = link_path_walk(s, nd)) &&
		(error = do_last(nd, file, op, &opened)) > 0) {
......
	}
	terminate_walk(nd);
......
	return file;
}

get_empty_filp：生成一个 struct file 结构
path_init：初始化 nameidata，准备开始节点路径查找
link_path_walk：对于路径名逐层进行文件节点的查找
do_last：获取文件对应的 inode 对象，并且初始化 file 对象

例如，文件 /root/hello/work/data，link_path_walk 会解析前面的路径部分 /root/hello/work，解析完毕后，nameidata 的 dentry 就是路径最后一部分的上一级目录对应的 dentry，这里是 work，而 nameidata 的 last 成员保存的是路径的最后一部分（即文件名）data。解析完毕之后，交给 do_last 做下一步处理

do_last 的定义如下

static int do_last(struct nameidata *nd,
		   struct file *file, const struct open_flags *op,
		   int *opened)
{
......
	error = lookup_fast(nd, &path, &inode, &seq);
......
    error = lookup_open(nd, &path, file, op, got_write, opened);
......
	error = vfs_open(&nd->path, file, current_cred());
......
}

在这里面，我们需要先查找文件路径最后一部分对应的 dentry，如何查找呢？

Linux 为了提高目录项对象的处理效率，实现了目录项高速缓存 dentry cache，简称 dcache，它主要由两个数据结构组成

哈希表 dentry_hashtable：dcache 中的所有 dentry 对象都通过 d_hash 指针链到相应的 dentry 哈希表中
未使用的 dentry 对象链表 s_dentry_lru：dentry 对象通过其 d_lru 指针链入 LRU 链表中。LRU 的意思是最近最少使用

这两个列表会产生复杂的关系：

引用次数为0：一个散列表的 dentry 没有被引用，就会加到LRU链表中
再次被引用：一个在LRU链表中的 dentry 再次被引用了，则从LRU链表中移除
分配：当 dentry 在散列表中没有找到，则从 Slub 分配器中分配
过期归还：LRU链表中最长时间没有被使用的 dentry 应该释放回 SLUB 分配器
文件删除：文件被删除了，相应的 dentry 应该释放回 Slub 分配器
结构复用：当需要分配一个 dentry，但是无法分配新的，就从LRU链表中取出一个来复用

所以，do_last 在查找 dentry 的时候，会先从缓存中查找，调用的是 lookup_fast

如果缓存中没有找到，那么就需要到文件系统中查找，通过 lookup_open 实现，定义如下

static int lookup_open(struct nameidata *nd, struct path *path,
			struct file *file,
			const struct open_flags *op,
			bool got_write, int *opened)
{
    ......
    dentry = d_alloc_parallel(dir, &nd->last, &wq);
    ......
    struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
    ......
    path->dentry = dentry;
	path->mnt = nd->path.mnt;
}




const struct inode_operations ext4_dir_inode_operations = {
	.create		= ext4_create,
	.lookup		= ext4_lookup,
...

lookup_open 会创建一个新的 dentry，然后调用上一级目录的 inode 的 inode_operations 的 lookup 函数，对于 ext4 文件系统来说，调用的是 ext4_lookup，会到文件系统中去寻找 inode，找到后将新生成的 dentry 赋给 path 变量

do_last 最后一步调用的是 vfs_open

int vfs_open(const struct path *path, struct file *file,
	     const struct cred *cred)
{
	struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
......
	file->f_path = *path;
	return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}


static int do_dentry_open(struct file *f,
			  struct inode *inode,
			  int (*open)(struct inode *, struct file *),
			  const struct cred *cred)
{
......
	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
				FMODE_PREAD | FMODE_PWRITE;
	path_get(&f->f_path);
	f->f_inode = inode;
	f->f_mapping = inode->i_mapping;
......
	f->f_op = fops_get(inode->i_fop);
......
	open = f->f_op->open;
......
	error = open(inode, f);
......
	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
	return 0;
......
}


const struct file_operations ext4_file_operations = {
......
	.open		= ext4_file_open,
......
};

vfs_open 最重要的一件事就是，调用 f_op->open，也就是 ext4_file_open。另外还有一件重要的事情就是将打开文件的所有信息填到 struct file 结构中

struct file {
	union {
		struct llist_node	fu_llist;
		struct rcu_head 	fu_rcuhead;
	} f_u;
	struct path		f_path;
	struct inode		*f_inode;	/* cached value */
	const struct file_operations	*f_op;
	spinlock_t		f_lock;
	enum rw_hint		f_write_hint;
	atomic_long_t		f_count;
	unsigned int 		f_flags;
	fmode_t			f_mode;
	struct mutex		f_pos_lock;
	loff_t			f_pos;
	struct fown_struct	f_owner;
	const struct cred	*f_cred;
......
	struct address_space	*f_mapping;
	errseq_t		f_wb_err;
}