13 虚拟文件系统
虚拟文件系统(VFS)作为内核的子系统,为用户空间程序提供了文件和文件系统相关的接口。通过VFS,程序可以利用标准的Unix系统调用对不同的文件系统,甚至不同介质上的文件系统进行读写操作。
13.1 通用文件系统接口
open()、read()、write()。
13.2 文件系统抽象层
VFS抽象层之所以能衔接各种各样的文件系统,是因为它定义了所有文件系统都支持的、基本的、概念上的接口和数据结构。同时实际文件系统也将自身的诸如“如何打开文件”,“目录是什么”等概念在形式上与VFS的定义保持一致。
13.3 Unix文件系统
四个概念:文件、目录项、索引节点和安装点。File、dentry、inode、mount point
13.4 VFS对象及其数据结构
VFS中有四个主要的对象类型,它们分别是:
- 超级块对象(super_block),代表一个具体的已安装文件系统。
- 索引节点对象(inode),代表一个具体文件。
- 目录项对象(dentry),代表一个目录项,是路径的一个组成部分。
- 文件对象(file),代表由进程打开的文件。
每个对象中都包含一个操作对象,这些操作对象描述了内核针对主要对象可以使用的方法:
- super_operations,其中包括内核针对特定文件系统所能调用的方法,比如write_inode()和sync_fs()等。
- inode_operations,其中包括内核针对特定文件所能调用的方法,比如create()和link()等。
- dentry_operations,其中包括内核针对特定目录项所能调用的方法,比如d_compare()和d_delete()等。
- file_operations,其中包括进程针对已打开文件所能调用的方法,比如read()和write()等。
还有很多其他对象,如:
file_system_type:表示每个注册的文件系统,描述文件系统及其功能特性。
vfsmount:表示每一个安装点,包含安装点的相关信息,如位置和安装标志等。
两个与进程相关的结构体:
fs_struct:
file:
13.5 超级块对象(super_block)
对应于存放在磁盘特定扇区中的文件系统超级块。对于非基于磁盘的文件系统(如基于内存的文件系统,比如sysfs),它们会在使用现场创建超级块并将其保存到内存中。
struct super_block { struct list_head s_list; 指向所有超级块链表/* Keep this first */ dev_t s_dev; 设备标示符/* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler **s_xattr;
struct list_head s_inodes; /* all inodes */ struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */
/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; struct list_head s_inode_lru; /* unused inode lru */ int s_nr_inodes_unused; /* # of inodes on lru */
struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; struct quota_info s_dquot; /* Diskquota specific options */
struct sb_writers s_writers;
char s_id[32]; /* Informational name */ u8 s_uuid[16]; /* UUID */
void *s_fs_info; /* Filesystem private info */ unsigned int s_max_links; fmode_t s_mode;
/* Granularity of c/m/atime in ns. Cannot be worse than a second */ u32 s_time_gran;
/* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */
/* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ char *s_subtype;
/* * Saved mount options for lazy filesystems using * generic_show_options() */ char __rcu *s_options; const struct dentry_operations *s_d_op; /* default d_op for dentries */
/* * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count;
/* Being remounted read-only */ int s_readonly_remount;
/* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; } |
超级块对象通过alloc_super()函数创建并初始化。在文件系统安装时,文件系统会调用alloc_super()以便从磁盘读取文件系统超级块,并且将其信息填充到内存中的超级块对象中。
13.6 超级块操作
超级块对象中最重要的一个域是s_op,它指向超级块的操作函数表。
struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb);创建和初始化一个新的inode void (*destroy_inode)(struct inode *); 释放给定的inode
void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc);将给定的索引节点写入磁盘 int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *);卸载文件系统时用来释放超级块 int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_fs) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); int (*nr_cached_objects)(struct super_block *); void (*free_cached_objects)(struct super_block *, int); } |
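超级块操作函数执行文件系统和索引节点的底层操作。当文件系统需要对其超级块执行操作时,首先要在超级块对象中寻找需要的操作方法,比如一个文件系统要写自己的超级块,需要调用:sb->s_op->write_super(sb)。(注意:write_super()是较早内核版本中的方法,上面列出的super_operations结构体中并不包含它;对于上面列出的版本,调用形式相同,例如同步文件系统时调用sb->s_op->sync_fs(sb, wait)。)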
13.7 索引节点对象
索引节点对象包含了内核在操作文件或目录时需要的全部信息。对于Unix风格的文件系统来说,这些信息可以从磁盘索引节点直接读入。
struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif
const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping;
#ifdef CONFIG_SECURITY void *i_security; #endif
/* Stat data, not accessed from path walking */ unsigned long i_ino; 节点号 /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; 设备号 loff_t i_size; struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif
/* Misc */ unsigned long i_state; struct mutex i_mutex;
unsigned long dirtied_when; /* jiffies of first dirtying */
struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; u64 i_version; atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; struct address_space i_data; #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif struct list_head i_devices; union { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; 字符设备驱动 };
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ struct hlist_head i_fsnotify_marks; #endif
#ifdef CONFIG_IMA atomic_t i_readcount; /* struct files open RO */ #endif void *i_private; /* fs or device private pointer */ } |
一个索引节点代表文件系统中的一个文件(也可以是设备或管道这样的特殊文件),索引节点仅当文件被访问时,才在内存中创建。
13.8 索引节点操作
描述了VFS用以操作索引节点对象的所有方法。
struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); void * (*follow_link) (struct dentry *, struct nameidata *); int (*permission) (struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *);
int (*create) (struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *);创建硬连接 int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *);创建符号连接 int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);被系统调用mknod()调用,创建特殊文件 int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*removexattr) (struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } |
13.9 目录项对象
struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
/* Ref lookup also touches following */ unsigned int d_count; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */
struct list_head d_lru; /* LRU list */ struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct rcu_head d_rcu; } d_u; } |
与前面的两个对象不同,目录项对象没有对应的磁盘数据结构,VFS根据字符串形式的路径名现场创建它。而且由于目录项对象并非真正保存在磁盘上,所以目录项结构体没有是否被修改的标志。
13.9.1 目录项状态
三种状态:被使用、未被使用和负状态。
一个被使用的目录项对应一个有效的inode(d_inode),而且d_count为正值。
13.9.2 目录项缓存dcache
内核将目录项对象缓存在目录项缓存中。
13.10 目录项操作
struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, const struct inode *, struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool); } ____cacheline_aligned; |
13.11 文件对象
文件对象表示进程已打开的文件。struct file由相应的open()系统调用创建,由close()系统调用撤销。因为多个进程可以同时打开和操作同一个文件,所以同一个文件也可能存在多个对应的文件对象。文件对象仅仅在进程观点上代表已打开的文件,它反过来指向目录项对象(目录项对象反过来指向索引节点),其实只有目录项对象才表示已打开的实际文件。虽然一个文件对应的文件对象不是唯一的,但对应的索引节点和目录项对象无疑是唯一的。
file->f_path
dentry
d_inode
struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; #define f_dentry f_path.dentry struct inode *f_inode; /* cached value */ const struct file_operations *f_op;
/* * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. * Must not be taken from IRQ context. */ spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra;
u64 f_version; #ifdef CONFIG_SECURITY void *f_security; #endif /* needed for tty driver, and maybe others */ void *private_data;
#ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT unsigned long f_mnt_write_state; #endif } |
13.12 文件操作
struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir) (struct file *, void *, filldir_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); int (*show_fdinfo)(struct seq_file *m, struct file *f); } |
具体的文件系统可以为每一种操作做专门的实现,或者如果存在通用操作,也可以使用通用操作。
13.13 和文件系统相关的数据结构
file_system_type --->get_sb() |
描述各种特定文件系统类型,如ext3、ext4。每种文件系统,不管有多少个实例安装到系统中,还是根本就没有安装到系统中,都只有一个file_system_type结构。 |
vfsmount |
描述一个安装文件系统的实例,代表一个安装点 |
13.14 和进程相关的数据结构
有三个数据结构将VFS层和系统的进程紧密联系在一起,分别是files_struct,fs_struct,和namespace。
files_struct:task_struct->files。所有与单个进程相关的信息(如打开的文件及文件描述符)都包含在其中。
/* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT];指向已打开的文件对象,NR_OPEN_DEFAULT为64,如果一个进程打开的文件对象超过64个,内核将分配一个新数组 }; |
fs_struct:task_struct->fs。它包含文件系统和进程相关的信息。该结构包含了当前进程工作目录(pwd)和根目录。
struct fs_struct { int users;用户数目 spinlock_t lock; seqcount_t seq; int umask;掩码 int in_exec;当前正在执行的文件 struct path root, pwd;根目录路径、当前路径 };
|
namespace:.......