linux伪文件系统proc

这次的篇幅稍短一些。
伪文件系统存在于内存中，通常不占用硬盘空间，它以文件的形式，向用户提供了访问系统内核数据的接口。用户和应用程序可以通过访问这些数据接口，得到系统的信息，而且内核允许用户修改内核的某些参数。由于这些文件系统没有写盘，也叫作无持久存储文件系统。
这里介绍一个常见的伪文件系统proc。
proc文件系统(process filesystem)，它使得内核可以生成与系统状态和配置有关的信息，该信息可以由用户和系统程序从普通文件读取，而无需专门的工具与内核通信。在某些情况下，一个简单的cat命令就足够了。数据不仅可以从内核读取，还可以通过向proc文件系统的文件写入字符串，来向内核发送数据。

使用方法

在/proc目录下
在这里插入图片描述
这些以数字命名的目录，每一个都对应一个进程，目录名就是该进程的pid，里面包含了该进程的信息

# N is the PID of the process of interest
cat /proc/N/stat  # show the process's status
cat /proc/N/statm  # show the status of the memory used by the process
cat /proc/N/status  # show process status information; more readable than stat/statm
1
2
3
4

除此之外，还有一些系统信息

cat /proc/interrupts  # per-IRQ interrupt counts (softirq counts live in /proc/softirqs)
cat /proc/kallsyms  # kernel symbol table: all symbols known to the kernel, not only system calls
ls /proc/net  # /proc/net is a directory of networking pseudo-files, so list it instead of cat
cat /proc/scsi/scsi  # SCSI devices known to the kernel
cat /proc/tty/drivers  # tty drivers currently registered
cat /proc/net/dev  # network interfaces and their traffic statistics
cat /proc/vmstat  # virtual memory statistics
1
2
3
4
5
6
7

举一个完整的例子，查看并修改系统可以分配的文件句柄的最大数目。

cat /proc/sys/fs/file-max
>4096
echo 8192 > /proc/sys/fs/file-max
cat /proc/sys/fs/file-max
>8192  
1
2
3
4
5

可以看出，这些系统设置也可以通过修改/proc中相应文件来设置。

代码实现

数据结构

与文件系统相关的数据结构

实现一个文件系统，主要扣住七个结构体。

file_system_type：mount -t proc proc /mnt中第一个proc就是它的名字。把文件系统加入内核，首先要准备的就是这个结构体，注册方法是register_filesystem()。
super_block：每mount一次就有一个对应的超级块，是文件系统的根，很多东西都依赖它。
inode/inode_operations与file/file_operations：这一下就是4个结构体了。需要区分一下file与inode：file更倾向于文件里存储的内容，inode更强调的是在磁盘中存储的元数据。可以理解为file就是用户能看到的文件，inode是元数据，并且inode结构体本身就含有struct file_operations *指针。
dentry：关于路径的结构体。

由于之前没有贴过inode_operations和file_operations的代码，这里贴一下，也方便对比。

/*
 * Table of callbacks implementing the operations that can be performed on
 * a file (struct file): seeking, reading, writing, directory iteration,
 * polling, ioctl, mmap, open/flush/release, syncing, locking, splicing and
 * range copy/clone/dedupe. Every member except @owner is a function pointer.
 */
struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	unsigned int (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
	void (*show_fdinfo)(struct seq_file *m, struct file *f);
	/* Only present when the kernel is built without CONFIG_MMU. */
#ifndef CONFIG_MMU
	unsigned (*mmap_capabilities)(struct file *);
#endif
	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
			loff_t, size_t, unsigned int);
	int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
			u64);
	ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
			u64);
};

/*
 * Table of callbacks that act on an inode or on directory entries rather
 * than on file contents: name lookup, symlink resolution, permission and
 * ACL handling, creating/linking/removing/renaming entries, and getting or
 * setting attributes. Every member is a function pointer.
 */
struct inode_operations {
	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
	int (*permission) (struct inode *, int);
	struct posix_acl * (*get_acl)(struct inode *, int);

	int (*readlink) (struct dentry *, char __user *,int);

	/* Creation, removal and renaming of directory entries. */
	int (*create) (struct inode *,struct dentry *, umode_t, bool);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct inode *,struct dentry *,const char *);
	int (*mkdir) (struct inode *,struct dentry *,umode_t);
	int (*rmdir) (struct inode *,struct dentry *);
	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *, unsigned int);
	/* Attribute access. */
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
	int (*update_time)(struct inode *, struct timespec *, int);
	int (*atomic_open)(struct inode *, struct dentry *,
			   struct file *, unsigned open_flag,
			   umode_t create_mode, int *opened);
	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
	int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

可以看出，file_operations针对的是read、write等对于文件内容本身的操作，而inode_operations针对的是对文件属性的操作。

proc数据结构

再看两个在proc中出现的数据结构。
proc文件系统中的每个数据项都由proc_dir_entry 的一个实例描述

/*
 * This is not completely implemented yet. The idea is to
 * create an in-memory tree (like the actual /proc filesystem
 * tree) of these proc_dir_entries, so that we can dynamically
 * add new files to /proc.
 *
 * parent/subdir are used for the directory structure (every /proc file has a
 * parent, but "subdir" is empty for all non-directory entries).
 * subdir_node is used to build the rb tree "subdir" of the parent.
 *
 * The name is stored inline as a flexible array member at the end of the
 * structure, with its length kept in namelen.
 */
struct proc_dir_entry {
	unsigned int low_ino;
	umode_t mode;  // file access permissions
	nlink_t nlink;  // number of subdirectories and symbolic links in a directory
	kuid_t uid;
	kgid_t gid;
	loff_t size;  // length of the file data
	const struct inode_operations *proc_iops;  // inode operations
	const struct file_operations *proc_fops;  // file operations
	struct proc_dir_entry *parent;  // pointer to the parent directory's entry
	struct rb_root subdir;  // rb tree of child entries; empty for non-directories
	struct rb_node subdir_node;  // this entry's node in the parent's subdir tree
	void *data;
	atomic_t count;		/* use count */
	atomic_t in_use;	/* number of callers into module in progress; */
			/* negative -> it's going away RSN */
	struct completion *pde_unload_completion;
	struct list_head pde_openers;	/* who did ->open, but not ->release */
	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
	u8 namelen;  // length of the file name
	char name[];  // the file name string
};
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

内核提供了一个数据结构，称之为 proc_inode ，支持以面向 inode 的方式来查看 proc 文件系统的数据项。

// Associates procfs-specific data with the VFS-layer inode, which is
// embedded in this structure as the vfs_inode member.
struct proc_inode {
	struct pid *pid;
	unsigned int fd;
	union proc_op op;
	struct proc_dir_entry *pde;  // pointer to the proc_dir_entry instance associated with this proc entry
	struct ctl_table_header *sysctl;
	struct ctl_table *sysctl_entry;
	struct hlist_node sysctl_inodes;
	const struct proc_ns_operations *ns_ops;
	struct inode vfs_inode;
} ;
1
2
3
4
5
6
7
8
9
10
11
12

其中，联合体proc_op

/*
 * Callback stored in struct proc_inode. The two function pointers share
 * storage, so only one of them is valid for a given inode.
 * NOTE(review): which member applies to which kind of entry is not visible
 * in this excerpt -- confirm against the users of proc_inode.op.
 */
union proc_op {
	int (*proc_get_link)(struct dentry *, struct path *);
	int (*proc_show)(struct seq_file *m,
		struct pid_namespace *ns, struct pid *pid,
		struct task_struct *task);
};
1
2
3
4
5
6

执行流程

初始化流程

linux启动，会调用start_kernel，该函数会执行proc_root_init()函数。

/*
 * Init-time setup of procfs: prepares the proc inode cache, registers the
 * "proc" filesystem type and then creates the fixed top-level entries.
 * If register_filesystem() fails the function returns silently and none of
 * the entries below are created.
 */
void __init proc_root_init(void)
{
	int err;

	proc_init_inodecache();  // set up the inode cache used by proc
	set_proc_pid_nlink();
	err = register_filesystem(&proc_fs_type);  // register the proc filesystem with the kernel
	if (err)
		return;

	proc_self_init();  // allocate an inode number for "self"
	proc_thread_self_init();  // set up /proc/thread-self, which holds information about the current thread
	proc_symlink("mounts", NULL, "self/mounts");  // "mounts" is a symlink to "self/mounts" (the caller's mount points)

	proc_net_init();

    // create the directory entries of the proc filesystem
#ifdef CONFIG_SYSVIPC
	proc_mkdir("sysvipc", NULL);
#endif
	proc_mkdir("fs", NULL);
	proc_mkdir("driver", NULL);
	proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
	/* just give it a mountpoint */
	proc_create_mount_point("openprom");
#endif
	proc_tty_init();
	proc_mkdir("bus", NULL);
	proc_sys_init();  // create the /proc/sys directory and initialise sysctl
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31

在register_filesystem函数中，遍历的链表头是全局变量。参数proc_fs_type结构体

/*
 * Filesystem type descriptor for procfs. The name "proc" is what
 * "mount -t proc" refers to; .mount and .kill_sb are the callbacks used to
 * set up and tear down a mounted instance.
 */
static struct file_system_type proc_fs_type = {
	.name		= "proc",
	.mount		= proc_mount,
	.kill_sb	= proc_kill_sb,  // effectively the umount operation
	.fs_flags	= FS_USERNS_MOUNT,
};
1
2
3
4
5
6

其中proc_mount()函数

static struct dentry *proc_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	struct pid_namespace *ns;

	if (flags & MS_KERNMOUNT) {
		ns = data;
		data = NULL;
	} else {
		ns = task_active_pid_ns(current);
	}

    //内核实现了很多种mount方法，这是其中一个
	return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_uper);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

该函数主要是调用proc_fill_super()函数填充super_block。

/*
 * Fill in super_block @s for a procfs mount.
 *
 * Parses the mount options in @data, sets the superblock flags, block size,
 * magic number and operations, builds the root inode and root dentry from
 * proc_root, and finally sets up the "self" and "thread-self" entries.
 *
 * Returns -EINVAL if the mount options are rejected, -ENOMEM if the root
 * inode or root dentry cannot be obtained, the error from proc_setup_self()
 * if that fails, and otherwise the result of proc_setup_thread_self().
 * @silent is not used in this function.
 */
int proc_fill_super(struct super_block *s, void *data, int silent)
{
	/* The pid namespace for this mount is taken from s->s_fs_info. */
	struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
	struct inode *root_inode;
	int ret;

	if (!proc_parse_options(data, ns))
		return -EINVAL;

	/* User space would break if executables or devices appear on proc */
	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
	/* 1024-byte blocks: s_blocksize_bits is log2 of s_blocksize. */
	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;
	s->s_magic = PROC_SUPER_MAGIC;
	s->s_op = &proc_sops;
	s->s_time_gran = 1;

	/*
	 * procfs isn't actually a stacking filesystem; however, there is
	 * too much magic going on inside it to permit stacking things on
	 * top of it
	 */
	s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
	
	/* Take a reference on proc_root before building the root inode from it. */
	pde_get(&proc_root);
	root_inode = proc_get_inode(s, &proc_root);
	if (!root_inode) {
		pr_err("proc_fill_super: get root inode failed\n");
		return -ENOMEM;
	}

	s->s_root = d_make_root(root_inode);
	if (!s->s_root) {
		pr_err("proc_fill_super: allocate dentry failed\n");
		return -ENOMEM;
	}

	ret = proc_setup_self(s);
	if (ret) {
		return ret;
	}
	return proc_setup_thread_self(s);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

插个话，这里super_block是在挂载时产生的，而不是在insmod时产生的，为什么呢？因为多个设备可能使用同一种文件系统。mount有可能执行多次，这样就会有多个super_block，如果在insmod时产生，就只有一个。可以这么理解：super_block之于文件系统，就像成员变量之于结构体——每个实例（每次挂载）各有一份，而不是所有实例共享的静态变量。
这个函数是在挂载时用到的。

挂载流程

当执行mount命令时，会执行系统调用

/*
 * mount system call entry point.
 *
 * Copies the filesystem type, device name and mount options from user
 * space, hands them to do_mount(), and frees the copies again. @dir_name is
 * passed to do_mount() as a user-space pointer without being copied here.
 *
 * The cleanup labels are laid out in reverse order of acquisition, so each
 * failure path frees only what had been copied before it.
 *
 * Returns the result of do_mount(), or the error from whichever copy failed.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
		char __user *, type, unsigned long, flags, void __user *, data)
{
	int ret;
	char *kernel_type;
	char *kernel_dev;
	void *options;

	kernel_type = copy_mount_string(type);
	/* ret is set before the check so the error path can jump straight out. */
	ret = PTR_ERR(kernel_type);
	if (IS_ERR(kernel_type))
		goto out_type;

	kernel_dev = copy_mount_string(dev_name);
	ret = PTR_ERR(kernel_dev);
	if (IS_ERR(kernel_dev))
		goto out_dev;

	options = copy_mount_options(data);
	ret = PTR_ERR(options);
	if (IS_ERR(options))
		goto out_data;

	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);

	kfree(options);
out_data:
	kfree(kernel_dev);
out_dev:
	kfree(kernel_type);
out_type:
	return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

跟do_mount()函数

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * The function resolves the mount point, runs the permission checks,
 * translates the MS_* request flags into per-mountpoint MNT_* flags and
 * then dispatches to the helper for the requested operation.
 *
 * Returns the result of the selected do_*() helper, or an earlier error
 * (the user_path() failure, the security_sb_mount() result, or -EPERM).
 */
long do_mount(const char *dev_name, const char __user *dir_name,
		const char *type_page, unsigned long flags, void *data_page)
{
	struct path path;
	int retval = 0;
	int mnt_flags = 0;

	/* Discard magic */
	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
		flags &= ~MS_MGC_MSK;

	/* Basic sanity checks */
	/* Force NUL termination in the last byte of the data page. */
	if (data_page)
		((char *)data_page)[PAGE_SIZE - 1] = 0;

	/* ... and get the mountpoint */
	retval = user_path(dir_name, &path);
	if (retval)
		return retval;

// security check of the mount request; any non-zero result aborts the mount
	retval = security_sb_mount(dev_name, &path,
				   type_page, flags, data_page);
	if (!retval && !may_mount())
		retval = -EPERM;
	if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
		retval = -EPERM;
	if (retval)
		goto dput_out;

	/* Default to relatime unless overridden */
	if (!(flags & MS_NOATIME))
		mnt_flags |= MNT_RELATIME;

	/* Separate the per-mountpoint flags */
	if (flags & MS_NOSUID)
		mnt_flags |= MNT_NOSUID;
	if (flags & MS_NODEV)
		mnt_flags |= MNT_NODEV;
	if (flags & MS_NOEXEC)
		mnt_flags |= MNT_NOEXEC;
	if (flags & MS_NOATIME)
		mnt_flags |= MNT_NOATIME;
	if (flags & MS_NODIRATIME)
		mnt_flags |= MNT_NODIRATIME;
	if (flags & MS_STRICTATIME)
		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
	if (flags & MS_RDONLY)
		mnt_flags |= MNT_READONLY;

	/* The default atime for remount is preservation */
	if ((flags & MS_REMOUNT) &&
	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
		       MS_STRICTATIME)) == 0)) {
		mnt_flags &= ~MNT_ATIME_MASK;
		mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
	}

	/* Strip the flags already translated to MNT_* or not meant for the helpers. */
	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);

	/*
	 * Dispatch on the requested operation; the first matching flag wins:
	 * remount, bind, propagation-type change, move, otherwise a new mount.
	 */
	if (flags & MS_REMOUNT)
		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
				    data_page);
	else if (flags & MS_BIND)
		retval = do_loopback(&path, dev_name, flags & MS_REC);
	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
		retval = do_change_type(&path, flags);
	else if (flags & MS_MOVE)
		retval = do_move_mount(&path, dev_name);
	else
		retval = do_new_mount(&path, type_page, flags, mnt_flags,
				      dev_name, data_page);
dput_out:
	/* Drop the reference on the mount point taken by user_path(). */
	path_put(&path);
	return retval;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

跟函数，代码就不继续贴了。do_mount()->do_new_mount()->vfs_kern_mount()->mount_fs()->type中执行回调mount()，这是自定义的挂载函数。

与sysfs的比较

再来对比一下同为伪文件系统的sysfs。
proc文件系统主要是用来调试内核，在内核运行时可以知道内核中一些重要的数据结构的值，一般读多写少。
proc文件系统出现的比sys文件系统早，proc文件系统的目录结构比较乱，在proc文件系统下面有很多文件夹，比如一个进程就有一个文件夹，现在内核越来越复杂，支持的设备类型也越来越多，显得很混乱；于是又开发出了sys系统，sys系统可以说是proc的升级，将来用sys系统会是主流。
proc文件系统和sys文件系统都是虚拟文件系统，并且有对应关系，比如"/proc/misc"对应于"/sys/class/misc"下面的设备，都是描述misc类设备的。

总结

如果要实现一个文件系统，主要实现三个流程：

编译生成.ko文件，并通过insmod插入到内核中。如果确认没有异常，可以直接编译进内核代码中。
执行mount命令进行挂载。
读写数据等相关文件操作接口，实现cat、echo、touch等操作。

相关阅读:
Linux命令行如何设置MySQL远程连接
 MDNNSVM
动捕设备VDSuit Full便携式动作捕捉设备，帮你轻松打破次元壁
 TP、FN、FP、TN、准确率、召回率、F1
代码随想录第8章二叉树
 mfc入门基础（六）创建模态对话框与非模态对话框
 k8s--基础--22.13--storageclass--类型--ScaleIO
java毕业设计春之梦理发店管理Mybatis+系统+数据库+调试部署
 132. SAP UI5 Simple Form 控件的使用方法介绍
 云原生之深入解析Jenkins多分支管道
原文地址：https://blog.csdn.net/m0_65931372/article/details/126329911