Re: [RFC PATCH v2 08/12] famfs: module operations & fs_context

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Apr 29, 2024 at 12:04:24PM -0500, John Groves wrote:
> Start building up from the famfs module operations. This commit
> includes the following:
> 
> * Register as a file system
> * Parse mount parameters
> * Allocate or find (and initialize) a superblock via famfs_get_tree()
> * Lookup the host dax device, and bail if it's in use (or not dax)
> * Register as the holder of the dax device if it's available
> * Add Kconfig and Makefile misc to build famfs
> * Add FAMFS_SUPER_MAGIC to include/uapi/linux/magic.h
> * Add export of fs/namei.c:may_open_dev(), which famfs needs to call
> * Update MAINTAINERS file for the fs/famfs/ path
> 
> The following exports had to happen to enable famfs:
> 
> * This uses the new fs/super.c:kill_char_super() - the other kill*super
>   helpers were not quite right.
> * This uses the dev_dax_iomap export of dax_dev_get()
> 
> This commit builds but is otherwise too incomplete to run
> 
> Signed-off-by: John Groves <john@xxxxxxxxxx>
> ---
>  MAINTAINERS                |   1 +
>  fs/Kconfig                 |   2 +
>  fs/Makefile                |   1 +
>  fs/famfs/Kconfig           |  10 ++
>  fs/famfs/Makefile          |   5 +
>  fs/famfs/famfs_inode.c     | 345 +++++++++++++++++++++++++++++++++++++
>  fs/famfs/famfs_internal.h  |  36 ++++
>  fs/namei.c                 |   1 +
>  include/uapi/linux/magic.h |   1 +
>  9 files changed, 402 insertions(+)
>  create mode 100644 fs/famfs/Kconfig
>  create mode 100644 fs/famfs/Makefile
>  create mode 100644 fs/famfs/famfs_inode.c
>  create mode 100644 fs/famfs/famfs_internal.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 3f2d847dcf01..365d678e2f40 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8188,6 +8188,7 @@ L:	linux-cxl@xxxxxxxxxxxxxxx
>  L:	linux-fsdevel@xxxxxxxxxxxxxxx
>  S:	Supported
>  F:	Documentation/filesystems/famfs.rst
> +F:	fs/famfs
>  
>  FANOTIFY
>  M:	Jan Kara <jack@xxxxxxx>
> diff --git a/fs/Kconfig b/fs/Kconfig
> index a46b0cbc4d8f..53b4629e92a0 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -140,6 +140,8 @@ source "fs/autofs/Kconfig"
>  source "fs/fuse/Kconfig"
>  source "fs/overlayfs/Kconfig"
>  
> +source "fs/famfs/Kconfig"
> +
>  menu "Caches"
>  
>  source "fs/netfs/Kconfig"
> diff --git a/fs/Makefile b/fs/Makefile
> index 6ecc9b0a53f2..3393f399a9e9 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -129,3 +129,4 @@ obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
>  obj-$(CONFIG_EROFS_FS)		+= erofs/
>  obj-$(CONFIG_VBOXSF_FS)		+= vboxsf/
>  obj-$(CONFIG_ZONEFS_FS)		+= zonefs/
> +obj-$(CONFIG_FAMFS)             += famfs/
> diff --git a/fs/famfs/Kconfig b/fs/famfs/Kconfig
> new file mode 100644
> index 000000000000..edb8980820f7
> --- /dev/null
> +++ b/fs/famfs/Kconfig
> @@ -0,0 +1,10 @@
> +
> +
> +config FAMFS
> +       tristate "famfs: shared memory file system"
> +       depends on DEV_DAX && FS_DAX && DEV_DAX_IOMAP
> +       help
> +	  Support for the famfs file system. Famfs is a dax file system that
> +	  can support scale-out shared access to fabric-attached memory
> +	  (e.g. CXL shared memory). Famfs is not a general purpose file system;
> +	  it is an enabler for data sets in shared memory.
> diff --git a/fs/famfs/Makefile b/fs/famfs/Makefile
> new file mode 100644
> index 000000000000..62230bcd6793
> --- /dev/null
> +++ b/fs/famfs/Makefile
> @@ -0,0 +1,5 @@
> +# SPDX-License-Identifier: GPL-2.0
> +
> +obj-$(CONFIG_FAMFS) += famfs.o
> +
> +famfs-y := famfs_inode.o
> diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c
> new file mode 100644
> index 000000000000..61306240fc0b
> --- /dev/null
> +++ b/fs/famfs/famfs_inode.c
> @@ -0,0 +1,345 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2024 Micron Technology, inc
> + *
> + * This file system, originally based on ramfs the dax support from xfs,
> + * is intended to allow multiple host systems to mount a common file system
> + * view of dax files that map to shared memory.
> + */
> +
> +#include <linux/fs.h>
> +#include <linux/time.h>
> +#include <linux/init.h>
> +#include <linux/string.h>
> +#include <linux/parser.h>
> +#include <linux/magic.h>
> +#include <linux/slab.h>
> +#include <linux/fs_context.h>
> +#include <linux/fs_parser.h>
> +#include <linux/dax.h>
> +#include <linux/hugetlb.h>
> +#include <linux/iomap.h>
> +#include <linux/path.h>
> +#include <linux/namei.h>
> +
> +#include "famfs_internal.h"
> +
> +#define FAMFS_DEFAULT_MODE	0755
> +
> +static struct inode *famfs_get_inode(struct super_block *sb,
> +				     const struct inode *dir,
> +				     umode_t mode, dev_t dev)
> +{
> +	struct inode *inode = new_inode(sb);
> +	struct timespec64 tv;
> +
> +	if (!inode)
> +		return NULL;
> +
> +	inode->i_ino = get_next_ino();
> +	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
> +	inode->i_mapping->a_ops = &ram_aops;
> +	mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> +	mapping_set_unevictable(inode->i_mapping);
> +	tv = inode_set_ctime_current(inode);
> +	inode_set_mtime_to_ts(inode, tv);
> +	inode_set_atime_to_ts(inode, tv);
> +
> +	switch (mode & S_IFMT) {
> +	default:
> +		init_special_inode(inode, mode, dev);
> +		break;
> +	case S_IFREG:
> +		inode->i_op = NULL /* famfs_file_inode_operations */;
> +		inode->i_fop = NULL /* &famfs_file_operations */;
> +		break;
> +	case S_IFDIR:
> +		inode->i_op = NULL /* famfs_dir_inode_operations */;
> +		inode->i_fop = &simple_dir_operations;
> +
> +		/* Directory inodes start off with i_nlink == 2 (for ".") */
> +		inc_nlink(inode);
> +		break;
> +	case S_IFLNK:
> +		inode->i_op = &page_symlink_inode_operations;
> +		inode_nohighmem(inode);
> +		break;
> +	}
> +	return inode;
> +}
> +
> +/*
> + * famfs dax_operations  (for char dax)
> + */
> +static int
> +famfs_dax_notify_failure(struct dax_device *dax_dev, u64 offset,
> +			u64 len, int mf_flags)
> +{
> +	struct super_block *sb = dax_holder(dax_dev);
> +	struct famfs_fs_info *fsi = sb->s_fs_info;
> +
> +	pr_err("%s: rootdev=%s offset=%lld len=%llu flags=%x\n", __func__,
> +	       fsi->rootdev, offset, len, mf_flags);
> +
> +	return 0;
> +}
> +
> +static const struct dax_holder_operations famfs_dax_holder_ops = {
> +	.notify_failure		= famfs_dax_notify_failure,
> +};
> +
> +/*****************************************************************************
> + * fs_context_operations
> + */
> +
> +static int
> +famfs_fill_super(struct super_block *sb, struct fs_context *fc)
> +{
> +	int rc = 0;
> +
> +	sb->s_maxbytes		= MAX_LFS_FILESIZE;
> +	sb->s_blocksize		= PAGE_SIZE;
> +	sb->s_blocksize_bits	= PAGE_SHIFT;
> +	sb->s_magic		= FAMFS_SUPER_MAGIC;
> +	sb->s_op		= NULL /* famfs_super_ops */;
> +	sb->s_time_gran		= 1;
> +
> +	return rc;
> +}
> +
> +static int
> +lookup_daxdev(const char *pathname, dev_t *devno)
> +{
> +	struct inode *inode;
> +	struct path path;
> +	int err;
> +
> +	if (!pathname || !*pathname)
> +		return -EINVAL;
> +
> +	err = kern_path(pathname, LOOKUP_FOLLOW, &path);
> +	if (err)
> +		return err;
> +
> +	inode = d_backing_inode(path.dentry);
> +	if (!S_ISCHR(inode->i_mode)) {
> +		err = -EINVAL;
> +		goto out_path_put;
> +	}
> +
> +	if (!may_open_dev(&path)) { /* had to export this */
> +		err = -EACCES;
> +		goto out_path_put;
> +	}
> +
> +	 /* if it's dax, i_rdev is struct dax_device */
> +	*devno = inode->i_rdev;
> +
> +out_path_put:
> +	path_put(&path);
> +	return err;
> +}
> +
> +static int
> +famfs_get_tree(struct fs_context *fc)
> +{
> +	struct famfs_fs_info *fsi = fc->s_fs_info;
> +	struct dax_device *dax_devp;
> +	struct super_block *sb;
> +	struct inode *inode;
> +	dev_t daxdevno;
> +	int err;
> +
> +	/* TODO: clean up chatty messages */
> +
> +	err = lookup_daxdev(fc->source, &daxdevno);
> +	if (err)
> +		return err;
> +
> +	fsi->daxdevno = daxdevno;
> +
> +	/* This will set sb->s_dev=daxdevno */
> +	sb = sget_dev(fc, daxdevno);

This will open the dax device as a block device. However, nothing in
your ->kill_sb method or kill_char_super() closes it again. So you're
leaking block device references and leaving unitialized memory around as
you've claimed that device but never ended your claim.

> +	if (IS_ERR(sb)) {
> +		pr_err("%s: sget_dev error\n", __func__);
> +		return PTR_ERR(sb);
> +	}
> +
> +	if (sb->s_root) {
> +		pr_info("%s: found a matching suerblock for %s\n",
> +			__func__, fc->source);
> +
> +		/* We don't expect to find a match by dev_t; if we do, it must
> +		 * already be mounted, so we bail
> +		 */
> +		err = -EBUSY;
> +		goto deactivate_out;
> +	} else {
> +		pr_info("%s: initializing new superblock for %s\n",
> +			__func__, fc->source);
> +		err = famfs_fill_super(sb, fc);
> +		if (err)
> +			goto deactivate_out;
> +	}
> +
> +	/* This will fail if it's not a dax device */
> +	dax_devp = dax_dev_get(daxdevno);
> +	if (!dax_devp) {
> +		pr_warn("%s: device %s not found or not dax\n",
> +		       __func__, fc->source);
> +		err = -ENODEV;
> +		goto deactivate_out;
> +	}
> +
> +	err = fs_dax_get(dax_devp, sb, &famfs_dax_holder_ops);
> +	if (err) {
> +		pr_err("%s: fs_dax_get(%lld) failed\n", __func__, (u64)daxdevno);
> +		err = -EBUSY;
> +		goto deactivate_out;
> +	}
> +	fsi->dax_devp = dax_devp;
> +
> +	inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
> +	sb->s_root = d_make_root(inode);
> +	if (!sb->s_root) {
> +		pr_err("%s: d_make_root() failed\n", __func__);
> +		err = -ENOMEM;
> +		fs_put_dax(fsi->dax_devp, sb);
> +		goto deactivate_out;
> +	}
> +
> +	sb->s_flags |= SB_ACTIVE;
> +
> +	WARN_ON(fc->root);
> +	fc->root = dget(sb->s_root);
> +	return err;
> +
> +deactivate_out:
> +	pr_debug("%s: deactivating sb=%llx\n", __func__, (u64)sb);
> +	deactivate_locked_super(sb);
> +	return err;
> +}
> +
> +/*****************************************************************************/
> +
> +enum famfs_param {
> +	Opt_mode,
> +	Opt_dax,
> +};
> +
> +const struct fs_parameter_spec famfs_fs_parameters[] = {
> +	fsparam_u32oct("mode",	  Opt_mode),
> +	fsparam_string("dax",     Opt_dax),
> +	{}
> +};
> +
> +static int famfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
> +{
> +	struct famfs_fs_info *fsi = fc->s_fs_info;
> +	struct fs_parse_result result;
> +	int opt;
> +
> +	opt = fs_parse(fc, famfs_fs_parameters, param, &result);
> +	if (opt == -ENOPARAM) {
> +		opt = vfs_parse_fs_param_source(fc, param);
> +		if (opt != -ENOPARAM)
> +			return opt;

This shouldn't be needed. The VFS will handle all that for you.

> +
> +		return 0;
> +	}
> +	if (opt < 0)
> +		return opt;
> +
> +	switch (opt) {
> +	case Opt_mode:
> +		fsi->mount_opts.mode = result.uint_32 & S_IALLUGO;
> +		break;
> +	case Opt_dax:
> +		if (strcmp(param->string, "always"))
> +			pr_notice("%s: invalid dax mode %s\n",
> +				  __func__, param->string);
> +		break;
> +	}
> +
> +	return 0;
> +}
> +
> +static void famfs_free_fc(struct fs_context *fc)
> +{
> +	struct famfs_fs_info *fsi = fc->s_fs_info;
> +
> +	if (fsi && fsi->rootdev)
> +		kfree(fsi->rootdev);

Dead code since rootdev is unused an unset?

> +
> +	kfree(fsi);
> +}
> +
> +static const struct fs_context_operations famfs_context_ops = {
> +	.free		= famfs_free_fc,
> +	.parse_param	= famfs_parse_param,
> +	.get_tree	= famfs_get_tree,
> +};
> +
> +static int famfs_init_fs_context(struct fs_context *fc)
> +{
> +	struct famfs_fs_info *fsi;
> +
> +	fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
> +	if (!fsi)
> +		return -ENOMEM;
> +
> +	fsi->mount_opts.mode = FAMFS_DEFAULT_MODE;
> +	fc->s_fs_info        = fsi;
> +	fc->ops              = &famfs_context_ops;
> +	return 0;
> +}
> +
> +static void famfs_kill_sb(struct super_block *sb)
> +{
> +	struct famfs_fs_info *fsi = sb->s_fs_info;
> +
> +	if (fsi->dax_devp)
> +		fs_put_dax(fsi->dax_devp, sb);
> +	if (fsi && fsi->rootdev)
> +		kfree(fsi->rootdev);
> +	kfree(fsi);
> +	sb->s_fs_info = NULL;
> +
> +	kill_char_super(sb); /* new */
> +}

Can likely just be

static void famfs_kill_sb(struct super_block *sb)
{
	struct famfs_fs_info *fsi = sb->s_fs_info;

	generic_shutdown_super(sb);

        if (sb->s_bdev_file)
		bdev_fput(sb->s_bdev_file);

	if (fsi->dax_devp)
		fs_put_dax(fsi->dax_devp, sb);

	kfree(fsi);
}

and then you don't need any custom helpers at all.

> +
> +#define MODULE_NAME "famfs"
> +static struct file_system_type famfs_fs_type = {
> +	.name		  = MODULE_NAME,
> +	.init_fs_context  = famfs_init_fs_context,
> +	.parameters	  = famfs_fs_parameters,
> +	.kill_sb	  = famfs_kill_sb,
> +	.fs_flags	  = FS_USERNS_MOUNT,

Sorry, no. This should not be mountable by unprivileged users and
containers if it's using a real device and especially not since it's not
even a mature filesystem.

> +};
> +
> +/******************************************************************************
> + * Module stuff
> + */
> +static int __init init_famfs_fs(void)
> +{
> +	int rc;
> +
> +	rc = register_filesystem(&famfs_fs_type);
> +
> +	return rc;
> +}
> +
> +static void
> +__exit famfs_exit(void)
> +{
> +	unregister_filesystem(&famfs_fs_type);
> +	pr_info("%s: unregistered\n", __func__);
> +}
> +
> +fs_initcall(init_famfs_fs);
> +module_exit(famfs_exit);
> +
> +MODULE_AUTHOR("John Groves, Micron Technology");
> +MODULE_LICENSE("GPL");
> diff --git a/fs/famfs/famfs_internal.h b/fs/famfs/famfs_internal.h
> new file mode 100644
> index 000000000000..951b32ec4fbd
> --- /dev/null
> +++ b/fs/famfs/famfs_internal.h
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * famfs - dax file system for shared fabric-attached memory
> + *
> + * Copyright 2023-2024 Micron Technology, Inc.
> + *
> + * This file system, originally based on ramfs the dax support from xfs,
> + * is intended to allow multiple host systems to mount a common file system
> + * view of dax files that map to shared memory.
> + */
> +#ifndef FAMFS_INTERNAL_H
> +#define FAMFS_INTERNAL_H
> +
> +struct famfs_mount_opts {
> +	umode_t mode;
> +};
> +
> +/**
> + * @famfs_fs_info
> + *
> + * @mount_opts: the mount options
> + * @dax_devp:   The underlying character devdax device
> + * @rootdev:    Dax device path used in mount
> + * @daxdevno:   Dax device dev_t
> + * @deverror:   True if the dax device has called our notify_failure entry
> + *              point, or if other "shutdown" conditions exist
> + */
> +struct famfs_fs_info {
> +	struct famfs_mount_opts  mount_opts;
> +	struct dax_device       *dax_devp;
> +	char                    *rootdev;
> +	dev_t                    daxdevno;
> +	bool                     deverror;
> +};
> +
> +#endif /* FAMFS_INTERNAL_H */
> diff --git a/fs/namei.c b/fs/namei.c
> index c5b2a25be7d0..f24b268473cd 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3229,6 +3229,7 @@ bool may_open_dev(const struct path *path)
>  	return !(path->mnt->mnt_flags & MNT_NODEV) &&
>  		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
>  }
> +EXPORT_SYMBOL(may_open_dev);
>  
>  static int may_open(struct mnt_idmap *idmap, const struct path *path,
>  		    int acc_mode, int flag)
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 1b40a968ba91..e9bdd6a415e2 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -37,6 +37,7 @@
>  #define HOSTFS_SUPER_MAGIC	0x00c0ffee
>  #define OVERLAYFS_SUPER_MAGIC	0x794c7630
>  #define FUSE_SUPER_MAGIC	0x65735546
> +#define FAMFS_SUPER_MAGIC	0x87b282ff
>  
>  #define MINIX_SUPER_MAGIC	0x137F		/* minix v1 fs, 14 char names */
>  #define MINIX_SUPER_MAGIC2	0x138F		/* minix v1 fs, 30 char names */
> -- 
> 2.43.0
> 




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux