On Mon, Apr 29, 2024 at 12:04:24PM -0500, John Groves wrote: > Start building up from the famfs module operations. This commit > includes the following: > > * Register as a file system > * Parse mount parameters > * Allocate or find (and initialize) a superblock via famfs_get_tree() > * Lookup the host dax device, and bail if it's in use (or not dax) > * Register as the holder of the dax device if it's available > * Add Kconfig and Makefile misc to build famfs > * Add FAMFS_SUPER_MAGIC to include/uapi/linux/magic.h > * Add export of fs/namei.c:may_open_dev(), which famfs needs to call > * Update MAINTAINERS file for the fs/famfs/ path > > The following exports had to happen to enable famfs: > > * This uses the new fs/super.c:kill_char_super() - the other kill*super > helpers were not quite right. > * This uses the dev_dax_iomap export of dax_dev_get() > > This commit builds but is otherwise too incomplete to run > > Signed-off-by: John Groves <john@xxxxxxxxxx> > --- > MAINTAINERS | 1 + > fs/Kconfig | 2 + > fs/Makefile | 1 + > fs/famfs/Kconfig | 10 ++ > fs/famfs/Makefile | 5 + > fs/famfs/famfs_inode.c | 345 +++++++++++++++++++++++++++++++++++++ > fs/famfs/famfs_internal.h | 36 ++++ > fs/namei.c | 1 + > include/uapi/linux/magic.h | 1 + > 9 files changed, 402 insertions(+) > create mode 100644 fs/famfs/Kconfig > create mode 100644 fs/famfs/Makefile > create mode 100644 fs/famfs/famfs_inode.c > create mode 100644 fs/famfs/famfs_internal.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index 3f2d847dcf01..365d678e2f40 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -8188,6 +8188,7 @@ L: linux-cxl@xxxxxxxxxxxxxxx > L: linux-fsdevel@xxxxxxxxxxxxxxx > S: Supported > F: Documentation/filesystems/famfs.rst > +F: fs/famfs > > FANOTIFY > M: Jan Kara <jack@xxxxxxx> > diff --git a/fs/Kconfig b/fs/Kconfig > index a46b0cbc4d8f..53b4629e92a0 100644 > --- a/fs/Kconfig > +++ b/fs/Kconfig > @@ -140,6 +140,8 @@ source "fs/autofs/Kconfig" > source "fs/fuse/Kconfig" > source "fs/overlayfs/Kconfig" > > +source "fs/famfs/Kconfig" > + > menu "Caches" > > source "fs/netfs/Kconfig" > diff --git a/fs/Makefile b/fs/Makefile > index 6ecc9b0a53f2..3393f399a9e9 100644 > --- a/fs/Makefile > +++ b/fs/Makefile > @@ -129,3 +129,4 @@ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ > obj-$(CONFIG_EROFS_FS) += erofs/ > obj-$(CONFIG_VBOXSF_FS) += vboxsf/ > obj-$(CONFIG_ZONEFS_FS) += zonefs/ > +obj-$(CONFIG_FAMFS) += famfs/ > diff --git a/fs/famfs/Kconfig b/fs/famfs/Kconfig > new file mode 100644 > index 000000000000..edb8980820f7 > --- /dev/null > +++ b/fs/famfs/Kconfig > @@ -0,0 +1,10 @@ > + > + > +config FAMFS > + tristate "famfs: shared memory file system" > + depends on DEV_DAX && FS_DAX && DEV_DAX_IOMAP > + help > + Support for the famfs file system. Famfs is a dax file system that > + can support scale-out shared access to fabric-attached memory > + (e.g. CXL shared memory). Famfs is not a general purpose file system; > + it is an enabler for data sets in shared memory. > diff --git a/fs/famfs/Makefile b/fs/famfs/Makefile > new file mode 100644 > index 000000000000..62230bcd6793 > --- /dev/null > +++ b/fs/famfs/Makefile > @@ -0,0 +1,5 @@ > +# SPDX-License-Identifier: GPL-2.0 > + > +obj-$(CONFIG_FAMFS) += famfs.o > + > +famfs-y := famfs_inode.o > diff --git a/fs/famfs/famfs_inode.c b/fs/famfs/famfs_inode.c > new file mode 100644 > index 000000000000..61306240fc0b > --- /dev/null > +++ b/fs/famfs/famfs_inode.c > @@ -0,0 +1,345 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * famfs - dax file system for shared fabric-attached memory > + * > + * Copyright 2023-2024 Micron Technology, inc > + * > + * This file system, originally based on ramfs the dax support from xfs, > + * is intended to allow multiple host systems to mount a common file system > + * view of dax files that map to shared memory. > + */ > + > +#include <linux/fs.h> > +#include <linux/time.h> > +#include <linux/init.h> > +#include <linux/string.h> > +#include <linux/parser.h> > +#include <linux/magic.h> > +#include <linux/slab.h> > +#include <linux/fs_context.h> > +#include <linux/fs_parser.h> > +#include <linux/dax.h> > +#include <linux/hugetlb.h> > +#include <linux/iomap.h> > +#include <linux/path.h> > +#include <linux/namei.h> > + > +#include "famfs_internal.h" > + > +#define FAMFS_DEFAULT_MODE 0755 > + > +static struct inode *famfs_get_inode(struct super_block *sb, > + const struct inode *dir, > + umode_t mode, dev_t dev) > +{ > + struct inode *inode = new_inode(sb); > + struct timespec64 tv; > + > + if (!inode) > + return NULL; > + > + inode->i_ino = get_next_ino(); > + inode_init_owner(&nop_mnt_idmap, inode, dir, mode); > + inode->i_mapping->a_ops = &ram_aops; > + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); > + mapping_set_unevictable(inode->i_mapping); > + tv = inode_set_ctime_current(inode); > + inode_set_mtime_to_ts(inode, tv); > + inode_set_atime_to_ts(inode, tv); > + > + switch (mode & S_IFMT) { > + default: > + init_special_inode(inode, mode, dev); > + break; > + case S_IFREG: > + inode->i_op = NULL /* famfs_file_inode_operations */; > + inode->i_fop = NULL /* &famfs_file_operations */; > + break; > + case S_IFDIR: > + inode->i_op = NULL /* famfs_dir_inode_operations */; > + inode->i_fop = &simple_dir_operations; > + > + /* Directory inodes start off with i_nlink == 2 (for ".") */ > + inc_nlink(inode); > + break; > + case S_IFLNK: > + inode->i_op = &page_symlink_inode_operations; > + inode_nohighmem(inode); > + break; > + } > + return inode; > +} > + > +/* > + * famfs dax_operations (for char dax) > + */ > +static int > +famfs_dax_notify_failure(struct dax_device *dax_dev, u64 offset, > + u64 len, int mf_flags) > +{ > + struct super_block *sb = dax_holder(dax_dev); > + struct famfs_fs_info *fsi = sb->s_fs_info; > + > + pr_err("%s: rootdev=%s offset=%lld len=%llu flags=%x\n", __func__, > + fsi->rootdev, offset, len, mf_flags); > + > + return 0; > +} > + > +static const struct dax_holder_operations famfs_dax_holder_ops = { > + .notify_failure = famfs_dax_notify_failure, > +}; > + > +/***************************************************************************** > + * fs_context_operations > + */ > + > +static int > +famfs_fill_super(struct super_block *sb, struct fs_context *fc) > +{ > + int rc = 0; > + > + sb->s_maxbytes = MAX_LFS_FILESIZE; > + sb->s_blocksize = PAGE_SIZE; > + sb->s_blocksize_bits = PAGE_SHIFT; > + sb->s_magic = FAMFS_SUPER_MAGIC; > + sb->s_op = NULL /* famfs_super_ops */; > + sb->s_time_gran = 1; > + > + return rc; > +} > + > +static int > +lookup_daxdev(const char *pathname, dev_t *devno) > +{ > + struct inode *inode; > + struct path path; > + int err; > + > + if (!pathname || !*pathname) > + return -EINVAL; > + > + err = kern_path(pathname, LOOKUP_FOLLOW, &path); > + if (err) > + return err; > + > + inode = d_backing_inode(path.dentry); > + if (!S_ISCHR(inode->i_mode)) { > + err = -EINVAL; > + goto out_path_put; > + } > + > + if (!may_open_dev(&path)) { /* had to export this */ > + err = -EACCES; > + goto out_path_put; > + } > + > + /* if it's dax, i_rdev is struct dax_device */ > + *devno = inode->i_rdev; > + > +out_path_put: > + path_put(&path); > + return err; > +} > + > +static int > +famfs_get_tree(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi = fc->s_fs_info; > + struct dax_device *dax_devp; > + struct super_block *sb; > + struct inode *inode; > + dev_t daxdevno; > + int err; > + > + /* TODO: clean up chatty messages */ > + > + err = lookup_daxdev(fc->source, &daxdevno); > + if (err) > + return err; > + > + fsi->daxdevno = daxdevno; > + > + /* This will set sb->s_dev=daxdevno */ > + sb = sget_dev(fc, daxdevno); This will open the dax device as a block device. However, nothing in your ->kill_sb method or kill_char_super() closes it again. So you're leaking block device references and leaving unitialized memory around as you've claimed that device but never ended your claim. > + if (IS_ERR(sb)) { > + pr_err("%s: sget_dev error\n", __func__); > + return PTR_ERR(sb); > + } > + > + if (sb->s_root) { > + pr_info("%s: found a matching suerblock for %s\n", > + __func__, fc->source); > + > + /* We don't expect to find a match by dev_t; if we do, it must > + * already be mounted, so we bail > + */ > + err = -EBUSY; > + goto deactivate_out; > + } else { > + pr_info("%s: initializing new superblock for %s\n", > + __func__, fc->source); > + err = famfs_fill_super(sb, fc); > + if (err) > + goto deactivate_out; > + } > + > + /* This will fail if it's not a dax device */ > + dax_devp = dax_dev_get(daxdevno); > + if (!dax_devp) { > + pr_warn("%s: device %s not found or not dax\n", > + __func__, fc->source); > + err = -ENODEV; > + goto deactivate_out; > + } > + > + err = fs_dax_get(dax_devp, sb, &famfs_dax_holder_ops); > + if (err) { > + pr_err("%s: fs_dax_get(%lld) failed\n", __func__, (u64)daxdevno); > + err = -EBUSY; > + goto deactivate_out; > + } > + fsi->dax_devp = dax_devp; > + > + inode = famfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); > + sb->s_root = d_make_root(inode); > + if (!sb->s_root) { > + pr_err("%s: d_make_root() failed\n", __func__); > + err = -ENOMEM; > + fs_put_dax(fsi->dax_devp, sb); > + goto deactivate_out; > + } > + > + sb->s_flags |= SB_ACTIVE; > + > + WARN_ON(fc->root); > + fc->root = dget(sb->s_root); > + return err; > + > +deactivate_out: > + pr_debug("%s: deactivating sb=%llx\n", __func__, (u64)sb); > + deactivate_locked_super(sb); > + return err; > +} > + > +/*****************************************************************************/ > + > +enum famfs_param { > + Opt_mode, > + Opt_dax, > +}; > + > +const struct fs_parameter_spec famfs_fs_parameters[] = { > + fsparam_u32oct("mode", Opt_mode), > + fsparam_string("dax", Opt_dax), > + {} > +}; > + > +static int famfs_parse_param(struct fs_context *fc, struct fs_parameter *param) > +{ > + struct famfs_fs_info *fsi = fc->s_fs_info; > + struct fs_parse_result result; > + int opt; > + > + opt = fs_parse(fc, famfs_fs_parameters, param, &result); > + if (opt == -ENOPARAM) { > + opt = vfs_parse_fs_param_source(fc, param); > + if (opt != -ENOPARAM) > + return opt; This shouldn't be needed. The VFS will handle all that for you. > + > + return 0; > + } > + if (opt < 0) > + return opt; > + > + switch (opt) { > + case Opt_mode: > + fsi->mount_opts.mode = result.uint_32 & S_IALLUGO; > + break; > + case Opt_dax: > + if (strcmp(param->string, "always")) > + pr_notice("%s: invalid dax mode %s\n", > + __func__, param->string); > + break; > + } > + > + return 0; > +} > + > +static void famfs_free_fc(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi = fc->s_fs_info; > + > + if (fsi && fsi->rootdev) > + kfree(fsi->rootdev); Dead code since rootdev is unused an unset? > + > + kfree(fsi); > +} > + > +static const struct fs_context_operations famfs_context_ops = { > + .free = famfs_free_fc, > + .parse_param = famfs_parse_param, > + .get_tree = famfs_get_tree, > +}; > + > +static int famfs_init_fs_context(struct fs_context *fc) > +{ > + struct famfs_fs_info *fsi; > + > + fsi = kzalloc(sizeof(*fsi), GFP_KERNEL); > + if (!fsi) > + return -ENOMEM; > + > + fsi->mount_opts.mode = FAMFS_DEFAULT_MODE; > + fc->s_fs_info = fsi; > + fc->ops = &famfs_context_ops; > + return 0; > +} > + > +static void famfs_kill_sb(struct super_block *sb) > +{ > + struct famfs_fs_info *fsi = sb->s_fs_info; > + > + if (fsi->dax_devp) > + fs_put_dax(fsi->dax_devp, sb); > + if (fsi && fsi->rootdev) > + kfree(fsi->rootdev); > + kfree(fsi); > + sb->s_fs_info = NULL; > + > + kill_char_super(sb); /* new */ > +} Can likely just be static void famfs_kill_sb(struct super_block *sb) { struct famfs_fs_info *fsi = sb->s_fs_info; generic_shutdown_super(sb); if (sb->s_bdev_file) bdev_fput(sb->s_bdev_file); if (fsi->dax_devp) fs_put_dax(fsi->dax_devp, sb); kfree(fsi); } and then you don't need any custom helpers at all. > + > +#define MODULE_NAME "famfs" > +static struct file_system_type famfs_fs_type = { > + .name = MODULE_NAME, > + .init_fs_context = famfs_init_fs_context, > + .parameters = famfs_fs_parameters, > + .kill_sb = famfs_kill_sb, > + .fs_flags = FS_USERNS_MOUNT, Sorry, no. This should not be mountable by unprivileged users and containers if it's using a real device and especially not since it's not even a mature filesystem. > +}; > + > +/****************************************************************************** > + * Module stuff > + */ > +static int __init init_famfs_fs(void) > +{ > + int rc; > + > + rc = register_filesystem(&famfs_fs_type); > + > + return rc; > +} > + > +static void > +__exit famfs_exit(void) > +{ > + unregister_filesystem(&famfs_fs_type); > + pr_info("%s: unregistered\n", __func__); > +} > + > +fs_initcall(init_famfs_fs); > +module_exit(famfs_exit); > + > +MODULE_AUTHOR("John Groves, Micron Technology"); > +MODULE_LICENSE("GPL"); > diff --git a/fs/famfs/famfs_internal.h b/fs/famfs/famfs_internal.h > new file mode 100644 > index 000000000000..951b32ec4fbd > --- /dev/null > +++ b/fs/famfs/famfs_internal.h > @@ -0,0 +1,36 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* > + * famfs - dax file system for shared fabric-attached memory > + * > + * Copyright 2023-2024 Micron Technology, Inc. > + * > + * This file system, originally based on ramfs the dax support from xfs, > + * is intended to allow multiple host systems to mount a common file system > + * view of dax files that map to shared memory. > + */ > +#ifndef FAMFS_INTERNAL_H > +#define FAMFS_INTERNAL_H > + > +struct famfs_mount_opts { > + umode_t mode; > +}; > + > +/** > + * @famfs_fs_info > + * > + * @mount_opts: the mount options > + * @dax_devp: The underlying character devdax device > + * @rootdev: Dax device path used in mount > + * @daxdevno: Dax device dev_t > + * @deverror: True if the dax device has called our notify_failure entry > + * point, or if other "shutdown" conditions exist > + */ > +struct famfs_fs_info { > + struct famfs_mount_opts mount_opts; > + struct dax_device *dax_devp; > + char *rootdev; > + dev_t daxdevno; > + bool deverror; > +}; > + > +#endif /* FAMFS_INTERNAL_H */ > diff --git a/fs/namei.c b/fs/namei.c > index c5b2a25be7d0..f24b268473cd 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -3229,6 +3229,7 @@ bool may_open_dev(const struct path *path) > return !(path->mnt->mnt_flags & MNT_NODEV) && > !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); > } > +EXPORT_SYMBOL(may_open_dev); > > static int may_open(struct mnt_idmap *idmap, const struct path *path, > int acc_mode, int flag) > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h > index 1b40a968ba91..e9bdd6a415e2 100644 > --- a/include/uapi/linux/magic.h > +++ b/include/uapi/linux/magic.h > @@ -37,6 +37,7 @@ > #define HOSTFS_SUPER_MAGIC 0x00c0ffee > #define OVERLAYFS_SUPER_MAGIC 0x794c7630 > #define FUSE_SUPER_MAGIC 0x65735546 > +#define FAMFS_SUPER_MAGIC 0x87b282ff > > #define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */ > #define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */ > -- > 2.43.0 >