From: Boaz Harrosh <boazh@xxxxxxxxxx> zuf-root is a pseudo FS that the zus Server communicates through, registers new file-systems, and receives new mount requests. This patch brings up that special FS. The principal communication with zuf-rootfs is done by doing an open(O_TMPFILE) and then invoking some IOCTL_XXX on the file. This establishes a zuf_special_file type of object attached to the "file *" and thereby defines special behavior for that object (the picture will become clearer in future patches). Otherwise zuf-rootfs is not an FS at all, and has no viewable files. The zuf-rootfs (mount -t zuf) is usually by default mounted on /sys/fs/zuf. If an admin wants to run more server applications (note that each server application supports many types of FSs), he/she can mount a second instance of -t zuf and point the new Server to it. (Otherwise a second instance attempting to communicate with a busy zuf will fail.) TODO: How to trigger a first mount on module_load. Currently the admin needs to manually run "mount -t zuf none /sys/fs/zuf". Signed-off-by: Boaz Harrosh <boazh@xxxxxxxxxx> --- fs/zuf/Makefile | 4 + fs/zuf/_extern.h | 38 +++++ fs/zuf/_pr.h | 43 ++++++ fs/zuf/super.c | 53 +++++++ fs/zuf/zuf-core.c | 60 ++++++++ fs/zuf/zuf-root.c | 347 ++++++++++++++++++++++++++++++++++++++++++++++ fs/zuf/zuf.h | 108 +++++++++++++++ fs/zuf/zus_api.h | 36 +++++ 8 files changed, 689 insertions(+) create mode 100644 fs/zuf/_extern.h create mode 100644 fs/zuf/_pr.h create mode 100644 fs/zuf/super.c create mode 100644 fs/zuf/zuf-core.c create mode 100644 fs/zuf/zuf-root.c create mode 100644 fs/zuf/zuf.h diff --git a/fs/zuf/Makefile b/fs/zuf/Makefile index e75ba8a77974..8e62b4c52150 100644 --- a/fs/zuf/Makefile +++ b/fs/zuf/Makefile @@ -10,5 +10,9 @@ obj-$(CONFIG_ZUF) += zuf.o +# ZUF core +zuf-y += zuf-core.o zuf-root.o + # Main FS +zuf-y += super.o zuf-y += module.o diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h new file mode 100644 index 000000000000..3bb9f1d9acf6 --- /dev/null +++ 
b/fs/zuf/_extern.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + * Sagi Manole <sagim@xxxxxxxxxx>" + */ + +#ifndef __ZUF_EXTERN_H__ +#define __ZUF_EXTERN_H__ +/* + * DO NOT INCLUDE this file directly, it is included by zuf.h + * It is here because zuf.h got to big + */ + +/* + * extern functions declarations + */ + +/* super.c */ +int zuf_init_inodecache(void); +void zuf_destroy_inodecache(void); + +struct dentry *zuf_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); + +/* zuf-core.c */ +long zufc_ioctl(struct file *filp, unsigned int cmd, ulong arg); +int zufc_release(struct inode *inode, struct file *file); +int zufc_mmap(struct file *file, struct vm_area_struct *vma); + +/* zuf-root.c */ +int zufr_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs); + +#endif /*ndef __ZUF_EXTERN_H__*/ diff --git a/fs/zuf/_pr.h b/fs/zuf/_pr.h new file mode 100644 index 000000000000..30b8cf912c1f --- /dev/null +++ b/fs/zuf/_pr.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + * Sagi Manole <sagim@xxxxxxxxxx>" + */ + +#ifndef __ZUF_PR_H__ +#define __ZUF_PR_H__ + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + +/* + * Debug code + */ +#define zuf_err(s, args ...) pr_err("[%s:%d] " s, __func__, \ + __LINE__, ## args) +#define zuf_err_cnd(silent, s, args ...) \ + do {if (!silent) \ + pr_err("[%s:%d] " s, __func__, __LINE__, ## args); \ + } while (0) +#define zuf_warn(s, args ...) pr_warn("[%s:%d] " s, __func__, \ + __LINE__, ## args) +#define zuf_warn_cnd(silent, s, args ...) 
\ + do {if (!silent) \ + pr_warn("[%s:%d] " s, __func__, __LINE__, ## args); \ + } while (0) +#define zuf_info(s, args ...) pr_info("~info~ " s, ## args) + +#define zuf_chan_debug(c, s, args...) pr_debug(c " [%s:%d] " s, __func__, \ + __LINE__, ## args) + +/* ~~~ channel prints ~~~ */ +#define zuf_dbg_err(s, args ...) zuf_chan_debug("error", s, ##args) + +#endif /* define __ZUF_PR_H__ */ diff --git a/fs/zuf/super.c b/fs/zuf/super.c new file mode 100644 index 000000000000..f7f7798425a9 --- /dev/null +++ b/fs/zuf/super.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Super block operations. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + * Sagi Manole <sagim@xxxxxxxxxx> + */ + +#include <linux/types.h> +#include <linux/parser.h> +#include <linux/statfs.h> +#include <linux/backing-dev.h> + +#include "zuf.h" + +static struct kmem_cache *zuf_inode_cachep; + +static void _init_once(void *foo) +{ + struct zuf_inode_info *zii = foo; + + inode_init_once(&zii->vfs_inode); +} + +int __init zuf_init_inodecache(void) +{ + zuf_inode_cachep = kmem_cache_create("zuf_inode_cache", + sizeof(struct zuf_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | + SLAB_TYPESAFE_BY_RCU), + _init_once); + if (zuf_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +void zuf_destroy_inodecache(void) +{ + kmem_cache_destroy(zuf_inode_cachep); +} + +struct dentry *zuf_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return ERR_PTR(-ENOTSUPP); +} diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c new file mode 100644 index 000000000000..e12cae584f8a --- /dev/null +++ b/fs/zuf/zuf-core.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BRIEF DESCRIPTION + * + * Ioctl operations. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. 
See module.c for LICENSE details. + * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/delay.h> +#include <linux/pfn_t.h> +#include <linux/sched/signal.h> + +#include "zuf.h" + +long zufc_ioctl(struct file *file, unsigned int cmd, ulong arg) +{ + switch (cmd) { + default: + zuf_err("%d\n", cmd); + return -ENOTTY; + } +} + +int zufc_release(struct inode *inode, struct file *file) +{ + struct zuf_special_file *zsf = file->private_data; + + if (!zsf) + return 0; + + switch (zsf->type) { + default: + return 0; + } +} + +int zufc_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct zuf_special_file *zsf = file->private_data; + + if (unlikely(!zsf)) { + zuf_err("Which mmap is that !!!!\n"); + return -ENOTTY; + } + + switch (zsf->type) { + default: + zuf_err("type=%d\n", zsf->type); + return -ENOTTY; + } +} diff --git a/fs/zuf/zuf-root.c b/fs/zuf/zuf-root.c new file mode 100644 index 000000000000..55a839dbc854 --- /dev/null +++ b/fs/zuf/zuf-root.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ZUF Root filesystem. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. + * + * ZUF core is mounted on a small specialized FS that + * provides the communication with the mount thread, zuf multy-channel + * communication [ZTs], and the pmem devices. + * Subsequently all FS super_blocks are children of this root, and point + * to it. All using the same zuf communication multy-channel. + * + * [ + * TODO: + * Multiple servers can run on Multiple mounted roots. Each registering + * their own FSTYPEs. 
Admin should make sure that the FSTYPEs do not + * overlap + * ] + * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/magic.h> +#include <asm-generic/mman.h> + +#include "zuf.h" + +/* ~~~~ Register/Unregister FS-types ~~~~ */ +#ifdef CONFIG_LOCKDEP + +/* + * NOTE: When CONFIG_LOCKDEP is on the register_filesystem complains when + * the fstype object is from a kmalloc. Because of some lockdep_keys not + * being const_obj something. + * + * So in this case we have maximum of 16 fstypes system wide + * (Total for all mounted zuf_root(s)). This way we can have them + * in const_obj memory below at g_fs_array + */ + +enum { MAX_LOCKDEP_FSs = 16 }; +static uint g_fs_next; +static struct zuf_fs_type g_fs_array[MAX_LOCKDEP_FSs]; + +static struct zuf_fs_type *_fs_type_alloc(void) +{ + struct zuf_fs_type *ret; + + if (MAX_LOCKDEP_FSs <= g_fs_next) + return NULL; + + ret = &g_fs_array[g_fs_next++]; + memset(ret, 0, sizeof(*ret)); + return ret; +} + +static void _fs_type_free(struct zuf_fs_type *zft) +{ + if (zft == &g_fs_array[0]) + g_fs_next = 0; +} + +#else /* !CONFIG_LOCKDEP*/ +static struct zuf_fs_type *_fs_type_alloc(void) +{ + return kcalloc(1, sizeof(struct zuf_fs_type), GFP_KERNEL); +} + +static void _fs_type_free(struct zuf_fs_type *zft) +{ + kfree(zft); +} +#endif /*CONFIG_LOCKDEP*/ + +int zufr_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs) +{ + struct zuf_fs_type *zft = _fs_type_alloc(); + + if (unlikely(!zft)) + return -ENOMEM; + + /* Original vfs file type */ + zft->vfs_fst.owner = THIS_MODULE; + zft->vfs_fst.name = kstrdup(rfs->rfi.fsname, GFP_KERNEL); + zft->vfs_fst.mount = zuf_mount; + zft->vfs_fst.kill_sb = kill_block_super; + + /* ZUS info about this FS */ + zft->rfi = rfs->rfi; + zft->zus_zfi = rfs->zus_zfi; + INIT_LIST_HEAD(&zft->list); + /* Back pointer to our communication channels */ + zft->zri = ZRI(sb); 
+ + zuf_add_fs_type(zft->zri, zft); + zuf_info("register_filesystem [%s]\n", zft->vfs_fst.name); + return register_filesystem(&zft->vfs_fst); +} + +static void _unregister_all_fses(struct zuf_root_info *zri) +{ + struct zuf_fs_type *zft, *n; + + list_for_each_entry_safe_reverse(zft, n, &zri->fst_list, list) { + unregister_filesystem(&zft->vfs_fst); + list_del_init(&zft->list); + _fs_type_free(zft); + } +} + +static int zufr_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + drop_nlink(inode); + return 0; +} + +/* Force alignment of 2M for all vma(s) + * + * This belongs to t1.c and what it does for mmap. But we do not mind + * that both our mmaps (grab_pmem or ZTs) will be 2M aligned so keep + * it here. And zus mappings just all match perfectly with no need for + * holes. + * FIXME: This is copy/paste from dax-device. It can be very much simplified + * for what we need. + */ +static unsigned long zufr_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align; + unsigned long align = PMD_SIZE; + + if (addr) + goto out; + + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static const struct inode_operations zufr_inode_operations; +static const struct file_operations zufr_file_dir_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .llseek = dcache_dir_lseek, + .read = generic_read_dir, + .iterate_shared = 
dcache_readdir, + .fsync = noop_fsync, + .unlocked_ioctl = zufc_ioctl, +}; +static const struct file_operations zufr_file_reg_operations = { + .fsync = noop_fsync, + .unlocked_ioctl = zufc_ioctl, + .get_unmapped_area = zufr_get_unmapped_area, + .mmap = zufc_mmap, + .release = zufc_release, +}; + +static int zufr_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct zuf_root_info *zri = ZRI(dir->i_sb); + struct inode *inode; + int err; + + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + + /* We need to impersonate device-dax (S_DAX + S_IFCHR) in order to get + * the PMD (huge) page faults and allow RDMA memory access via GUP + * (get_user_pages_longterm). + */ + inode->i_flags = S_DAX; + mode = (mode & ~S_IFREG) | S_IFCHR; /* change file type to char */ + + inode->i_ino = ++zri->next_ino; /* none atomic only one mount thread */ + inode->i_blocks = inode->i_size = 0; + inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_ctime; + inode_init_owner(inode, dir, mode); + + inode->i_op = &zufr_inode_operations; + inode->i_fop = &zufr_file_reg_operations; + + err = insert_inode_locked(inode); + if (unlikely(err)) { + zuf_err("[%ld] insert_inode_locked => %d\n", inode->i_ino, err); + goto fail; + } + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + return 0; + +fail: + clear_nlink(inode); + make_bad_inode(inode); + iput(inode); + return err; +} + +static void zufr_put_super(struct super_block *sb) +{ + struct zuf_root_info *zri = ZRI(sb); + + _unregister_all_fses(zri); + + zuf_info("zuf_root umount\n"); +} + +static void zufr_evict_inode(struct inode *inode) +{ + clear_inode(inode); +} + +static const struct inode_operations zufr_inode_operations = { + .lookup = simple_lookup, + + .tmpfile = zufr_tmpfile, + .unlink = zufr_unlink, +}; +static const struct super_operations zufr_super_operations = { + .statfs = simple_statfs, + + .evict_inode = zufr_evict_inode, + .put_super = zufr_put_super, +}; + 
+#define ZUFR_SUPER_MAGIC 0x1717 + +static int zufr_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr zufr_files[] = { + {""}, + }; + struct zuf_root_info *zri; + struct inode *root_i; + int err; + + zri = kzalloc(sizeof(*zri), GFP_KERNEL); + if (!zri) { + zuf_err_cnd(silent, + "Not enough memory to allocate zuf_root_info\n"); + return -ENOMEM; + } + + err = simple_fill_super(sb, ZUFR_SUPER_MAGIC, zufr_files); + if (unlikely(err)) { + kfree(zri); + return err; + } + + sb->s_op = &zufr_super_operations; + sb->s_fs_info = zri; + zri->sb = sb; + + root_i = sb->s_root->d_inode; + root_i->i_fop = &zufr_file_dir_operations; + root_i->i_op = &zufr_inode_operations; + + mutex_init(&zri->sbl_lock); + INIT_LIST_HEAD(&zri->fst_list); + INIT_LIST_HEAD(&zri->pmem_list); + + return 0; +} + +static struct dentry *zufr_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + struct dentry *ret = mount_single(fs_type, flags, data, zufr_fill_super); + + zuf_info("zuf_root mount [%ld]\n", + IS_ERR_OR_NULL(ret) ? PTR_ERR(ret) : ret->d_inode->i_ino); + return ret; +} + +static struct file_system_type zufr_type = { + .owner = THIS_MODULE, + .name = "zuf", + .mount = zufr_mount, + .kill_sb = kill_litter_super, +}; + +/* Create an /sys/fs/zuf/ directory. 
to mount on */ +static struct kset *zufr_kset; + +int __init zuf_root_init(void) +{ + int err = zuf_init_inodecache(); + + if (unlikely(err)) + return err; + + zufr_kset = kset_create_and_add("zuf", NULL, fs_kobj); + if (!zufr_kset) { + err = -ENOMEM; + goto un_inodecache; + } + + err = register_filesystem(&zufr_type); + if (unlikely(err)) + goto un_kset; + + return 0; + +un_kset: + kset_unregister(zufr_kset); +un_inodecache: + zuf_destroy_inodecache(); + return err; +} + +void __exit zuf_root_exit(void) +{ + unregister_filesystem(&zufr_type); + kset_unregister(zufr_kset); + zuf_destroy_inodecache(); +} + +module_init(zuf_root_init) +module_exit(zuf_root_exit) diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h new file mode 100644 index 000000000000..f979d8cbe60c --- /dev/null +++ b/fs/zuf/zuf.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BRIEF DESCRIPTION + * + * Definitions for the ZUF filesystem. + * + * Copyright (c) 2018 NetApp Inc. All rights reserved. + * + * ZUFS-License: GPL-2.0. See module.c for LICENSE details. 
+ * + * Authors: + * Boaz Harrosh <boazh@xxxxxxxxxx> + * Sagi Manole <sagim@xxxxxxxxxx>" + */ + +#ifndef __ZUF_H +#define __ZUF_H + +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/xattr.h> +#include <linux/exportfs.h> +#include <linux/page_ref.h> + +#include "zus_api.h" + +#include "_pr.h" + +enum zlfs_e_special_file { + zlfs_e_zt = 1, + zlfs_e_mout_thread, + zlfs_e_pmem, + zlfs_e_dpp_buff, +}; + +struct zuf_special_file { + enum zlfs_e_special_file type; + struct file *file; +}; + +/* This is the zuf-root.c mini filesystem */ +struct zuf_root_info { + struct __mount_thread_info { + struct zuf_special_file zsf; + struct zufs_ioc_mount *zim; + } mount; + + #define SBL_INC 64 + struct sb_is_list { + uint num; + uint max; + struct super_block **array; + } sbl; + struct mutex sbl_lock; + + ulong next_ino; + + struct zuf_threads_pool *_ztp; + + struct super_block *sb; + struct list_head fst_list; + + uint next_pmem_id; + struct list_head pmem_list; +}; + +static inline struct zuf_root_info *ZRI(struct super_block *sb) +{ + struct zuf_root_info *zri = sb->s_fs_info; + + WARN_ON(zri->sb != sb); + return zri; +} + +struct zuf_fs_type { + struct file_system_type vfs_fst; + struct zus_fs_info *zus_zfi; + struct register_fs_info rfi; + struct zuf_root_info *zri; + + struct list_head list; +}; + +static inline void zuf_add_fs_type(struct zuf_root_info *zri, + struct zuf_fs_type *zft) +{ + /* Unlocked for now only one mount-thread with zus */ + list_add(&zft->list, &zri->fst_list); +} + +/* + * ZUF per-inode data in memory + */ +struct zuf_inode_info { + struct inode vfs_inode; +}; + +static inline struct zuf_inode_info *ZUII(struct inode *inode) +{ + return container_of(inode, struct zuf_inode_info, vfs_inode); +} + +/* Keep this include last thing in file */ +#include "_extern.h" + +#endif /* __ZUF_H */ diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h index f01db11721f4..34e3e1a9a107 100644 --- 
a/fs/zuf/zus_api.h +++ b/fs/zuf/zus_api.h @@ -66,4 +66,40 @@ #endif /* ndef __KERNEL__ */ +struct zufs_ioc_hdr { + __u32 err; /* IN/OUT must be first */ + __u16 in_len; /* How much to be copied *to* zus */ + __u16 out_max; /* Max receive buffer at dispatch caller */ + __u16 out_start;/* Start of output parameters (to caller) */ + __u16 out_len; /* How much to be copied *from* zus to caller */ + /* can be modified by zus */ + __u32 operation;/* One of e_zufs_operation */ + __u32 offset; /* Start of user buffer in ZT mmap */ + __u32 len; /* Len of user buffer in ZT mmap */ +}; + +/* Register FS */ +/* A cookie from user-mode given in register_fs_info */ +struct zus_fs_info; +struct zufs_ioc_register_fs { + struct zufs_ioc_hdr hdr; + struct zus_fs_info *zus_zfi; + struct register_fs_info { + /* IN */ + char fsname[16]; /* Only 4 chars and a NUL please */ + __u32 FS_magic; /* This is the FS's version && magic */ + __u32 FS_ver_major; /* on disk, not the zuf-to-zus version*/ + __u32 FS_ver_minor; /* (See also struct md_dev_table) */ + + __u8 notused[3]; + __u64 dt_offset; + + __u32 s_time_gran; + __u32 def_mode; + __u64 s_maxbytes; + + } rfi; +}; +#define ZU_IOC_REGISTER_FS _IOWR('Z', 10, struct zufs_ioc_register_fs) + #endif /* _LINUX_ZUFS_API_H */ -- 2.20.1