Re: [RFC PATCH 04/17] zuf: zuf-core The ZTs

On Tue, 2019-02-19 at 13:51 +0200, Boaz Harrosh wrote:
> 
> From: Boaz Harrosh <boazh@xxxxxxxxxx>
> 
> zuf-core establishes the communication channels with the ZUS
> User Mode Server.
> 
> This patch contains the core communication mechanics,
> which are the novelty of this project.
> (See the previously submitted documentation for more info.)
> 
> Users will come later in the patchset.
> 
> Signed-off-by: Boaz Harrosh <boazh@xxxxxxxxxx>
> ---
>  fs/zuf/_extern.h  |   22 +
>  fs/zuf/_pr.h      |    4 +
>  fs/zuf/relay.h    |   88 ++++
>  fs/zuf/zuf-core.c | 1016 ++++++++++++++++++++++++++++++++++++++++++++-
>  fs/zuf/zuf-root.c |    7 +
>  fs/zuf/zuf.h      |   46 ++
>  fs/zuf/zus_api.h  |  185 +++++++++
>  7 files changed, 1367 insertions(+), 1 deletion(-)
>  create mode 100644 fs/zuf/relay.h
> 
> diff --git a/fs/zuf/_extern.h b/fs/zuf/_extern.h
> index 3bb9f1d9acf6..52bb6b9deafe 100644
> --- a/fs/zuf/_extern.h
> +++ b/fs/zuf/_extern.h
> @@ -28,10 +28,32 @@ struct dentry *zuf_mount(struct file_system_type *fs_type, int flags,
>                          const char *dev_name, void *data);
> 
>  /* zuf-core.c */
> +int zufc_zts_init(struct zuf_root_info *zri); /* Some private types in core */
> +void zufc_zts_fini(struct zuf_root_info *zri);
> +
>  long zufc_ioctl(struct file *filp, unsigned int cmd, ulong arg);
>  int zufc_release(struct inode *inode, struct file *file);
>  int zufc_mmap(struct file *file, struct vm_area_struct *vma);
> 
> +int __zufc_dispatch_mount(struct zuf_root_info *zri,
> +                         enum e_mount_operation op,
> +                         struct zufs_ioc_mount *zim);
> +int zufc_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info *zus_zfi,
> +                       enum e_mount_operation operation,
> +                       struct zufs_ioc_mount *zim);
> +
> +const char *zuf_op_name(enum e_zufs_operation op);
> +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo);
> +static inline
> +int zufc_dispatch(struct zuf_root_info *zri, struct zufs_ioc_hdr *hdr,
> +                 struct page **pages, uint nump)
> +{
> +       struct zuf_dispatch_op zdo;
> +
> +       zuf_dispatch_init(&zdo, hdr, pages, nump);
> +       return __zufc_dispatch(zri, &zdo);
> +}
> +
>  /* zuf-root.c */
>  int zufr_register_fs(struct super_block *sb, struct zufs_ioc_register_fs *rfs);
> 
> diff --git a/fs/zuf/_pr.h b/fs/zuf/_pr.h
> index 30b8cf912c1f..dc9f85453890 100644
> --- a/fs/zuf/_pr.h
> +++ b/fs/zuf/_pr.h
> @@ -39,5 +39,9 @@
> 
>  /* ~~~ channel prints ~~~ */
>  #define zuf_dbg_err(s, args ...)       zuf_chan_debug("error", s, ##args)
> +#define zuf_dbg_vfs(s, args ...)       zuf_chan_debug("vfs  ", s, ##args)
> +#define zuf_dbg_core(s, args ...)      zuf_chan_debug("core ", s, ##args)
> +#define zuf_dbg_zus(s, args ...)       zuf_chan_debug("zusdg", s, ##args)
> +#define zuf_dbg_verbose(s, args ...)   zuf_chan_debug("d-oto", s, ##args)
> 
>  #endif /* define __ZUF_PR_H__ */
> diff --git a/fs/zuf/relay.h b/fs/zuf/relay.h
> new file mode 100644
> index 000000000000..a17d242b313a
> --- /dev/null
> +++ b/fs/zuf/relay.h
> @@ -0,0 +1,88 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Relay scheduler-object Header file.
> + *
> + * Copyright (c) 2018 NetApp Inc. All rights reserved.
> + *
> + * ZUFS-License: GPL-2.0. See module.c for LICENSE details.
> + *
> + * Authors:
> + *     Boaz Harrosh <boazh@xxxxxxxxxx>
> + */
> +
> +#ifndef __RELAY_H__
> +#define __RELAY_H__
> +
> +/* ~~~~ Relay ~~~~ */
> +struct relay {
> +       wait_queue_head_t fss_wq;
> +       bool fss_wakeup;
> +       bool fss_waiting;
> +
> +       wait_queue_head_t app_wq;
> +       bool app_wakeup;
> +       bool app_waiting;
> +
> +       cpumask_t cpus_allowed;
> +};
> +
> +static inline void relay_init(struct relay *relay)
> +{
> +       init_waitqueue_head(&relay->fss_wq);
> +       init_waitqueue_head(&relay->app_wq);
> +}
> +
> +static inline bool relay_is_app_waiting(struct relay *relay)
> +{
> +       return relay->app_waiting;
> +}
> +
> +static inline void relay_app_wakeup(struct relay *relay)
> +{
> +       relay->app_waiting = false;
> +
> +       relay->app_wakeup = true;
> +       wake_up(&relay->app_wq);
> +}
> +
> +static inline int relay_fss_wait(struct relay *relay)
> +{
> +       int err;
> +
> +       relay->fss_waiting = true;
> +       relay->fss_wakeup = false;
> +       err =  wait_event_interruptible(relay->fss_wq, relay->fss_wakeup);
> +
> +       return err;

Could you just do: "return wait_event_interruptible()" directly, instead of
using the err variable?
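
For example (an untested sketch, same logic with the local dropped):

static inline int relay_fss_wait(struct relay *relay)
{
	relay->fss_waiting = true;
	relay->fss_wakeup = false;

	/* returns 0 on wakeup, -ERESTARTSYS if a signal interrupted the wait */
	return wait_event_interruptible(relay->fss_wq, relay->fss_wakeup);
}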

> +}
> +
> +static inline bool relay_is_fss_waiting_grab(struct relay *relay)
> +{
> +       if (relay->fss_waiting) {
> +               relay->fss_waiting = false;
> +               return true;
> +       }
> +       return false;
> +}
> +
> +static inline void relay_fss_wakeup(struct relay *relay)
> +{
> +       relay->fss_wakeup = true;
> +       wake_up(&relay->fss_wq);
> +}
> +
> +static inline void relay_fss_wakeup_app_wait(struct relay *relay,
> +                                            spinlock_t *spinlock)
> +{
> +       relay->app_waiting = true;
> +
> +       relay_fss_wakeup(relay);
> +
> +       relay->app_wakeup = false;
> +       if (spinlock)
> +               spin_unlock(spinlock);
> +
> +       wait_event(relay->app_wq, relay->app_wakeup);
> +}
> +
> +#endif /* ifndef __RELAY_H__ */
> diff --git a/fs/zuf/zuf-core.c b/fs/zuf/zuf-core.c
> index e12cae584f8a..95582c0a4ba5 100644
> --- a/fs/zuf/zuf-core.c
> +++ b/fs/zuf/zuf-core.c
> @@ -18,14 +18,820 @@
>  #include <linux/delay.h>
>  #include <linux/pfn_t.h>
>  #include <linux/sched/signal.h>
> +#include <linux/uaccess.h>
> 
>  #include "zuf.h"
> 
> +struct zufc_thread {
> +       struct zuf_special_file hdr;
> +       struct relay relay;
> +       struct vm_area_struct *vma;
> +       int no;
> +       int chan;
> +
> +       /* Kernel side allocated IOCTL buffer */
> +       struct vm_area_struct *opt_buff_vma;
> +       void *opt_buff;
> +       ulong max_zt_command;
> +
> +       /* Next operation*/
> +       struct zuf_dispatch_op *zdo;
> +};
> +
> +enum { INITIAL_ZT_CHANNELS = 3 };
> +
> +struct zuf_threads_pool {
> +       uint _max_zts;
> +       uint _max_channels;
> +        /* array of pcp_arrays */
> +       struct zufc_thread *_all_zt[ZUFS_MAX_ZT_CHANNELS];
> +};
> +
> +static int _alloc_zts_channel(struct zuf_root_info *zri, int channel)
> +{
> +       zri->_ztp->_all_zt[channel] = alloc_percpu(struct zufc_thread);
> +       if (unlikely(!zri->_ztp->_all_zt[channel])) {
> +               zuf_err("!!! alloc_percpu channel=%d failed\n", channel);
> +               return -ENOMEM;
> +       }
> +       return 0;
> +}
> +
> +static inline ulong _zt_pr_no(struct zufc_thread *zt)
> +{
> +       /* So in hex it will be channel as first nibble and cpu as 3rd and on */
> +       return ((ulong)zt->no << 8) | zt->chan;
> +}
> +
> +int zufc_zts_init(struct zuf_root_info *zri)
> +{
> +       int c;
> +
> +       zri->_ztp = kcalloc(1, sizeof(struct zuf_threads_pool), GFP_KERNEL);
> +       if (unlikely(!zri->_ztp))
> +               return -ENOMEM;
> +
> +       zri->_ztp->_max_zts = num_online_cpus();
> +       zri->_ztp->_max_channels = INITIAL_ZT_CHANNELS;
> +
> +       for (c = 0; c < INITIAL_ZT_CHANNELS; ++c) {
> +               int err = _alloc_zts_channel(zri, c);
> +
> +               if (unlikely(err))
> +                       return err;
> +       }
> +
> +       return 0;
> +}
> +
> +void zufc_zts_fini(struct zuf_root_info *zri)
> +{
> +       int c;
> +
> +       /* Always safe/must call zufc_zts_fini */
> +       if (!zri->_ztp)
> +               return;
> +
> +       for (c = 0; c < zri->_ztp->_max_channels; ++c) {
> +               if (zri->_ztp->_all_zt[c])
> +                       free_percpu(zri->_ztp->_all_zt[c]);
> +       }
> +       kfree(zri->_ztp);
> +       zri->_ztp = NULL;
> +}
> +
> +static struct zufc_thread *_zt_from_cpu(struct zuf_root_info *zri,
> +                                       int cpu, uint chan)
> +{
> +       return per_cpu_ptr(zri->_ztp->_all_zt[chan], cpu);
> +}
> +
> +static int _zt_from_f(struct file *filp, int cpu, uint chan,
> +                     struct zufc_thread **ztp)
> +{
> +       *ztp = _zt_from_cpu(ZRI(filp->f_inode->i_sb), cpu, chan);
> +       if (unlikely(!*ztp))
> +               return -ERANGE;
> +       return 0;

I'm curious if there is a reason you did it this way instead of making use of
the ERR_PTR() macro to return ztp directly?
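
Something along these lines, perhaps (just a sketch, not compiled), with the
callers switched over to IS_ERR()/PTR_ERR():

static struct zufc_thread *_zt_from_f(struct file *filp, int cpu, uint chan)
{
	struct zufc_thread *zt;

	zt = _zt_from_cpu(ZRI(filp->f_inode->i_sb), cpu, chan);
	if (unlikely(!zt))
		return ERR_PTR(-ERANGE);
	return zt;
}

and then in _zu_init() for example:

	zt = _zt_from_f(file, cpu, zi_init.channel_no);
	if (IS_ERR(zt)) {
		zi_init.hdr.err = PTR_ERR(zt);
		zuf_err("=>%d\n", zi_init.hdr.err);
		goto out;
	}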

> +}
> +
> +static int _zu_register_fs(struct file *file, void *parg)
> +{
> +       struct zufs_ioc_register_fs rfs;
> +       int err;
> +
> +       err = copy_from_user(&rfs, parg, sizeof(rfs));
> +       if (unlikely(err)) {
> +               zuf_err("=>%d\n", err);
> +               return err;
> +       }
> +
> +       err = zufr_register_fs(file->f_inode->i_sb, &rfs);
> +       if (err)
> +               zuf_err("=>%d\n", err);
> +       err = put_user(err, (int *)parg);
> +       return err;
> +}
> +
> +/* ~~~~ mounting ~~~~*/
> +int __zufc_dispatch_mount(struct zuf_root_info *zri,
> +                         enum e_mount_operation operation,
> +                         struct zufs_ioc_mount *zim)
> +{
> +       zim->hdr.operation = operation;
> +
> +       for (;;) {
> +               bool fss_waiting;
> +
> +               spin_lock(&zri->mount.lock);
> +
> +               if (unlikely(!zri->mount.zsf.file)) {
> +                       spin_unlock(&zri->mount.lock);
> +                       zuf_err("Server not up\n");
> +                       zim->hdr.err = -EIO;
> +                       return zim->hdr.err;
> +               }
> +
> +               fss_waiting = relay_is_fss_waiting_grab(&zri->mount.relay);
> +               if (fss_waiting)
> +                       break;
> +               /* in case of break above spin_unlock is done inside
> +                * relay_fss_wakeup_app_wait
> +                */
> +
> +               spin_unlock(&zri->mount.lock);
> +
> +               /* It is OK to wait if user storms mounts */
> +               zuf_dbg_verbose("waiting\n");
> +               msleep(100);
> +       }
> +
> +       zri->mount.zim = zim;
> +       relay_fss_wakeup_app_wait(&zri->mount.relay, &zri->mount.lock);
> +
> +       return zim->hdr.err;
> +}
> +
> +int zufc_dispatch_mount(struct zuf_root_info *zri, struct zus_fs_info *zus_zfi,
> +                       enum e_mount_operation operation,
> +                       struct zufs_ioc_mount *zim)
> +{
> +       zim->hdr.out_len = sizeof(*zim);
> +       zim->hdr.in_len = sizeof(*zim);
> +       if (operation == ZUFS_M_MOUNT || operation == ZUFS_M_REMOUNT)
> +               zim->hdr.in_len += zim->zmi.po.mount_options_len;
> +       zim->zmi.zus_zfi = zus_zfi;
> +       zim->zmi.num_cpu = zri->_ztp->_max_zts;
> +       zim->zmi.num_channels = zri->_ztp->_max_channels;
> +
> +       return __zufc_dispatch_mount(zri, operation, zim);
> +}
> +
> +static int _zu_mount(struct file *file, void *parg)
> +{
> +       struct super_block *sb = file->f_inode->i_sb;
> +       struct zuf_root_info *zri = ZRI(sb);
> +       bool waiting_for_reply;
> +       struct zufs_ioc_mount *zim;
> +       ulong cp_ret;
> +       int err;
> +
> +       spin_lock(&zri->mount.lock);
> +
> +       if (unlikely(!file->private_data)) {
> +               /* First time register this file as the mount-thread owner */
> +               zri->mount.zsf.type = zlfs_e_mout_thread;
> +               zri->mount.zsf.file = file;
> +               file->private_data = &zri->mount.zsf;
> +       } else if (unlikely(file->private_data != &zri->mount)) {
> +               spin_unlock(&zri->mount.lock);
> +               zuf_err("Say what?? %p != %p\n",
> +                       file->private_data, &zri->mount);
> +               return -EIO;
> +       }
> +
> +       zim = zri->mount.zim;
> +       zri->mount.zim = NULL;
> +       waiting_for_reply = zim && relay_is_app_waiting(&zri->mount.relay);
> +
> +       spin_unlock(&zri->mount.lock);
> +
> +       if (waiting_for_reply) {
> +               cp_ret = copy_from_user(zim, parg, zim->hdr.out_len);
> +               if (unlikely(cp_ret)) {
> +                       zuf_err("copy_from_user => %ld\n", cp_ret);
> +                        zim->hdr.err = -EFAULT;
> +               }
> +
> +               relay_app_wakeup(&zri->mount.relay);
> +       }
> +
> +       /* This gets to sleep until a mount comes */
> +       err = relay_fss_wait(&zri->mount.relay);
> +       if (unlikely(err || !zri->mount.zim)) {
> +               struct zufs_ioc_hdr *hdr = parg;
> +
> +               /* Released by _zu_break INTER or crash */
> +               zuf_dbg_zus("_zu_break? %p => %d\n", zri->mount.zim, err);
> +               put_user(ZUFS_OP_BREAK, &hdr->operation);
> +               put_user(EIO, &hdr->err);
> +               return err;
> +       }
> +
> +       zim = zri->mount.zim;
> +       cp_ret = copy_to_user(parg, zim, zim->hdr.in_len);
> +       if (unlikely(cp_ret)) {
> +               err = -EFAULT;
> +               zuf_err("copy_to_user =>%ld\n", cp_ret);
> +       }
> +       return err;
> +}
> +
> +static void zufc_mounter_release(struct file *file)
> +{
> +       struct zuf_root_info *zri = ZRI(file->f_inode->i_sb);
> +
> +       zuf_dbg_zus("closed fu=%d au=%d fw=%d aw=%d\n",
> +                 zri->mount.relay.fss_wakeup, zri->mount.relay.app_wakeup,
> +                 zri->mount.relay.fss_waiting, zri->mount.relay.app_waiting);
> +
> +       spin_lock(&zri->mount.lock);
> +       zri->mount.zsf.file = NULL;
> +       if (relay_is_app_waiting(&zri->mount.relay)) {
> +               zuf_err("server emergency exit while IO\n");
> +
> +               if (zri->mount.zim)
> +                       zri->mount.zim->hdr.err = -EIO;
> +               spin_unlock(&zri->mount.lock);
> +
> +               relay_app_wakeup(&zri->mount.relay);
> +               msleep(1000); /* crap */
> +       } else {
> +               if (zri->mount.zim)
> +                       zri->mount.zim->hdr.err = 0;
> +               spin_unlock(&zri->mount.lock);
> +       }
> +}
> +
> +/* ~~~~ ZU_IOC_NUMA_MAP ~~~~ */
> +static int _zu_numa_map(struct file *file, void *parg)
> +{
> +       struct zufs_ioc_numa_map *numa_map;
> +       int n_nodes = num_online_nodes();
> +       int n_cpus = num_online_cpus();
> +       uint *nodes_cpu_count;
> +       uint max_cpu_per_node = 0;
> +       uint alloc_size;
> +       int cpu, i, err;
> +
> +       alloc_size = sizeof(*numa_map) + n_cpus; /* char per cpu */
> +
> +       if ((n_nodes > 255) || (alloc_size > PAGE_SIZE)) {
> +               zuf_warn("!!!unexpected big machine with %d nodes alloc_size=0x%x\n",
> +                         n_nodes, alloc_size);
> +               return -ENOTSUPP;
> +       }
> +
> +       nodes_cpu_count = kcalloc(n_nodes, sizeof(uint), GFP_KERNEL);
> +       if (unlikely(!nodes_cpu_count))
> +               return -ENOMEM;
> +
> +       numa_map = kzalloc(alloc_size, GFP_KERNEL);
> +       if (unlikely(!numa_map)) {
> +               err = -ENOMEM;
> +               goto out;
> +       }
> +
> +       numa_map->possible_nodes        = num_possible_nodes();
> +       numa_map->possible_cpus         = num_possible_cpus();
> +
> +       numa_map->online_nodes          = n_nodes;
> +       numa_map->online_cpus           = n_cpus;
> +
> +       for_each_cpu(cpu, cpu_online_mask) {
> +               uint ctn  = cpu_to_node(cpu);
> +               uint ncc = ++nodes_cpu_count[ctn];
> +
> +               numa_map->cpu_to_node[cpu] = ctn;
> +               max_cpu_per_node = max(max_cpu_per_node, ncc);
> +       }
> +
> +       for (i = 1; i < n_nodes; ++i) {
> +               if (nodes_cpu_count[i] != nodes_cpu_count[0]) {
> +                       zuf_info("@[%d]=%d Unbalanced CPU sockets @[0]=%d\n",
> +                                 i, nodes_cpu_count[i], nodes_cpu_count[0]);
> +                       numa_map->nodes_not_symmetrical = true;
> +                       break;
> +               }
> +       }
> +
> +       numa_map->max_cpu_per_node = max_cpu_per_node;
> +
> +       zuf_dbg_verbose(
> +               "possible_nodes=%d possible_cpus=%d online_nodes=%d online_cpus=%d\n",
> +               numa_map->possible_nodes, numa_map->possible_cpus,
> +               n_nodes, n_cpus);
> +
> +       err = copy_to_user(parg, numa_map, alloc_size);
> +       kfree(numa_map);
> +out:
> +       kfree(nodes_cpu_count);
> +       return err;
> +}
> +
> +static int _map_pages(struct zufc_thread *zt, struct page **pages, uint nump,
> +                     bool map_readonly)
> +{
> +       int p, err;
> +
> +       if (!(zt->vma && pages && nump))
> +               return 0;
> +
> +       for (p = 0; p < nump; ++p) {
> +               ulong zt_addr = zt->vma->vm_start + p * PAGE_SIZE;
> +               ulong pfn = page_to_pfn(pages[p]);
> +               pfn_t pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV);
> +               vm_fault_t flt;
> +
> +               if (map_readonly)
> +                       flt = vmf_insert_mixed(zt->vma, zt_addr, pfnt);
> +               else
> +                       flt = vmf_insert_mixed_mkwrite(zt->vma, zt_addr, pfnt);
> +               err = zuf_flt_to_err(flt);
> +               if (unlikely(err)) {
> +                       zuf_err("zuf: remap_pfn_range => %d p=0x%x start=0x%lx\n",
> +                                err, p, zt->vma->vm_start);
> +                       return err;
> +               }
> +       }
> +       return 0;
> +}
> +
> +static void _unmap_pages(struct zufc_thread *zt, struct page **pages, uint nump)
> +{
> +       if (!(zt->vma && zt->zdo && pages && nump))
> +               return;
> +
> +       zt->zdo->pages = NULL;
> +       zt->zdo->nump = 0;
> +
> +       zap_vma_ptes(zt->vma, zt->vma->vm_start, nump * PAGE_SIZE);
> +}
> +
> +static void _fill_buff(ulong *buff, uint size)
> +{
> +       ulong *buff_end = buff + size;
> +       ulong val = 0;
> +
> +       for (; buff < buff_end; ++buff, ++val)
> +               *buff = val;
> +}
> +
> +static int _zu_init(struct file *file, void *parg)
> +{
> +       struct zufc_thread *zt;
> +       int cpu = smp_processor_id();
> +       struct zufs_ioc_init zi_init;
> +       int err;
> +
> +       err = copy_from_user(&zi_init, parg, sizeof(zi_init));
> +       if (unlikely(err)) {
> +               zuf_err("=>%d\n", err);
> +               return err;
> +       }
> +       if (unlikely(zi_init.channel_no >= ZUFS_MAX_ZT_CHANNELS)) {
> +               zuf_err("[%d] channel_no=%d\n", cpu, zi_init.channel_no);
> +               return -EINVAL;
> +       }
> +
> +       zuf_dbg_zus("[%d] aff=0x%lx channel=%d\n",
> +                   cpu, zi_init.affinity, zi_init.channel_no);
> +
> +       zi_init.hdr.err = _zt_from_f(file, cpu, zi_init.channel_no, &zt);
> +       if (unlikely(zi_init.hdr.err)) {
> +               zuf_err("=>%d\n", err);
> +               goto out;
> +       }
> +
> +       if (unlikely(zt->hdr.file)) {
> +               zi_init.hdr.err = -EINVAL;
> +               zuf_err("[%d] !!! thread already set\n", cpu);
> +               goto out;
> +       }
> +
> +       relay_init(&zt->relay);
> +       zt->hdr.type = zlfs_e_zt;
> +       zt->hdr.file = file;
> +       zt->no = cpu;
> +       zt->chan = zi_init.channel_no;
> +
> +       zt->max_zt_command = zi_init.max_command;
> +       zt->opt_buff = vmalloc(zi_init.max_command);
> +       if (unlikely(!zt->opt_buff)) {
> +               zi_init.hdr.err = -ENOMEM;
> +               goto out;
> +       }
> +       _fill_buff(zt->opt_buff, zi_init.max_command / sizeof(ulong));
> +
> +       file->private_data = &zt->hdr;
> +out:
> +       err = copy_to_user(parg, &zi_init, sizeof(zi_init));
> +       if (err)
> +               zuf_err("=>%d\n", err);
> +       return err;
> +}
> +
> +struct zufc_thread *_zt_from_f_private(struct file *file)
> +{
> +       struct zuf_special_file *zsf = file->private_data;
> +
> +       WARN_ON(zsf->type != zlfs_e_zt);
> +       return container_of(zsf, struct zufc_thread, hdr);
> +}
> +
> +/* Caller checks that file->private_data != NULL */
> +static void zufc_zt_release(struct file *file)
> +{
> +       struct zufc_thread *zt = _zt_from_f_private(file);
> +
> +       if (unlikely(zt->hdr.file != file))
> +               zuf_err("What happened zt->file(%p) != file(%p)\n",
> +                       zt->hdr.file, file);
> +
> +       zuf_dbg_zus("[%d] closed fu=%d au=%d fw=%d aw=%d\n",
> +                 zt->no, zt->relay.fss_wakeup, zt->relay.app_wakeup,
> +                 zt->relay.fss_waiting, zt->relay.app_waiting);
> +
> +       if (relay_is_app_waiting(&zt->relay)) {
> +               zuf_err("server emergency exit while IO\n");
> +
> +               /* NOTE: Do not call _unmap_pages the vma is gone */
> +               zt->hdr.file = NULL;
> +
> +               relay_app_wakeup(&zt->relay);
> +               msleep(1000); /* crap */
> +       }
> +
> +       vfree(zt->opt_buff);
> +       memset(zt, 0, sizeof(*zt));
> +}
> +
> +static int _copy_outputs(struct zufc_thread *zt, void *arg)
> +{
> +       struct zufs_ioc_hdr *hdr = zt->zdo->hdr;
> +       struct zufs_ioc_hdr *user_hdr = zt->opt_buff;
> +
> +       if (zt->opt_buff_vma->vm_start != (ulong)arg) {
> +               zuf_err("malicious Server\n");
> +               return -EINVAL;
> +       }
> +
> +       /* Update on the user out_len and return-code */
> +       hdr->err = user_hdr->err;
> +       hdr->out_len = user_hdr->out_len;
> +
> +       if (!hdr->out_len)
> +               return 0;
> +
> +       if ((hdr->err == -EZUFS_RETRY) || (hdr->out_max < hdr->out_len)) {
> +               if (WARN_ON(!zt->zdo->oh)) {
> +                       zuf_err("Trouble op(%s) out_max=%d out_len=%d\n",
> +                               zuf_op_name(hdr->operation),
> +                               hdr->out_max, hdr->out_len);
> +                       return -EFAULT;
> +               }
> +               zuf_dbg_zus("[%s] %d %d => %d\n",
> +                           zuf_op_name(hdr->operation),
> +                           hdr->out_max, hdr->out_len, hdr->err);
> +               return zt->zdo->oh(zt->zdo, zt->opt_buff, zt->max_zt_command);
> +       } else {
> +               void *rply = (void *)hdr + hdr->out_start;
> +               void *from = zt->opt_buff + hdr->out_start;
> +
> +               memcpy(rply, from, hdr->out_len);
> +               return 0;
> +       }
> +}
> +
> +static int _zu_wait(struct file *file, void *parg)
> +{
> +       struct zufc_thread *zt;
> +       int err;
> +
> +       zt = _zt_from_f_private(file);
> +       if (unlikely(!zt)) {
> +               zuf_err("Unexpected ZT state\n");
> +               err = -ERANGE;
> +               goto err;
> +       }
> +
> +       if (!zt->hdr.file || file != zt->hdr.file) {
> +               zuf_err("fatal\n");
> +               err = -E2BIG;
> +               goto err;
> +       }
> +       if (unlikely((ulong)parg != zt->opt_buff_vma->vm_start)) {
> +               zuf_err("fatal 2\n");
> +               err = -EINVAL;
> +               goto err;
> +       }
> +
> +       if (relay_is_app_waiting(&zt->relay)) {
> +               if (unlikely(!zt->zdo)) {
> +                       zuf_err("User has gone...\n");
> +                       err = -E2BIG;
> +                       goto err;
> +               } else {
> +                       /* overflow_handler might decide to execute the
> +                        *parg here at zus context and return to server
> +                        * If it also has an error to report to zus it
> +                        * will set zdo->hdr->err.
> +                        * EZUS_RETRY_DONE is when that happens.
> +                        * In this case pages stay mapped in zt->vma
> +                        */
> +                       err = _copy_outputs(zt, parg);
> +                       if (err == EZUF_RETRY_DONE) {
> +                               put_user(zt->zdo->hdr->err, (int *)parg);
> +                               return 0;
> +                       }
> +
> +                       _unmap_pages(zt, zt->zdo->pages, zt->zdo->nump);
> +                       zt->zdo = NULL;
> +                       if (unlikely(err)) /* _copy_outputs returned an err */
> +                               goto err;
> +               }
> +               relay_app_wakeup(&zt->relay);
> +       }
> +
> +       err = relay_fss_wait(&zt->relay);
> +       if (err)
> +               zuf_dbg_err("[%d] relay error: %d\n", zt->no, err);
> +
> +       if (zt->zdo &&  zt->zdo->hdr &&
> +           zt->zdo->hdr->operation < ZUFS_OP_BREAK) {
> +               /* call map here at the zuf thread so we need no locks
> +                * TODO: Currently only ZUFS_OP_WRITE protects user-buffers
> +                * we should have a bit set in zt->zdo->hdr set per operation.
> +                * TODO: Why this does not work?
> +                */
> +               _map_pages(zt, zt->zdo->pages, zt->zdo->nump, 0);
> +               memcpy(zt->opt_buff, zt->zdo->hdr, zt->zdo->hdr->in_len);
> +       } else {
> +               struct zufs_ioc_hdr *hdr = zt->opt_buff;
> +
> +               /* This Means we were released by _zu_break */
> +               zuf_dbg_zus("_zu_break? => %d\n", err);
> +               hdr->operation = ZUFS_OP_BREAK;
> +               hdr->err = err;
> +       }
> +
> +       return err;
> +
> +err:
> +       put_user(err, (int *)parg);
> +       return err;
> +}
> +
> +static int _try_grab_zt_channel(struct zuf_root_info *zri, int cpu,
> +                                struct zufc_thread **ztp)
> +{
> +       struct zufc_thread *zt;
> +       int c;
> +
> +       for (c = 0; ; ++c) {
> +               zt = _zt_from_cpu(zri, cpu, c);
> +               if (unlikely(!zt || !zt->hdr.file))
> +                       break;
> +
> +               if (relay_is_fss_waiting_grab(&zt->relay)) {
> +                       *ztp = zt;
> +                       return true;
> +               }
> +       }
> +
> +       *ztp = _zt_from_cpu(zri, cpu, 0);
> +       return false;
> +}
> +
> +#define _zuf_get_cpu() get_cpu()
> +#define _zuf_put_cpu() put_cpu()
> +
> +#ifdef CONFIG_ZUF_DEBUG
> +static
> +int _r_zufs_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo)
> +#else
> +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo)
> +#endif
> +{
> +       struct task_struct *app = get_current();
> +       struct zufs_ioc_hdr *hdr = zdo->hdr;
> +       int cpu, cpu2;
> +       struct zufc_thread *zt;
> +
> +       if (unlikely(hdr->out_len && !hdr->out_max)) {
> +               /* TODO: Complain here and let caller code do this proper */
> +               hdr->out_max = hdr->out_len;
> +       }
> +
> +channel_busy:
> +       cpu = _zuf_get_cpu();
> +
> +       if (!_try_grab_zt_channel(zri, cpu, &zt)) {
> +               _zuf_put_cpu();
> +
> +               /* If channel was grabbed then maybe a break_all is in progress
> +                * on a different CPU make sure zt->file on this core is
> +                * updated
> +                */
> +               mb();
> +               if (unlikely(!zt->hdr.file)) {
> +                       zuf_err("[%d] !zt->file\n", cpu);
> +                       return -EIO;
> +               }
> +               zuf_dbg_err("[%d] can this be\n", cpu);
> +               /* FIXME: Do something much smarter */
> +               msleep(10);
> +               if (signal_pending(get_current())) {
> +                       zuf_dbg_err("[%d] => EINTR\n", cpu);
> +                       return -EINTR;
> +               }
> +               goto channel_busy;
> +       }
> +
> +       /* lock app to this cpu while waiting */
> +       cpumask_copy(&zt->relay.cpus_allowed, &app->cpus_allowed);
> +       cpumask_copy(&app->cpus_allowed,  cpumask_of(smp_processor_id()));
> +
> +       zt->zdo = zdo;
> +
> +       _zuf_put_cpu();
> +
> +       relay_fss_wakeup_app_wait(&zt->relay, NULL);
> +
> +       /* restore cpu affinity after wakeup */
> +       cpumask_copy(&app->cpus_allowed, &zt->relay.cpus_allowed);
> +
> +cpu2 = smp_processor_id();
> +if (cpu2 != cpu)
> +       zuf_warn("App switched cpu1=%u cpu2=%u\n", cpu, cpu2);
> +
> +       return zt->hdr.file ? hdr->err : -EIO;
> +}
> +
> +const char *zuf_op_name(enum e_zufs_operation op)
> +{
> +#define CASE_ENUM_NAME(e) case e: return #e
> +       switch  (op) {
> +               CASE_ENUM_NAME(ZUFS_OP_BREAK            );
> +       default:
> +               return "UNKNOWN";
> +       }
> +}
> +
> +#ifdef CONFIG_ZUF_DEBUG
> +
> +#define MAX_ZT_SEC 5
> +int __zufc_dispatch(struct zuf_root_info *zri, struct zuf_dispatch_op *zdo)
> +{
> +       u64 t1, t2;
> +       int err;
> +
> +       t1 = ktime_get_ns();
> +       err = _r_zufs_dispatch(zri, zdo);
> +       t2 = ktime_get_ns();
> +
> +       if ((t2 - t1) > MAX_ZT_SEC * NSEC_PER_SEC)
> +               zuf_err("zufc_dispatch(%s, [0x%x-0x%x]) took %lld sec\n",
> +                       zuf_op_name(zdo->hdr->operation), zdo->hdr->offset,
> +                       zdo->hdr->len,
> +                       (t2 - t1) / NSEC_PER_SEC);
> +
> +       return err;
> +}
> +#endif /* def CONFIG_ZUF_DEBUG */
> +
> +/* ~~~ iomap_exec && exec_buffer allocation ~~~ */
> +struct zu_exec_buff {
> +       struct zuf_special_file hdr;
> +       struct vm_area_struct *vma;
> +       void *opt_buff;
> +       ulong alloc_size;
> +};
> +
> +/* Do some common checks and conversions */
> +static inline struct zu_exec_buff *_ebuff_from_file(struct file *file)
> +{
> +       struct zu_exec_buff *ebuff = file->private_data;
> +
> +       if (WARN_ON_ONCE(ebuff->hdr.type != zlfs_e_dpp_buff)) {
> +               zuf_err("Must call ZU_IOC_ALLOC_BUFFER first\n");
> +               return NULL;
> +       }
> +
> +       if (WARN_ON_ONCE(ebuff->hdr.file != file))
> +               return NULL;
> +
> +       return ebuff;
> +}
> +
> +static int _zu_ebuff_alloc(struct file *file, void *arg)
> +{
> +       struct zufs_ioc_alloc_buffer ioc_alloc;
> +       struct zu_exec_buff *ebuff;
> +       int err;
> +
> +       err = copy_from_user(&ioc_alloc, arg, sizeof(ioc_alloc));
> +       if (unlikely(err)) {
> +               zuf_err("=>%d\n", err);
> +               return err;
> +       }
> +
> +       if (ioc_alloc.init_size > ioc_alloc.max_size)
> +               return -EINVAL;
> +
> +       /* TODO: Easily Support growing */
> +       /* TODO: Support global pools, also easy */
> +       if (ioc_alloc.pool_no || ioc_alloc.init_size != ioc_alloc.max_size)
> +               return -ENOTSUPP;
> +
> +       ebuff = kzalloc(sizeof(*ebuff), GFP_KERNEL);
> +       if (unlikely(!ebuff))
> +               return -ENOMEM;
> +
> +       ebuff->hdr.type = zlfs_e_dpp_buff;
> +       ebuff->hdr.file = file;
> +       i_size_write(file->f_inode, ioc_alloc.max_size);
> +       ebuff->alloc_size =  ioc_alloc.init_size;
> +       ebuff->opt_buff = vmalloc(ioc_alloc.init_size);
> +       if (unlikely(!ebuff->opt_buff)) {
> +               kfree(ebuff);
> +               return -ENOMEM;
> +       }
> +       _fill_buff(ebuff->opt_buff, ioc_alloc.init_size / sizeof(ulong));
> +
> +       file->private_data = &ebuff->hdr;
> +       return 0;
> +}
> +
> +static void zufc_ebuff_release(struct file *file)
> +{
> +       struct zu_exec_buff *ebuff = _ebuff_from_file(file);
> +
> +       if (unlikely(!ebuff))
> +               return;
> +
> +       vfree(ebuff->opt_buff);
> +       ebuff->hdr.type = 0;
> +       ebuff->hdr.file = NULL; /* for none-dbg Kernels && use-after-free */
> +       kfree(ebuff);
> +}
> +
> +static int _zu_break(struct file *filp, void *parg)
> +{
> +       struct zuf_root_info *zri = ZRI(filp->f_inode->i_sb);
> +       int i, c;
> +
> +       zuf_dbg_core("enter\n");
> +       mb(); /* TODO how to schedule on all CPU's */
> +
> +       for (i = 0; i < zri->_ztp->_max_zts; ++i) {
> +               for (c = 0; c < zri->_ztp->_max_channels; ++c) {
> +                       struct zufc_thread *zt = _zt_from_cpu(zri, i, c);
> +
> +                       if (unlikely(!(zt && zt->hdr.file)))
> +                               continue;
> +                       relay_fss_wakeup(&zt->relay);
> +               }
> +       }
> +
> +       if (zri->mount.zsf.file)
> +               relay_fss_wakeup(&zri->mount.relay);
> +
> +       zuf_dbg_core("exit\n");
> +       return 0;
> +}
> +
>  long zufc_ioctl(struct file *file, unsigned int cmd, ulong arg)
>  {
> +       void __user *parg = (void __user *)arg;
> +
>         switch (cmd) {
> +       case ZU_IOC_REGISTER_FS:
> +               return _zu_register_fs(file, parg);
> +       case ZU_IOC_MOUNT:
> +               return _zu_mount(file, parg);
> +       case ZU_IOC_NUMA_MAP:
> +               return _zu_numa_map(file, parg);
> +       case ZU_IOC_INIT_THREAD:
> +               return _zu_init(file, parg);
> +       case ZU_IOC_WAIT_OPT:
> +               return _zu_wait(file, parg);
> +       case ZU_IOC_ALLOC_BUFFER:
> +               return _zu_ebuff_alloc(file, parg);
> +       case ZU_IOC_BREAK_ALL:
> +               return _zu_break(file, parg);
>         default:
> -               zuf_err("%d\n", cmd);
> +               zuf_err("%d %ld\n", cmd, ZU_IOC_WAIT_OPT);
>                 return -ENOTTY;
>         }
>  }
> @@ -38,11 +844,215 @@ int zufc_release(struct inode *inode, struct file *file)
>                 return 0;
> 
>         switch (zsf->type) {
> +       case zlfs_e_zt:
> +               zufc_zt_release(file);
> +               return 0;
> +       case zlfs_e_mout_thread:
> +               zufc_mounter_release(file);
> +               return 0;
> +       case zlfs_e_pmem:
> +               /* NOTHING to clean for pmem file yet */
> +               /* zuf_pmem_release(file);*/
> +               return 0;
> +       case zlfs_e_dpp_buff:
> +               zufc_ebuff_release(file);
> +               return 0;
>         default:
>                 return 0;
>         }
>  }
> 
> +/* ~~~~  mmap area of app buffers into server ~~~~ */
> +
> +static int zuf_zt_fault(struct vm_fault *vmf)
> +{
> +       zuf_err("should not fault\n");
> +       return VM_FAULT_SIGBUS;
> +}
> +
> +static const struct vm_operations_struct zuf_vm_ops = {
> +       .fault          = zuf_zt_fault,
> +};
> +
> +static int _zufc_zt_mmap(struct file *file, struct vm_area_struct *vma,
> +                        struct zufc_thread *zt)
> +{
> +       /* Tell Kernel We will only access on a single core */
> +       vma->vm_flags |= VM_MIXEDMAP;
> +       vma->vm_ops = &zuf_vm_ops;
> +
> +       zt->vma = vma;
> +
> +       zuf_dbg_core(
> +               "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n",
> +               _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags,
> +               vma->vm_pgoff);
> +
> +       return 0;
> +}
> +
> +/* ~~~~  mmap the Kernel allocated IOCTL buffer per ZT ~~~~ */
> +static int _opt_buff_mmap(struct vm_area_struct *vma, void *opt_buff,
> +                         ulong opt_size)
> +{
> +       ulong offset;
> +
> +       if (!opt_buff)
> +               return -ENOMEM;
> +
> +       for (offset = 0; offset < opt_size; offset += PAGE_SIZE) {
> +               ulong addr = vma->vm_start + offset;
> +               ulong pfn = vmalloc_to_pfn(opt_buff +  offset);
> +               pfn_t pfnt = phys_to_pfn_t(PFN_PHYS(pfn), PFN_MAP | PFN_DEV);
> +               int err;
> +
> +               zuf_dbg_verbose("[0x%lx] pfn-0x%lx addr=0x%lx buff=0x%lx\n",
> +                               offset, pfn, addr, (ulong)opt_buff + offset);
> +
> +               err = zuf_flt_to_err(vmf_insert_mixed_mkwrite(vma, addr, pfnt));
> +               if (unlikely(err)) {
> +                       zuf_err("zuf: zuf_insert_mixed_mkwrite => %d offset=0x%lx addr=0x%lx\n",
> +                                err, offset, addr);
> +                       return err;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int zuf_obuff_fault(struct vm_fault *vmf)
> +{
> +       struct vm_area_struct *vma = vmf->vma;
> +       struct zufc_thread *zt = _zt_from_f_private(vma->vm_file);
> +       long offset = (vmf->pgoff << PAGE_SHIFT) - ZUS_API_MAP_MAX_SIZE;
> +       int err;
> +
> +       zuf_dbg_core(
> +               "[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx offset=0x%lx\n",
> +               _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_pgoff,
> +               offset);
> +
> +       /* if Server overruns its buffer crash it dead */
> +       if (unlikely((offset < 0) || (zt->max_zt_command < offset))) {
> +               zuf_err("[0x%lx] start=0x%lx end=0x%lx file-start=0x%lx offset=0x%lx\n",
> +                       _zt_pr_no(zt), vma->vm_start,
> +                       vma->vm_end, vma->vm_pgoff, offset);
> +               return VM_FAULT_SIGBUS;
> +       }
> +
> +       /* We never released a zus-core.c that does not fault the
> +        * first page first. I want to see if this happens
> +        */
> +       if (unlikely(offset))
> +               zuf_warn("Suspicious server activity\n");
> +
> +       /* This faults only once at very first access */
> +       err = _opt_buff_mmap(vma, zt->opt_buff, zt->max_zt_command);
> +       if (unlikely(err))
> +               return VM_FAULT_SIGBUS;
> +
> +       return VM_FAULT_NOPAGE;
> +}
> +
> +static const struct vm_operations_struct zuf_obuff_ops = {
> +       .fault          = zuf_obuff_fault,
> +};
> +
> +static int _zufc_obuff_mmap(struct file *file, struct vm_area_struct *vma,
> +                           struct zufc_thread *zt)
> +{
> +       vma->vm_flags |= VM_MIXEDMAP;
> +       vma->vm_ops = &zuf_obuff_ops;
> +
> +       zt->opt_buff_vma = vma;
> +
> +       zuf_dbg_core(
> +               "[0x%lx] start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n",
> +               _zt_pr_no(zt), vma->vm_start, vma->vm_end, vma->vm_flags,
> +               vma->vm_pgoff);
> +
> +       return 0;
> +}
> +
> +/* ~~~ */
> +
> +static int zufc_zt_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       struct zufc_thread *zt = _zt_from_f_private(file);
> +
> +       /* We have two areas of mmap in this special file.
> +        * 0 to ZUS_API_MAP_MAX_SIZE:
> +        *      The first part where app pages are mapped
> +        *      into server per operation.
> +        * ZUS_API_MAP_MAX_SIZE of size zuf_root_info->max_zt_command
> +        *      Is where we map the per ZT ioctl-buffer, later passed
> +        *      to the zus_ioc_wait IOCTL call
> +        */
> +       if (vma->vm_pgoff == ZUS_API_MAP_MAX_SIZE / PAGE_SIZE)
> +               return _zufc_obuff_mmap(file, vma, zt);
> +
> +       /* zuf ZT API is very particular about where in its
> +        * special file we communicate
> +        */
> +       if (unlikely(vma->vm_pgoff))
> +               return -EINVAL;
> +
> +       return _zufc_zt_mmap(file, vma, zt);
> +}
> +
> +/* ~~~~ Implementation of the ZU_IOC_ALLOC_BUFFER mmap facility ~~~~ */
> +
> +static int zuf_ebuff_fault(struct vm_fault *vmf)
> +{
> +       struct vm_area_struct *vma = vmf->vma;
> +       struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file);
> +       long offset = (vmf->pgoff << PAGE_SHIFT);
> +       int err;
> +
> +       zuf_dbg_core("start=0x%lx end=0x%lx file-start=0x%lx file-off=0x%lx\n",
> +                    vma->vm_start, vma->vm_end, vma->vm_pgoff, offset);
> +
> +       /* if Server overruns its buffer crash it dead */
> +       if (unlikely((offset < 0) || (ebuff->alloc_size < offset))) {
> +               zuf_err("start=0x%lx end=0x%lx file-start=0x%lx file-off=0x%lx\n",
> +                       vma->vm_start, vma->vm_end, vma->vm_pgoff,
> +                       offset);
> +               return VM_FAULT_SIGBUS;
> +       }
> +
> +       /* We never released a zus-core.c that does not fault the
> +        * first page first. I want to see if this happens
> +        */
> +       if (unlikely(offset))
> +               zuf_warn("Suspicious server activity\n");
> +
> +       /* This faults only once at very first access */
> +       err = _opt_buff_mmap(vma, ebuff->opt_buff, ebuff->alloc_size);
> +       if (unlikely(err))
> +               return VM_FAULT_SIGBUS;
> +
> +       return VM_FAULT_NOPAGE;
> +}
> +
> +static const struct vm_operations_struct zuf_ebuff_ops = {
> +       .fault          = zuf_ebuff_fault,
> +};
> +
> +static int zufc_ebuff_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +       struct zu_exec_buff *ebuff = _ebuff_from_file(vma->vm_file);
> +
> +       vma->vm_flags |= VM_MIXEDMAP;
> +       vma->vm_ops = &zuf_ebuff_ops;
> +
> +       ebuff->vma = vma;
> +
> +       zuf_dbg_core("start=0x%lx end=0x%lx flags=0x%lx file-start=0x%lx\n",
> +                     vma->vm_start, vma->vm_end, vma->vm_flags, vma->vm_pgoff);
> +
> +       return 0;
> +}
> +
>  int zufc_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>         struct zuf_special_file *zsf = file->private_data;
> @@ -53,6 +1063,10 @@ int zufc_mmap(struct file *file, struct vm_area_struct *vma)
>         }
> 
>         switch (zsf->type) {
> +       case zlfs_e_zt:
> +               return zufc_zt_mmap(file, vma);
> +       case zlfs_e_dpp_buff:
> +               return zufc_ebuff_mmap(file, vma);
>         default:
>                 zuf_err("type=%d\n", zsf->type);
>                 return -ENOTTY;
> diff --git a/fs/zuf/zuf-root.c b/fs/zuf/zuf-root.c
> index 55a839dbc854..37b70ca33d3c 100644
> --- a/fs/zuf/zuf-root.c
> +++ b/fs/zuf/zuf-root.c
> @@ -227,6 +227,7 @@ static void zufr_put_super(struct super_block *sb)
>  {
>         struct zuf_root_info *zri = ZRI(sb);
> 
> +       zufc_zts_fini(zri);
>         _unregister_all_fses(zri);
> 
>         zuf_info("zuf_root umount\n");
> @@ -282,10 +283,16 @@ static int zufr_fill_super(struct super_block *sb, void *data, int silent)
>         root_i->i_fop = &zufr_file_dir_operations;
>         root_i->i_op = &zufr_inode_operations;
> 
> +       spin_lock_init(&zri->mount.lock);
>         mutex_init(&zri->sbl_lock);
> +       relay_init(&zri->mount.relay);
>         INIT_LIST_HEAD(&zri->fst_list);
>         INIT_LIST_HEAD(&zri->pmem_list);
> 
> +       err = zufc_zts_init(zri);
> +       if (unlikely(err))
> +               return err; /* put will be called we have a root */
> +
>         return 0;
>  }
> 
> diff --git a/fs/zuf/zuf.h b/fs/zuf/zuf.h
> index f979d8cbe60c..a33f5908155d 100644
> --- a/fs/zuf/zuf.h
> +++ b/fs/zuf/zuf.h
> @@ -23,9 +23,11 @@
>  #include <linux/xattr.h>
>  #include <linux/exportfs.h>
>  #include <linux/page_ref.h>
> +#include <linux/mm.h>
> 
>  #include "zus_api.h"
> 
> +#include "relay.h"
>  #include "_pr.h"
> 
>  enum zlfs_e_special_file {
> @@ -44,6 +46,8 @@ struct zuf_special_file {
>  struct zuf_root_info {
>         struct __mount_thread_info {
>                 struct zuf_special_file zsf;
> +               spinlock_t lock;
> +               struct relay relay;
>                 struct zufs_ioc_mount *zim;
>         } mount;
> 
> @@ -102,6 +106,48 @@ static inline struct zuf_inode_info *ZUII(struct inode *inode)
>         return container_of(inode, struct zuf_inode_info, vfs_inode);
>  }
> 
> +static inline struct zuf_fs_type *ZUF_FST(struct file_system_type *fs_type)
> +{
> +       return container_of(fs_type, struct zuf_fs_type, vfs_fst);
> +}
> +
> +static inline struct zuf_fs_type *zuf_fst(struct super_block *sb)
> +{
> +       return ZUF_FST(sb->s_type);
> +}
> +
> +struct zuf_dispatch_op;
> +typedef int (*overflow_handler)(struct zuf_dispatch_op *zdo, void *parg,
> +                               ulong zt_max_bytes);
> +struct zuf_dispatch_op {
> +       struct zufs_ioc_hdr *hdr;
> +       struct page **pages;
> +       uint nump;
> +       overflow_handler oh;
> +       struct super_block *sb;
> +       struct inode *inode;
> +};
> +
> +static inline void
> +zuf_dispatch_init(struct zuf_dispatch_op *zdo, struct zufs_ioc_hdr *hdr,
> +                struct page **pages, uint nump)
> +{
> +       memset(zdo, 0, sizeof(*zdo));
> +       zdo->hdr = hdr;
> +       zdo->pages = pages; zdo->nump = nump;
> +}
> +
> +static inline int zuf_flt_to_err(vm_fault_t flt)
> +{
> +       if (likely(flt == VM_FAULT_NOPAGE))
> +               return 0;
> +
> +       if (flt == VM_FAULT_OOM)
> +               return -ENOMEM;
> +
> +       return -EACCES;
> +}
> +
>  /* Keep this include last thing in file */
>  #include "_extern.h"
> 
> diff --git a/fs/zuf/zus_api.h b/fs/zuf/zus_api.h
> index 34e3e1a9a107..3319a70b5ccc 100644
> --- a/fs/zuf/zus_api.h
> +++ b/fs/zuf/zus_api.h
> @@ -66,6 +66,47 @@
> 
>  #endif /*  ndef __KERNEL__ */
> 
> +/* first available error code after include/linux/errno.h */
> +#define EZUFS_RETRY    531
> +
> +/* The below is private to zuf Kernel only. Is not exposed to VFS nor zus
> + * (defined here to allocate the constant)
> + */
> +#define EZUF_RETRY_DONE 540
> +
> +/**
> + * zufs dual port memory
> + * This is a special type of offset to either memory or persistent-memory,
> + * that is designed to be used in the interface mechanism between userspace
> + * and kernel, and can be accessed by both.
> + * 3 first bits denote a mem-pool:
> + * 0   - pmem pool
> + * 1-6 - established shared pool by a call to zufs_ioc_create_mempool (below)
> + * 7   - offset into app memory
> + */
> +typedef __u64 __bitwise zu_dpp_t;
> +
> +static inline uint zu_dpp_t_pool(zu_dpp_t t)
> +{
> +       return t & 0x7;
> +}
> +
> +static inline ulong zu_dpp_t_val(zu_dpp_t t)
> +{
> +       return t & ~0x7;
> +}
> +
> +static inline zu_dpp_t enc_zu_dpp_t(ulong v, uint pool)
> +{
> +       return v | pool;
> +}
> +
> +/* ~~~~~ ZUFS API ioctl commands ~~~~~ */
> +enum {
> +       ZUS_API_MAP_MAX_PAGES   = 1024,
> +       ZUS_API_MAP_MAX_SIZE    = ZUS_API_MAP_MAX_PAGES * PAGE_SIZE,
> +};
> +
>  struct zufs_ioc_hdr {
>         __u32 err;      /* IN/OUT must be first */
>         __u16 in_len;   /* How much to be copied *to* zus */
> @@ -102,4 +143,148 @@ struct zufs_ioc_register_fs {
>  };
>  #define ZU_IOC_REGISTER_FS     _IOWR('Z', 10, struct zufs_ioc_register_fs)
> 
> +/* A cookie from user-mode returned by mount */
> +struct zus_sb_info;
> +
> +/* zus cookie per inode */
> +struct zus_inode_info;
> +
> +enum ZUFS_M_FLAGS {
> +       ZUFS_M_PEDANTIC         = 0x00000001,
> +       ZUFS_M_EPHEMERAL        = 0x00000002,
> +       ZUFS_M_SILENT           = 0x00000004,
> +};
> +
> +struct zufs_parse_options {
> +       __u32 mount_options_len;
> +       __u32 pedantic;
> +       __u64 mount_flags;
> +       char mount_options[0];
> +};
> +
> +enum e_mount_operation {
> +       ZUFS_M_MOUNT    = 1,
> +       ZUFS_M_UMOUNT,
> +       ZUFS_M_REMOUNT,
> +       ZUFS_M_DDBG_RD,
> +       ZUFS_M_DDBG_WR,
> +};
> +
> +struct zufs_mount_info {
> +       /* IN */
> +       struct zus_fs_info *zus_zfi;
> +       __u16   num_cpu;
> +       __u16   num_channels;
> +       __u32   pmem_kern_id;
> +       __u64   sb_id;
> +
> +       /* OUT */
> +       struct zus_sb_info *zus_sbi;
> +       /* mount is also iget of root */
> +       struct zus_inode_info *zus_ii;
> +       zu_dpp_t _zi;
> +       __u64   old_mount_opt;
> +       __u64   remount_flags;
> +
> +       /* More FS specific info */
> +       __u32 s_blocksize_bits;
> +       __u8    acl_on;
> +       struct zufs_parse_options po;
> +};
> +
> +/* mount / umount */
> +struct  zufs_ioc_mount {
> +       struct zufs_ioc_hdr hdr;
> +       struct zufs_mount_info zmi;
> +};
> +#define ZU_IOC_MOUNT   _IOWR('Z', 11, struct zufs_ioc_mount)
> +
> +/* pmem  */
> +struct zufs_ioc_numa_map {
> +       /* Set by zus */
> +       struct zufs_ioc_hdr hdr;
> +
> +       __u32   possible_nodes;
> +       __u32   possible_cpus;
> +       __u32   online_nodes;
> +       __u32   online_cpus;
> +
> +       __u32   max_cpu_per_node;
> +
> +       /* This indicates that NOT all nodes have @max_cpu_per_node cpus */
> +       bool    nodes_not_symmetrical;
> +
> +       /* Variable size must keep last
> +        * size @online_cpus
> +        */
> +       __u8    cpu_to_node[];
> +};
> +#define ZU_IOC_NUMA_MAP        _IOWR('Z', 12, struct zufs_ioc_numa_map)
> +
> +/* ZT init */
> +enum { ZUFS_MAX_ZT_CHANNELS = 64 };
> +
> +struct zufs_ioc_init {
> +       struct zufs_ioc_hdr hdr;
> +       ulong affinity; /* IN */
> +       uint channel_no;
> +       uint max_command;
> +};
> +#define ZU_IOC_INIT_THREAD     _IOWR('Z', 14, struct zufs_ioc_init)
> +
> +/* break_all (Server telling kernel to clean) */
> +struct zufs_ioc_break_all {
> +       struct zufs_ioc_hdr hdr;
> +};
> +#define ZU_IOC_BREAK_ALL       _IOWR('Z', 15, struct zufs_ioc_break_all)
> +
> +/* ~~~  zufs_ioc_wait_operation ~~~ */
> +struct zufs_ioc_wait_operation {
> +       struct zufs_ioc_hdr hdr;
> +       /* maximum size is governed by zufs_ioc_init->max_command */
> +       char opt_buff[];
> +};
> +#define ZU_IOC_WAIT_OPT                _IOWR('Z', 16, struct zufs_ioc_wait_operation)
> +
> +/* These are the possible operations sent from Kernel to the Server in the
> + * return of the ZU_IOC_WAIT_OPT.
> + */
> +enum e_zufs_operation {
> +       ZUFS_OP_NULL = 0,
> +
> +       ZUFS_OP_BREAK,          /* Kernel telling Server to exit */
> +       ZUFS_OP_MAX_OPT,
> +};
> +
> +/* Allocate a special_file that will be a dual-port communication buffer with
> + * user mode.
> + * Server will access the buffer via the mmap of this file.
> + * Kernel will access the file via the valloc() pointer
> + *
> + * Some IOCTLs below demand use of this kind of buffer for communication
> + * TODO:
> + * pool_no is if we want to associate this buffer onto the 6 possible
> + * mem-pools per zuf_sbi. So anywhere we have a zu_dpp_t it will mean
> + * access from this pool.
> + * If pool_no is zero then it is private to only this file. In this case
> + * sb_id && zus_sbi are ignored / not needed.
> + */
> +struct zufs_ioc_alloc_buffer {
> +       struct zufs_ioc_hdr hdr;
> +       /* The ID of the super block received in mount */
> +       __u64   sb_id;
> +       /* We verify the sb_id validity against zus_sbi */
> +       struct zus_sb_info *zus_sbi;
> +       /* max size of buffer allowed (size of mmap) */
> +       __u32 max_size;
> +       /* allocate this much on initial call and set into vma */
> +       __u32 init_size;
> +
> +       /* TODO: These below are now set to ZERO. Need implementation */
> +       __u16 pool_no;
> +       __u16 flags;
> +       __u32 reserved;
> +};
> +#define ZU_IOC_ALLOC_BUFFER    _IOWR('Z', 17, struct zufs_ioc_init)
> +
>  #endif /* _LINUX_ZUFS_API_H */
> --
> 2.20.1
> 




