Signed-off-by: Bernd Schubert <bschubert@xxxxxxx>
---
 fs/fuse/dev.c             |   3 ++
 fs/fuse/dev_uring.c       | 114 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/dev_uring_i.h     |  22 +++++++++
 include/uapi/linux/fuse.h |   3 ++
 4 files changed, 142 insertions(+)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index bc77413932cf..349c1d16b0df 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2470,6 +2470,9 @@ const struct file_operations fuse_dev_operations = {
 	.fasync		= fuse_dev_fasync,
 	.unlocked_ioctl = fuse_dev_ioctl,
 	.compat_ioctl   = compat_ptr_ioctl,
+#if IS_ENABLED(CONFIG_FUSE_IO_URING)
+	.mmap = fuse_uring_mmap,
+#endif
 };
 EXPORT_SYMBOL_GPL(fuse_dev_operations);
 
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 702a994cf192..9491bdaa5716 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -120,3 +120,117 @@ void fuse_uring_ring_destruct(struct fuse_ring *ring)
 
 	mutex_destroy(&ring->start_stop_lock);
 }
+
+static inline int fuse_uring_current_nodeid(void)
+{
+	int cpu;
+	const struct cpumask *proc_mask = current->cpus_ptr;
+
+	cpu = cpumask_first(proc_mask);
+
+	return cpu_to_node(cpu);
+}
+
+static char *fuse_uring_alloc_queue_buf(int size, int node)
+{
+	char *buf;
+
+	if (size <= 0) {
+		pr_info("Invalid queue buf size: %d.\n", size);
+		return ERR_PTR(-EINVAL);
+	}
+
+	buf = vmalloc_node_user(size, node);
+	return buf ? buf : ERR_PTR(-ENOMEM);
+}
+
+/**
+ * fuse uring mmap, per ring queue.
+ * Userspace maps a kernel allocated ring/queue buffer. For numa awareness,
+ * userspace needs to do the mapping from a core bound thread.
+ */
+int
+fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct fuse_dev *fud = fuse_get_dev(filp);
+	struct fuse_conn *fc;
+	struct fuse_ring *ring;
+	size_t sz = vma->vm_end - vma->vm_start;
+	int ret;
+	struct fuse_uring_mbuf *new_node = NULL;
+	void *buf = NULL;
+	int nodeid;
+
+	if (vma->vm_pgoff << PAGE_SHIFT != FUSE_URING_MMAP_OFF) {
+		pr_debug("Invalid offset, expected %llu got %lu\n",
+			 FUSE_URING_MMAP_OFF, vma->vm_pgoff << PAGE_SHIFT);
+		return -EINVAL;
+	}
+
+	if (!fud)
+		return -ENODEV;
+	fc = fud->fc;
+	ring = fc->ring;
+	if (!ring)
+		return -ENODEV;
+
+	nodeid = ring->numa_aware ? fuse_uring_current_nodeid() : NUMA_NO_NODE;
+
+	/* check if uring is configured and if the requested size matches */
+	if (ring->nr_queues == 0 || ring->queue_depth == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (sz != ring->queue_buf_size) {
+		ret = -EINVAL;
+		pr_devel("mmap size mismatch, expected %zu got %zu\n",
+			 ring->queue_buf_size, sz);
+		goto out;
+	}
+
+	if (current->nr_cpus_allowed != 1 && ring->numa_aware) {
+		ret = -EINVAL;
+		pr_debug(
+			"Numa awareness, but thread has more than allowed cpu.\n");
+		goto out;
+	}
+
+	buf = fuse_uring_alloc_queue_buf(ring->queue_buf_size, nodeid);
+	if (IS_ERR(buf)) {
+		ret = PTR_ERR(buf);
+		goto out;
+	}
+
+	new_node = kmalloc(sizeof(*new_node), GFP_USER);
+	if (unlikely(new_node == NULL)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = remap_vmalloc_range(vma, buf, 0);
+	if (ret)
+		goto out;
+
+	mutex_lock(&ring->start_stop_lock);
+	/*
+	 * In this function we do not know the queue the buffer belongs to.
+	 * Later server side will pass the mmaped address, the kernel address
+	 * will be found through the map.
+	 */
+	new_node->kbuf = buf;
+	new_node->ubuf = (void *)vma->vm_start;
+	rb_add(&new_node->rb_node, &ring->mem_buf_map,
+	       fuse_uring_rb_tree_buf_less);
+	mutex_unlock(&ring->start_stop_lock);
+out:
+	if (ret) {
+		kfree(new_node);
+		vfree(buf);
+	}
+
+	pr_devel("%s: pid %d addr: %p sz: %zu ret: %d\n", __func__,
+		 current->pid, (char *)vma->vm_start, sz, ret);
+
+	return ret;
+}
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 58ab4671deff..c455ae0e729a 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -181,6 +181,7 @@ struct fuse_ring {
 
 void fuse_uring_abort_end_requests(struct fuse_ring *ring);
 int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg);
+int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma);
 int fuse_uring_queue_cfg(struct fuse_ring *ring,
 			 struct fuse_ring_queue_config *qcfg);
 void fuse_uring_ring_destruct(struct fuse_ring *ring);
@@ -208,6 +209,27 @@ static inline void fuse_uring_conn_destruct(struct fuse_conn *fc)
 	kfree(ring);
 }
 
+static inline int fuse_uring_rb_tree_buf_cmp(const void *key,
+					     const struct rb_node *node)
+{
+	const struct fuse_uring_mbuf *entry =
+		rb_entry(node, struct fuse_uring_mbuf, rb_node);
+
+	if (key == entry->ubuf)
+		return 0;
+
+	return (unsigned long)key < (unsigned long)entry->ubuf ?
+		-1 : 1;
+}
+
+static inline bool fuse_uring_rb_tree_buf_less(struct rb_node *node1,
+					       const struct rb_node *node2)
+{
+	const struct fuse_uring_mbuf *entry1 =
+		rb_entry(node1, struct fuse_uring_mbuf, rb_node);
+
+	return fuse_uring_rb_tree_buf_cmp(entry1->ubuf, node2) < 0;
+}
+
 static inline struct fuse_ring_queue *
 fuse_uring_get_queue(struct fuse_ring *ring, int qid)
 {
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 0449640f2501..00d0154ec2da 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -1259,4 +1259,7 @@ struct fuse_supp_groups {
 #define FUSE_RING_HEADER_BUF_SIZE 4096
 #define FUSE_RING_MIN_IN_OUT_ARG_SIZE 4096
 
+/* The offset parameter is used to identify the request type */
+#define FUSE_URING_MMAP_OFF 0xf8000000ULL
+
 #endif /* _LINUX_FUSE_H */
-- 
2.40.1