On Wed, May 29, 2024 at 08:00:42PM +0200, Bernd Schubert wrote: > Signed-off-by: Bernd Schubert <bschubert@xxxxxxx> > --- > fs/fuse/dev.c | 3 ++ > fs/fuse/dev_uring.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ > fs/fuse/dev_uring_i.h | 22 +++++++++ > include/uapi/linux/fuse.h | 3 ++ > 4 files changed, 142 insertions(+) > > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index bc77413932cf..349c1d16b0df 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -2470,6 +2470,9 @@ const struct file_operations fuse_dev_operations = { > .fasync = fuse_dev_fasync, > .unlocked_ioctl = fuse_dev_ioctl, > .compat_ioctl = compat_ptr_ioctl, > +#if IS_ENABLED(CONFIG_FUSE_IO_URING) I'm loathe to use #if IS_ENABLED() when we can use #ifdef CONFIG_FUSE_IO_URING which is more standard across the kernel. > + .mmap = fuse_uring_mmap, > +#endif > }; > EXPORT_SYMBOL_GPL(fuse_dev_operations); > > diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c > index 702a994cf192..9491bdaa5716 100644 > --- a/fs/fuse/dev_uring.c > +++ b/fs/fuse/dev_uring.c > @@ -120,3 +120,117 @@ void fuse_uring_ring_destruct(struct fuse_ring *ring) > > mutex_destroy(&ring->start_stop_lock); > } > + > +static inline int fuse_uring_current_nodeid(void) > +{ > + int cpu; > + const struct cpumask *proc_mask = current->cpus_ptr; > + > + cpu = cpumask_first(proc_mask); > + > + return cpu_to_node(cpu); You don't need this, just use numa_node_id(); > +} > + > +static char *fuse_uring_alloc_queue_buf(int size, int node) > +{ > + char *buf; > + > + if (size <= 0) { > + pr_info("Invalid queue buf size: %d.\n", size); > + return ERR_PTR(-EINVAL); > + } > + > + buf = vmalloc_node_user(size, node); > + return buf ? buf : ERR_PTR(-ENOMEM); > +} This is excessive, we base size off of ring->queue_buf_size, or the fuse_uring_mmap() size we get from the vma, which I don't think can ever be 0 or negative. I think we just validate that ->queue_buf_size is always correct, and if we're really worried about it in fuse_uring_mmap we validate that sz is correct there, and then we just use vmalloc_node_user() directly instead of having this helper. > + > +/** > + * fuse uring mmap, per ring qeuue. > + * Userpsace maps a kernel allocated ring/queue buffer. For numa awareness, > + * userspace needs to run the do the mapping from a core bound thread. > + */ > +int > +fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma) I'm not seeing anywhere else in the fuse code that has this style, I'd prefer we keep it consistent with the rest of the kernel and have int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma) additionally you're using the docstyle strings without the actual docstyle formatting, which is pissing off my git hooks that run checkpatch. Not a big deal, but if you're going to provide docstyle comments then please do it formatted properly, or just do a normal /* * fuse uring mmap.... */ > +{ > + struct fuse_dev *fud = fuse_get_dev(filp); > + struct fuse_conn *fc; > + struct fuse_ring *ring; > + size_t sz = vma->vm_end - vma->vm_start; > + int ret; > + struct fuse_uring_mbuf *new_node = NULL; > + void *buf = NULL; > + int nodeid; > + > + if (vma->vm_pgoff << PAGE_SHIFT != FUSE_URING_MMAP_OFF) { > + pr_debug("Invalid offset, expected %llu got %lu\n", > + FUSE_URING_MMAP_OFF, vma->vm_pgoff << PAGE_SHIFT); > + return -EINVAL; > + } > + > + if (!fud) > + return -ENODEV; > + fc = fud->fc; > + ring = fc->ring; > + if (!ring) > + return -ENODEV; > + > + nodeid = ring->numa_aware ? fuse_uring_current_nodeid() : NUMA_NO_NODE; nodeid = ring->numa_awayre ? numa_node_id() : NUMA_NO_NODE; > + > + /* check if uring is configured and if the requested size matches */ > + if (ring->nr_queues == 0 || ring->queue_depth == 0) { > + ret = -EINVAL; > + goto out; > + } > + > + if (sz != ring->queue_buf_size) { > + ret = -EINVAL; > + pr_devel("mmap size mismatch, expected %zu got %zu\n", > + ring->queue_buf_size, sz); > + goto out; > + } > + > + if (current->nr_cpus_allowed != 1 && ring->numa_aware) { > + ret = -EINVAL; > + pr_debug( > + "Numa awareness, but thread has more than allowed cpu.\n"); > + goto out; > + } > + > + buf = fuse_uring_alloc_queue_buf(ring->queue_buf_size, nodeid); > + if (IS_ERR(buf)) { > + ret = PTR_ERR(buf); > + goto out; > + } All of the above you can just return ret, you don't have to jump to out. > + > + new_node = kmalloc(sizeof(*new_node), GFP_USER); > + if (unlikely(new_node == NULL)) { > + ret = -ENOMEM; > + goto out; Here I would just if (unlikely(new_node == NULL)) { vfree(buf); return -ENOMEM; } > + } > + > + ret = remap_vmalloc_range(vma, buf, 0); > + if (ret) > + goto out; And since this is the only place we can fail with both things allocated I'd just if (ret) { vfree(buf); kfree(new_node); return ret; } and then drop the bit below where you free the buffers if there's an error. > + > + mutex_lock(&ring->start_stop_lock); > + /* > + * In this function we do not know the queue the buffer belongs to. > + * Later server side will pass the mmaped address, the kernel address > + * will be found through the map. > + */ > + new_node->kbuf = buf; > + new_node->ubuf = (void *)vma->vm_start; > + rb_add(&new_node->rb_node, &ring->mem_buf_map, > + fuse_uring_rb_tree_buf_less); > + mutex_unlock(&ring->start_stop_lock); > +out: > + if (ret) { > + kfree(new_node); > + vfree(buf); > + } > + > + pr_devel("%s: pid %d addr: %p sz: %zu ret: %d\n", __func__, > + current->pid, (char *)vma->vm_start, sz, ret); > + > + return ret; > +} > diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h > index 58ab4671deff..c455ae0e729a 100644 > --- a/fs/fuse/dev_uring_i.h > +++ b/fs/fuse/dev_uring_i.h > @@ -181,6 +181,7 @@ struct fuse_ring { > > void fuse_uring_abort_end_requests(struct fuse_ring *ring); > int fuse_uring_conn_cfg(struct fuse_ring *ring, struct fuse_ring_config *rcfg); > +int fuse_uring_mmap(struct file *filp, struct vm_area_struct *vma); > int fuse_uring_queue_cfg(struct fuse_ring *ring, > struct fuse_ring_queue_config *qcfg); > void fuse_uring_ring_destruct(struct fuse_ring *ring); > @@ -208,6 +209,27 @@ static inline void fuse_uring_conn_destruct(struct fuse_conn *fc) > kfree(ring); > } > > +static inline int fuse_uring_rb_tree_buf_cmp(const void *key, > + const struct rb_node *node) > +{ > + const struct fuse_uring_mbuf *entry = > + rb_entry(node, struct fuse_uring_mbuf, rb_node); > + > + if (key == entry->ubuf) > + return 0; > + > + return (unsigned long)key < (unsigned long)entry->ubuf ? -1 : 1; > +} > + > +static inline bool fuse_uring_rb_tree_buf_less(struct rb_node *node1, > + const struct rb_node *node2) > +{ > + const struct fuse_uring_mbuf *entry1 = > + rb_entry(node1, struct fuse_uring_mbuf, rb_node); > + > + return fuse_uring_rb_tree_buf_cmp(entry1->ubuf, node2) < 0; > +} > + These are only used in dev_uring.c, just put them in there instead of the header file. Thanks, Josef