This is a note to let you know that I've just added the patch titled

    io_uring/kbuf: defer release of mapped buffer rings

to the 6.6-stable tree which can be found at:

    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     io_uring-kbuf-defer-release-of-mapped-buffer-rings.patch
and it can be found in the queue-6.6 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.


>From c392cbecd8eca4c53f2bf508731257d9d0a21c2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@xxxxxxxxx>
Date: Mon, 27 Nov 2023 16:47:04 -0700
Subject: io_uring/kbuf: defer release of mapped buffer rings

From: Jens Axboe <axboe@xxxxxxxxx>

commit c392cbecd8eca4c53f2bf508731257d9d0a21c2d upstream.

If a provided buffer ring is setup with IOU_PBUF_RING_MMAP, then the
kernel allocates the memory for it and the application is expected to
mmap(2) this memory. However, io_uring uses remap_pfn_range() for this
operation, so we cannot rely on normal munmap/release on freeing them
for us.

Stash an io_buf_free entry away for each of these, if any, and provide
a helper to free them post ->release().

Cc: stable@xxxxxxxxxxxxxxx
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Reported-by: Jann Horn <jannh@xxxxxxxxxx>
Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
---
 include/linux/io_uring_types.h |    3 ++
 io_uring/io_uring.c            |    2 +
 io_uring/kbuf.c                |   44 ++++++++++++++++++++++++++++++++++++-----
 io_uring/kbuf.h                |    2 +
 4 files changed, 46 insertions(+), 5 deletions(-)

--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,9 @@ struct io_ring_ctx {
 
         struct list_head        io_buffers_cache;
 
+       /* deferred free list, protected by ->uring_lock */
+       struct hlist_head       io_buf_list;
+
         /* Keep this last, we don't need it for the fast path */
         struct wait_queue_head          poll_wq;
         struct io_restriction           restrictions;
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -323,6 +323,7 @@ static __cold struct io_ring_ctx *io_rin
         INIT_LIST_HEAD(&ctx->sqd_list);
         INIT_LIST_HEAD(&ctx->cq_overflow_list);
         INIT_LIST_HEAD(&ctx->io_buffers_cache);
+       INIT_HLIST_HEAD(&ctx->io_buf_list);
         io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
                             sizeof(struct io_rsrc_node));
         io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2942,6 +2943,7 @@ static __cold void io_ring_ctx_free(stru
                 ctx->mm_account = NULL;
         }
         io_rings_free(ctx);
+       io_kbuf_mmap_list_free(ctx);
 
         percpu_ref_exit(&ctx->refs);
         free_uid(ctx->user);
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -41,6 +41,11 @@ static struct io_buffer_list *__io_buffe
         return xa_load(&ctx->io_bl_xa, bgid);
 }
 
+struct io_buf_free {
+       struct hlist_node               list;
+       void                            *mem;
+};
+
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
                                                          unsigned int bgid)
 {
@@ -238,7 +243,10 @@ static int __io_remove_buffers(struct io
         if (bl->is_mapped) {
                 i = bl->buf_ring->tail - bl->head;
                 if (bl->is_mmap) {
-                       folio_put(virt_to_folio(bl->buf_ring));
+                       /*
+                        * io_kbuf_list_free() will free the page(s) at
+                        * ->release() time.
+                        */
                         bl->buf_ring = NULL;
                         bl->is_mmap = 0;
                 } else if (bl->buf_nr_pages) {
@@ -552,18 +560,28 @@ error_unpin:
         return -EINVAL;
 }
 
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+                             struct io_uring_buf_reg *reg,
                               struct io_buffer_list *bl)
 {
-       gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+       struct io_buf_free *ibf;
         size_t ring_size;
         void *ptr;
 
         ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-       ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+       ptr = io_mem_alloc(ring_size);
         if (!ptr)
                 return -ENOMEM;
 
+       /* Allocate and store deferred free entry */
+       ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+       if (!ibf) {
+               io_mem_free(ptr);
+               return -ENOMEM;
+       }
+       ibf->mem = ptr;
+       hlist_add_head(&ibf->list, &ctx->io_buf_list);
+
         bl->buf_ring = ptr;
         bl->is_mapped = 1;
         bl->is_mmap = 1;
@@ -622,7 +640,7 @@ int io_register_pbuf_ring(struct io_ring
         if (!(reg.flags & IOU_PBUF_RING_MMAP))
                 ret = io_pin_pbuf_ring(&reg, bl);
         else
-               ret = io_alloc_pbuf_ring(&reg, bl);
+               ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 
         if (!ret) {
                 bl->nr_entries = reg.ring_entries;
@@ -682,3 +700,19 @@ void *io_pbuf_get_address(struct io_ring
 
         return bl->buf_ring;
 }
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+       struct io_buf_free *ibf;
+       struct hlist_node *tmp;
+
+       hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+               hlist_del(&ibf->list);
+               io_mem_free(ibf->mem);
+               kfree(ibf);
+       }
+}
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -54,6 +54,8 @@ int io_provide_buffers(struct io_kiocb *
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
 unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);


Patches currently in stable-queue which might be from axboe@xxxxxxxxx are

queue-6.6/io_uring-enable-io_mem_alloc-free-to-be-used-in-other-parts.patch
queue-6.6/io_uring-kbuf-defer-release-of-mapped-buffer-rings.patch
queue-6.6/io_uring-don-t-allow-discontig-pages-for-ioring_setup_no_mmap.patch
queue-6.6/bcache-revert-replacing-is_err_or_null-with-is_err.patch
queue-6.6/powerpc-don-t-clobber-f0-vs0-during-fp-altivec-register-save.patch
queue-6.6/io_uring-free-io_buffer_list-entries-via-rcu.patch
queue-6.6/io_uring-don-t-guard-ioring_off_pbuf_ring-with-setup_no_mmap.patch
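
For readers less familiar with the user-facing side the commit message refers to: with
IOU_PBUF_RING_MMAP the application does not supply ring memory itself; it registers the
buffer ring and then mmap(2)s the kernel-allocated region, which is exactly the memory the
deferred-free list above releases at ->release() time. The sketch below is illustrative
only and not part of the patch. It assumes liburing (>= 2.4) and the 6.6 uapi constants
(IOU_PBUF_RING_MMAP, IORING_OFF_PBUF_RING, IORING_OFF_PBUF_SHIFT); BGID and RING_ENTRIES
are arbitrary example values, and the ring-size computation mirrors io_alloc_pbuf_ring()
in the hunk above.

/* Hedged userspace sketch: register a kernel-allocated provided buffer ring
 * (IOU_PBUF_RING_MMAP) and mmap it from the ring fd. Build with -luring.
 */
#include <liburing.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BGID            0       /* example buffer group id */
#define RING_ENTRIES    8       /* example ring size, must be a power of 2 */

int main(void)
{
        struct io_uring ring;
        struct io_uring_buf_reg reg;
        struct io_uring_buf_ring *br;
        size_t ring_size;
        off_t off;
        int ret;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0) {
                fprintf(stderr, "queue_init: %s\n", strerror(-ret));
                return 1;
        }

        /* Ask the kernel to allocate the ring memory: leave ring_addr at 0
         * and set IOU_PBUF_RING_MMAP in the registration flags.
         */
        memset(&reg, 0, sizeof(reg));
        reg.ring_entries = RING_ENTRIES;
        reg.bgid = BGID;
        reg.flags = IOU_PBUF_RING_MMAP;

        ret = io_uring_register_buf_ring(&ring, &reg, 0);
        if (ret) {
                fprintf(stderr, "register_buf_ring: %s\n", strerror(-ret));
                return 1;
        }

        /* Same size computation as io_alloc_pbuf_ring() in the patch above. */
        ring_size = RING_ENTRIES * sizeof(struct io_uring_buf_ring);

        /* The mmap offset selects the pbuf ring and encodes the group id. */
        off = IORING_OFF_PBUF_RING |
              (__u64)BGID << IORING_OFF_PBUF_SHIFT;
        br = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_POPULATE, ring.ring_fd, off);
        if (br == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        io_uring_buf_ring_init(br);
        /* ... provide buffers with io_uring_buf_ring_add() and
         *     io_uring_buf_ring_advance(), then submit requests ...
         */

        io_uring_queue_exit(&ring);
        return 0;
}

Note that the application never munmap()s this region to free it; as the commit message
explains, the mapping comes from remap_pfn_range(), which is why the kernel keeps its own
io_buf_free entry and releases the memory from io_kbuf_mmap_list_free() after ->release().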