Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@xxxxxxxxxxxxxxx Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn <jannh@xxxxxxxxxx> --- testcase: ``` #define _GNU_SOURCE #include <err.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/mman.h> #include <sys/syscall.h> #include <linux/io_uring.h> #define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1) \ err(1, "SYSCHK(" #x ")"); \ __res; \ }) #define NUM_SQ_PAGES 4 static int uring_fd; int main(void) { int memfd_sq = SYSCHK(memfd_create("", 0)); int memfd_cq = SYSCHK(memfd_create("", 0)); SYSCHK(ftruncate(memfd_sq, NUM_SQ_PAGES * 0x1000)); SYSCHK(ftruncate(memfd_cq, NUM_SQ_PAGES * 0x1000)); // sq void *sq_data = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_sq, 0)); // cq void *cq_data = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_cq, 0)); *(volatile unsigned int *)(cq_data+4) = 64 * NUM_SQ_PAGES; close(memfd_sq); close(memfd_cq); // initialize uring struct io_uring_params params = { .flags = IORING_SETUP_REGISTERED_FD_ONLY|IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY, .sq_off = { .user_addr = (unsigned long)sq_data }, .cq_off = { .user_addr = (unsigned long)cq_data } }; uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, ¶ms)); // re-execute execl("/proc/self/exe", NULL); err(1, "execve"); } ``` Leave it running for a while and monitor `grep ^filp /proc/slabinfo` and memory usage - without the patch, both will go up slowly but steadily. --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b526148054872ee513f665adea80eb6..85fe4e6b275c7de260ea9a8552b8e1c3e7f7e5ec 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29c6e7d8779bd7ff8317ebae003a8d..b1e888999cea137e043bf00372576861182857ac 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } --- base-commit: aef25be35d23ec768eed08bfcf7ca3cf9685bc28 change-id: 20241218-uring-reg-ring-cleanup-15dd7343194a -- Jann Horn <jannh@xxxxxxxxxx>