On 1/14/25 10:16 AM, Jann Horn wrote: > Hi, > > I noticed that io_uring's memory accounting behaves weirdly when > IORING_REGISTER_CLONE_BUFFERS is used to clone buffers from uring > instance A to uring instance B, where A and B use different MMs for > accounting. If I first close uring instance A and then uring instance > B, the pinned memory counters for uring instance B will be subtracted, > even though the pinned memory was originally accounted through uring > instance A; so the MM of uring instance B can end up with negative > locked memory. > > Here is a testcase: > ``` > #define _GNU_SOURCE > #include <err.h> > #include <unistd.h> > #include <stdio.h> > #include <stdlib.h> > #include <fcntl.h> > #include <sys/syscall.h> > #include <sys/mman.h> > #include <sys/uio.h> > #include <linux/io_uring.h> > > /* for building with outdated kernel headers */ > #if 1 > enum { > IORING_REGISTER_SRC_REGISTERED = (1U << 0), > IORING_REGISTER_DST_REPLACE = (1U << 1), > }; > struct io_uring_clone_buffers { > __u32 src_fd; > __u32 flags; > __u32 src_off; > __u32 dst_off; > __u32 nr; > __u32 pad[3]; > }; > #define IORING_REGISTER_CLONE_BUFFERS 30 > #endif > > #define SYSCHK(x) ({ \ > typeof(x) __res = (x); \ > if (__res == (typeof(x))-1) \ > err(1, "SYSCHK(" #x ")"); \ > __res; \ > }) > > #define NUM_SQ_PAGES 4 > static int uring_init(struct io_uring_sqe **sqesp, void **cqesp) { > struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, > PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); > void *cqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, > PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); > *(volatile unsigned int *)(cqes+4) = 64 * NUM_SQ_PAGES; > struct io_uring_params params = { > .flags = IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY, > .sq_off = { .user_addr = (unsigned long)sqes }, > .cq_off = { .user_addr = (unsigned long)cqes } > }; > int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, ¶ms)); > if (sqesp) > *sqesp = sqes; > if (cqesp) > *cqesp = cqes; > return uring_fd; > } > > int main(int argc, char **argv) { > if (argc == 1) { > int ring1 = uring_init(NULL, NULL); > SYSCHK(fcntl(ring1, F_SETFD, 0)); /* clear O_CLOEXEC */ > char *bufmem = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, > MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); > struct iovec reg_iov = { .iov_base = bufmem, .iov_len = 0x1000 }; > SYSCHK(syscall(__NR_io_uring_register, ring1, > IORING_REGISTER_BUFFERS, ®_iov, 1)); > char fd_str[100]; > sprintf(fd_str, "%d", ring1); > execlp(argv[0], argv[0], fd_str, NULL); > err(1, "reexec"); > } else if (argc == 2) { > int ring1 = atoi(argv[1]); > int ring2 = uring_init(NULL, NULL); > struct io_uring_clone_buffers arg = { > .src_fd = ring1, > .flags = 0, > .src_off = 0, > .dst_off = 0, > .nr = 1 > }; > SYSCHK(syscall(__NR_io_uring_register, ring2, > IORING_REGISTER_CLONE_BUFFERS, &arg, 1)); > close(ring1); > close(ring2); > system("cat /proc/$PPID/status"); > return 0; > } else { > errx(1, "please run without any arguments"); > } > } > ``` > > Result: > ``` > $ gcc -o uring-buf-deaccount uring-buf-deaccount.c > $ ./uring-buf-deaccount > Name: uring-buf-deacc > Umask: 0002 > State: S (sleeping) > Tgid: 1162 > Ngid: 0 > Pid: 1162 > PPid: 968 > TracerPid: 0 > Uid: 1000 1000 1000 1000 > Gid: 1000 1000 1000 1000 > FDSize: 256 > Groups: 1000 > NStgid: 1162 > NSpid: 1162 > NSpgid: 1162 > NSsid: 968 > Kthread: 0 > VmPeak: 2540 kB > VmSize: 2456 kB > VmLck: 0 kB > VmPin: 18446744073709551612 kB > VmHWM: 1264 kB > VmRSS: 1264 kB > RssAnon: 0 kB > RssFile: 1264 kB > RssShmem: 0 kB > [...] > ``` > > Note the "VmPin: 18446744073709551612 kB", that's 0xfffffffffffffffc or -4. > > This doesn't lead to anything particularly bad; it just means the > memory usage accounting is off. > > Commit 7cc2a6eadcd7 ("io_uring: add IORING_REGISTER_COPY_BUFFERS > method") says that the intended usecase for > IORING_REGISTER_CLONE_BUFFERS are thread pools; maybe a reasonable fix > would be to just refuse IORING_REGISTER_CLONE_BUFFERS between rings > with different accounting contexts (meaning different ->user or > ->mm_account)? If that restriction seems acceptable, I'd write a patch > for that. I think adding that restriction would be fine. The intended use case is sharing buffers anyway, which means they would need to be in the same address space to begin with. -- Jens Axboe