Hi, I noticed that io_uring's memory accounting behaves weirdly when IORING_REGISTER_CLONE_BUFFERS is used to clone buffers from uring instance A to uring instance B, where A and B use different MMs for accounting. If I first close uring instance A and then uring instance B, the pinned memory counters for uring instance B will be subtracted, even though the pinned memory was originally accounted through uring instance A; so the MM of uring instance B can end up with negative locked memory. Here is a testcase: ``` #define _GNU_SOURCE #include <err.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <sys/syscall.h> #include <sys/mman.h> #include <sys/uio.h> #include <linux/io_uring.h> /* for building with outdated kernel headers */ #if 1 enum { IORING_REGISTER_SRC_REGISTERED = (1U << 0), IORING_REGISTER_DST_REPLACE = (1U << 1), }; struct io_uring_clone_buffers { __u32 src_fd; __u32 flags; __u32 src_off; __u32 dst_off; __u32 nr; __u32 pad[3]; }; #define IORING_REGISTER_CLONE_BUFFERS 30 #endif #define SYSCHK(x) ({ \ typeof(x) __res = (x); \ if (__res == (typeof(x))-1) \ err(1, "SYSCHK(" #x ")"); \ __res; \ }) #define NUM_SQ_PAGES 4 static int uring_init(struct io_uring_sqe **sqesp, void **cqesp) { struct io_uring_sqe *sqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); void *cqes = SYSCHK(mmap(NULL, NUM_SQ_PAGES*0x1000, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)); *(volatile unsigned int *)(cqes+4) = 64 * NUM_SQ_PAGES; struct io_uring_params params = { .flags = IORING_SETUP_NO_MMAP|IORING_SETUP_NO_SQARRAY, .sq_off = { .user_addr = (unsigned long)sqes }, .cq_off = { .user_addr = (unsigned long)cqes } }; int uring_fd = SYSCHK(syscall(__NR_io_uring_setup, /*entries=*/10, ¶ms)); if (sqesp) *sqesp = sqes; if (cqesp) *cqesp = cqes; return uring_fd; } int main(int argc, char **argv) { if (argc == 1) { int ring1 = uring_init(NULL, NULL); SYSCHK(fcntl(ring1, F_SETFD, 0)); /* clear O_CLOEXEC */ char *bufmem = SYSCHK(mmap(NULL, 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)); struct iovec reg_iov = { .iov_base = bufmem, .iov_len = 0x1000 }; SYSCHK(syscall(__NR_io_uring_register, ring1, IORING_REGISTER_BUFFERS, ®_iov, 1)); char fd_str[100]; sprintf(fd_str, "%d", ring1); execlp(argv[0], argv[0], fd_str, NULL); err(1, "reexec"); } else if (argc == 2) { int ring1 = atoi(argv[1]); int ring2 = uring_init(NULL, NULL); struct io_uring_clone_buffers arg = { .src_fd = ring1, .flags = 0, .src_off = 0, .dst_off = 0, .nr = 1 }; SYSCHK(syscall(__NR_io_uring_register, ring2, IORING_REGISTER_CLONE_BUFFERS, &arg, 1)); close(ring1); close(ring2); system("cat /proc/$PPID/status"); return 0; } else { errx(1, "please run without any arguments"); } } ``` Result: ``` $ gcc -o uring-buf-deaccount uring-buf-deaccount.c $ ./uring-buf-deaccount Name: uring-buf-deacc Umask: 0002 State: S (sleeping) Tgid: 1162 Ngid: 0 Pid: 1162 PPid: 968 TracerPid: 0 Uid: 1000 1000 1000 1000 Gid: 1000 1000 1000 1000 FDSize: 256 Groups: 1000 NStgid: 1162 NSpid: 1162 NSpgid: 1162 NSsid: 968 Kthread: 0 VmPeak: 2540 kB VmSize: 2456 kB VmLck: 0 kB VmPin: 18446744073709551612 kB VmHWM: 1264 kB VmRSS: 1264 kB RssAnon: 0 kB RssFile: 1264 kB RssShmem: 0 kB [...] ``` Note the "VmPin: 18446744073709551612 kB", that's 0xfffffffffffffffc or -4. This doesn't lead to anything particularly bad; it just means the memory usage accounting is off. Commit 7cc2a6eadcd7 ("io_uring: add IORING_REGISTER_COPY_BUFFERS method") says that the intended usecase for IORING_REGISTER_CLONE_BUFFERS are thread pools; maybe a reasonable fix would be to just refuse IORING_REGISTER_CLONE_BUFFERS between rings with different accounting contexts (meaning different ->user or ->mm_account)? If that restriction seems acceptable, I'd write a patch for that.