Hello, I experienced stuck tasks during a process' exit when using multiple io_uring instances on a 48/96-core system in a multi-threaded environment, where we use an io_uring per thread and a single pipe(2) to pass messages between the threads. When the program calls exit(2) without joining the threads or unmapping/closing the io_urings, the program gets stuck in the zombie state - sometimes leaving behind multiple <cpu>:<n>-events kernel-threads using a considerable amount of CPU. I can reproduce this behavior on Debian running Linux 5.15.6 with the reproducer below compiled with Debian's gcc (10.2.1-6): // gcc -Werror -Wall -O3 hang-pipe-reproducer.c -o hang-pipe-reproducer -pthread -luring #include <assert.h> #include <err.h> #include <errno.h> #include <liburing.h> #include <pthread.h> #include <semaphore.h> #include <stdio.h> #include <stdlib.h> #include <sys/sysinfo.h> #include <unistd.h> #define IORING_ENTRIES 8 #define UNUSED __attribute((unused)) static pthread_t* threads; static pthread_barrier_t init_barrier; static int sleep_fd, notify_fd; static sem_t sem; void* thread_func(UNUSED void* arg) { struct io_uring ring; int res = io_uring_queue_init(IORING_ENTRIES, &ring, 0); if (res) err(EXIT_FAILURE, "io_uring_queue_init failed"); pthread_barrier_wait(&init_barrier); for(;;) { struct io_uring_sqe* sqe = io_uring_get_sqe(&ring); assert(sqe); uint64_t buf; io_uring_prep_read(sqe, sleep_fd, &buf, sizeof(buf), 0); int res = io_uring_submit_and_wait(&ring, 1); if (res < 0) err(EXIT_FAILURE, "io_uring_submit_and_wait failed"); struct io_uring_cqe* cqe; res = io_uring_peek_cqe(&ring, &cqe); assert(!res); if (cqe->res < 0) { errno = -cqe->res; err(EXIT_FAILURE, "read failed"); } assert(cqe->res == sizeof(buf)); sem_post(&sem); io_uring_cqe_seen(&ring, cqe); } return NULL; } int main() { int cpus = get_nprocs(); int res = pthread_barrier_init(&init_barrier, NULL, cpus); if (res) err(EXIT_FAILURE, "pthread_barrier_init failed"); res = sem_init(&sem, 0, 0); if (res) err(EXIT_FAILURE, "sem_init failed"); printf("start %d io_uring threads\n", cpus); threads = malloc(sizeof(pthread_t) * cpus); if (!threads) err(EXIT_FAILURE, "malloc failed"); int fds[2]; res = pipe(fds); if (res) err(EXIT_FAILURE, "pipe failed"); sleep_fd = fds[0]; notify_fd = fds[1]; for (unsigned i = 0; i < cpus; ++i) { errno = pthread_create(&threads[i], NULL, thread_func, NULL); if (errno) err(EXIT_FAILURE, "pthread_create failed"); } // Write #cpus notifications printf("write %d notifications\n", cpus); const uint64_t n = 0x42; for (unsigned i = 0; i < cpus; ++i) { res = write(notify_fd, &n, sizeof(n)); if (res < 0) err(EXIT_FAILURE, "write failed"); assert(res == sizeof(n)); } // Await that all notifications were received for (unsigned i = 0; i < cpus; ++i) { sem_wait(&sem); } // Exit without resource cleanup exit(EXIT_SUCCESS); } Kernel info message about the hung task: INFO: task hang-pipe-repro:404364 blocked for more than 845 seconds. Tainted: G E 5.15.6 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:hang-pipe-repro state:D stack: 0 pid:404364 ppid: 19554 flags:0x00024004 Call Trace: <TASK> ? usleep_range+0x80/0x80 __schedule+0x2eb/0x910 ? usleep_range+0x80/0x80 schedule+0x44/0xa0 schedule_timeout+0xfc/0x140 ? __prepare_to_swait+0x4b/0x70 __wait_for_common+0xae/0x160 io_wq_put_and_exit+0xf9/0x330 io_uring_cancel_generic+0x200/0x2e0 ? finish_wait+0x80/0x80 do_exit+0xba/0xa90 do_group_exit+0x33/0xa0 get_signal+0x170/0x910 arch_do_signal_or_restart+0xf0/0x7a0 ? __schedule+0x2f3/0x910 ? __queue_work+0x1c8/0x3d0 exit_to_user_mode_prepare+0x119/0x180 syscall_exit_to_user_mode+0x23/0x40 do_syscall_64+0x48/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2df15c59b9 RSP: 002b:00007f2dd4434de8 EFLAGS: 00000212 ORIG_RAX: 00000000000001aa RAX: 0000000000000001 RBX: 00007f2dd4434e30 RCX: 00007f2df15c59b9 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 000000000000003f RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000001 R11: 0000000000000212 R12: 00007f2dd4434e20 R13: 00007ffc8577b38f R14: 00007f2dd4434fc0 R15: 0000000000802000 </TASK> The trace ran through scripts/decode_stacktrace.sh Call Trace: <TASK> ? usleep_range (kernel/time/timer.c:1843) __schedule (kernel/sched/core.c:4944 kernel/sched/core.c:6291) ? usleep_range (kernel/time/timer.c:1843) schedule (./arch/x86/include/asm/bitops.h:207 (discriminator 1) ./include/asm-generic/bitops/instrumented-non-atomic.h:135 (discriminator 1) ./include/linux/thread_info.h:118 (discriminator 1) ./include/linux/sched.h:2107 (discriminator 1) kernel/sched/core.c:6372 (discriminator 1)) schedule_timeout (kernel/time/timer.c:1858) ? __prepare_to_swait (./include/linux/list.h:67 ./include/linux/list.h:100 kernel/sched/swait.c:89) __wait_for_common (kernel/sched/completion.c:86 kernel/sched/completion.c:106) io_wq_put_and_exit (./include/asm-generic/bitops/find.h:117 ./include/linux/nodemask.h:265 fs/io-wq.c:1216 fs/io-wq.c:1249) io_uring_cancel_generic (fs/io_uring.c:9753 fs/io_uring.c:9832) ? finish_wait (kernel/sched/wait.c:408) do_exit (kernel/exit.c:781) do_group_exit (./include/linux/sched/signal.h:269 kernel/exit.c:905) get_signal (./arch/x86/include/asm/current.h:15 kernel/signal.c:2758) arch_do_signal_or_restart (arch/x86/kernel/signal.c:865 (discriminator 1)) ? __schedule (kernel/sched/core.c:6299) ? __queue_work (./arch/x86/include/asm/paravirt.h:590 ./arch/x86/include/asm/qspinlock.h:56 ./include/linux/spinlock.h:216 ./include/linux/spinlock_api_smp.h:151 kernel/workqueue.c:1522) exit_to_user_mode_prepare (kernel/entry/common.c:174 kernel/entry/common.c:207) syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:289 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:131 kernel/entry/common.c:302) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) RIP: 0033:0x7f2df15c59b9 RSP: 002b:00007f2dd4434de8 EFLAGS: 00000212 ORIG_RAX: 00000000000001aa RAX: 0000000000000001 RBX: 00007f2dd4434e30 RCX: 00007f2df15c59b9 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 000000000000003f RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000001 R11: 0000000000000212 R12: 00007f2dd4434e20 R13: 00007ffc8577b38f R14: 00007f2dd4434fc0 R15: 0000000000802000 </TASK> Using a 5.14 kernel the reproducer exits immediately. Florian Fischer