Hi there, Recently we found a possible deadlock in __schedule, which is triggered together with BPF functionalities. The reproducer is available and we successfully reproduce this bug with the up-to-date source code of https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git (commit: fcc79e1714e8c2b8e216dc3149812edd37884eef). The final locking scenario is very similar to one existing report from syzbot: https://lore.kernel.org/bpf/00000000000034c0520608242de6@xxxxxxxxxx/T/, while the call traces are different, and the aforementioned one seems to be ignored due to the lack of reproducer. Syzkaller hit 'possible deadlock in __schedule' bug. </TASK> FAULT_INJECTION: forcing a failure. name fail_usercopy, interval 1, probability 0, space 0, times 0 ====================================================== WARNING: possible circular locking dependency detected 6.12.0-rc7-00144-g66418447d27b #8 Not tainted ------------------------------------------------------ syz-executor144/330 is trying to acquire lock: ffffffffbcd2da38 ((console_sem).lock){....}-{2:2}, at: down_trylock+0x20/0xa0 kernel/locking/semaphore.c:139 but task is already holding lock: ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested kernel/sched/core.c:598 [inline] ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock kernel/sched/sched.h:1506 [inline] ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: rq_lock kernel/sched/sched.h:1805 [inline] ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0x140/0x1e70 kernel/sched/core.c:6592 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&rq->__lock){-.-.}-{2:2}: lock_acquire+0x138/0x300 kernel/locking/lockdep.c:5825 _raw_spin_lock_nested+0x29/0x40 kernel/locking/spinlock.c:378 raw_spin_rq_lock_nested kernel/sched/core.c:598 [inline] raw_spin_rq_lock kernel/sched/sched.h:1506 [inline] __task_rq_lock+0xcf/0x410 kernel/sched/core.c:676 wake_up_new_task+0x4db/0x9b0 kernel/sched/core.c:4825 kernel_clone+0x356/0x5a0 kernel/fork.c:2817 user_mode_thread+0xd6/0x120 kernel/fork.c:2864 rest_init+0x23/0x300 init/main.c:712 start_kernel+0x4a3/0x540 init/main.c:1105 x86_64_start_reservations+0x24/0x30 arch/x86/kernel/head64.c:507 x86_64_start_kernel+0x79/0x80 arch/x86/kernel/head64.c:488 common_startup_64+0x12c/0x137 -> #1 (&p->pi_lock){-.-.}-{2:2}: lock_acquire+0x138/0x300 kernel/locking/lockdep.c:5825 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x8e/0xc0 kernel/locking/spinlock.c:162 class_raw_spinlock_irqsave_constructor include/linux/spinlock.h:551 [inline] try_to_wake_up+0x5f/0xe10 kernel/sched/core.c:4165 up+0x72/0x90 kernel/locking/semaphore.c:191 __up_console_sem kernel/printk/printk.c:343 [inline] __console_unlock+0xc3/0xf0 kernel/printk/printk.c:2844 __console_flush_and_unlock kernel/printk/printk.c:3241 [inline] console_unlock+0x16c/0x390 kernel/printk/printk.c:3279 vprintk_emit+0x670/0xb90 kernel/printk/printk.c:2407 dev_vprintk_emit+0x2bc/0x360 drivers/base/core.c:4942 dev_printk_emit+0x82/0xb0 drivers/base/core.c:4953 _dev_warn+0xcb/0xf0 drivers/base/core.c:5009 _request_firmware+0xe4e/0xf00 drivers/base/firmware_loader/main.c:938 request_firmware_work_func+0xda/0x1f0 drivers/base/firmware_loader/main.c:1195 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0x93b/0x13a0 kernel/workqueue.c:3310 worker_thread+0xa53/0xfc0 kernel/workqueue.c:3391 kthread+0x301/0x3b0 kernel/kthread.c:389 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 -> #0 ((console_sem).lock){....}-{2:2}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x1b82/0x5750 kernel/locking/lockdep.c:3904 __lock_acquire+0x139d/0x2040 kernel/locking/lockdep.c:5202 lock_acquire+0x138/0x300 kernel/locking/lockdep.c:5825 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x8e/0xc0 kernel/locking/spinlock.c:162 down_trylock+0x20/0xa0 kernel/locking/semaphore.c:139 __down_trylock_console_sem+0x99/0x120 kernel/printk/printk.c:326 console_trylock kernel/printk/printk.c:2827 [inline] console_trylock_spinning kernel/printk/printk.c:1990 [inline] vprintk_emit+0x414/0xb90 kernel/printk/printk.c:2406 _printk+0x7a/0xa0 kernel/printk/printk.c:2432 fail_dump lib/fault-inject.c:46 [inline] should_fail_ex+0x3be/0x570 lib/fault-inject.c:154 strncpy_from_user+0x36/0x230 lib/strncpy_from_user.c:118 strncpy_from_user_nofault+0x71/0x140 mm/maccess.c:186 bpf_probe_read_user_str_common kernel/trace/bpf_trace.c:215 [inline] ____bpf_probe_read_user_str kernel/trace/bpf_trace.c:224 [inline] bpf_probe_read_user_str+0x2a/0x70 kernel/trace/bpf_trace.c:221 bpf_prog_bc7c5c6b9645592f+0x3e/0x40 bpf_dispatcher_nop_func include/linux/bpf.h:1265 [inline] __bpf_prog_run include/linux/filter.h:701 [inline] bpf_prog_run include/linux/filter.h:708 [inline] __bpf_trace_run kernel/trace/bpf_trace.c:2316 [inline] bpf_trace_run4+0x30b/0x4d0 kernel/trace/bpf_trace.c:2359 __bpf_trace_sched_switch+0x1c6/0x2c0 include/trace/events/sched.h:222 trace_sched_switch+0x12a/0x190 include/trace/events/sched.h:222 __schedule+0xade/0x1e70 kernel/sched/core.c:6690 __schedule_loop kernel/sched/core.c:6770 [inline] schedule+0xab/0x260 kernel/sched/core.c:6785 exit_to_user_mode_loop kernel/entry/common.c:102 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x11d/0x250 kernel/entry/common.c:218 do_syscall_64+0xed/0x1c0 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x67/0x6f other info that might help us debug this: Chain exists of: (console_sem).lock --> &p->pi_lock --> &rq->__lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&rq->__lock); lock(&p->pi_lock); lock(&rq->__lock); lock((console_sem).lock); *** DEADLOCK *** 2 locks held by syz-executor144/330: #0: ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock_nested kernel/sched/core.c:598 [inline] #0: ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: raw_spin_rq_lock kernel/sched/sched.h:1506 [inline] #0: ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: rq_lock kernel/sched/sched.h:1805 [inline] #0: ffff888065cbd718 (&rq->__lock){-.-.}-{2:2}, at: __schedule+0x140/0x1e70 kernel/sched/core.c:6592 #1: ffffffffbce55000 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #1: ffffffffbce55000 (rcu_read_lock){....}-{1:2}, at: rcu_read_lock include/linux/rcupdate.h:849 [inline] #1: ffffffffbce55000 (rcu_read_lock){....}-{1:2}, at: __bpf_trace_run kernel/trace/bpf_trace.c:2315 [inline] #1: ffffffffbce55000 (rcu_read_lock){....}-{1:2}, at: bpf_trace_run4+0x21e/0x4d0 kernel/trace/bpf_trace.c:2359 Syzkaller reproducer: # {Threaded:false Repeat:true RepeatTimes:0 Procs:1 Slowdown:1 Sandbox: SandboxArg:0 Leak:false NetInjection:false NetDevices:false NetReset:false Cgroups:false BinfmtMisc:false CloseFDs:false KCSAN:false DevlinkPCI:false NicVF:false USB:false VhciInjection:false Wifi:false IEEE802154:false Sysctl:false Swap:false UseTmpDir:false HandleSegv:false Trace:false LegacyOptions:{Collide:false Fault:false FaultCall:0 FaultNth:0}} r0 = bpf$PROG_LOAD(0x5, &(0x7f00000000c0)={0x11, 0xb, &(0x7f0000000640)=ANY=[@ANYBLOB="18000000000000000000000000000000180100002020702500000000002020207b1af8ff00000000bfa100000000000007010000f8ffffffb702000004000000b703000000000000850000007200000095"], &(0x7f0000000200)='GPL\x00', 0x0, 0x0, 0x0, 0x0, 0x0, '\x00', 0x0, @fallback, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @void, @value}, 0x90) bpf$BPF_RAW_TRACEPOINT_OPEN(0x11, &(0x7f0000000400)={&(0x7f0000000040)='sched_switch\x00', r0}, 0x10) bpf$MAP_CREATE(0x0, 0x0, 0x0) (fail_nth: 1) ... Omitted ... C reproducer: // autogenerated by syzkaller (https://github.com/google/syzkaller) #define _GNU_SOURCE #include <dirent.h> #include <endian.h> #include <errno.h> #include <fcntl.h> #include <signal.h> #include <stdarg.h> #include <stdbool.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/prctl.h> #include <sys/stat.h> #include <sys/syscall.h> #include <sys/types.h> #include <sys/wait.h> #include <time.h> #include <unistd.h> #ifndef __NR_bpf #define __NR_bpf 321 #endif static void sleep_ms(uint64_t ms) { usleep(ms * 1000); } static uint64_t current_time_ms(void) { struct timespec ts; if (clock_gettime(CLOCK_MONOTONIC, &ts)) exit(1); return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000; } static bool write_file(const char* file, const char* what, ...) { char buf[1024]; va_list args; va_start(args, what); vsnprintf(buf, sizeof(buf), what, args); va_end(args); buf[sizeof(buf) - 1] = 0; int len = strlen(buf); int fd = open(file, O_WRONLY | O_CLOEXEC); if (fd == -1) return false; if (write(fd, buf, len) != len) { int err = errno; close(fd); errno = err; return false; } close(fd); return true; } static int inject_fault(int nth) { int fd; fd = open("/proc/thread-self/fail-nth", O_RDWR); if (fd == -1) exit(1); char buf[16]; sprintf(buf, "%d", nth); if (write(fd, buf, strlen(buf)) != (ssize_t)strlen(buf)) exit(1); return fd; } static void kill_and_wait(int pid, int* status) { kill(-pid, SIGKILL); kill(pid, SIGKILL); for (int i = 0; i < 100; i++) { if (waitpid(-1, status, WNOHANG | __WALL) == pid) return; usleep(1000); } DIR* dir = opendir("/sys/fs/fuse/connections"); if (dir) { for (;;) { struct dirent* ent = readdir(dir); if (!ent) break; if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue; char abort[300]; snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort", ent->d_name); int fd = open(abort, O_WRONLY); if (fd == -1) { continue; } if (write(fd, abort, 1) < 0) { } close(fd); } closedir(dir); } else { } while (waitpid(-1, status, __WALL) != pid) { } } static void setup_test() { prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0); setpgrp(); write_file("/proc/self/oom_score_adj", "1000"); } static const char* setup_fault() { int fd = open("/proc/self/make-it-fail", O_WRONLY); if (fd == -1) return "CONFIG_FAULT_INJECTION is not enabled"; close(fd); fd = open("/proc/thread-self/fail-nth", O_WRONLY); if (fd == -1) return "kernel does not have systematic fault injection support"; close(fd); static struct { const char* file; const char* val; bool fatal; } files[] = { {"/sys/kernel/debug/failslab/ignore-gfp-wait", "N", true}, {"/sys/kernel/debug/fail_futex/ignore-private", "N", false}, {"/sys/kernel/debug/fail_page_alloc/ignore-gfp-highmem", "N", false}, {"/sys/kernel/debug/fail_page_alloc/ignore-gfp-wait", "N", false}, {"/sys/kernel/debug/fail_page_alloc/min-order", "0", false}, }; unsigned i; for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) { if (!write_file(files[i].file, files[i].val)) { if (files[i].fatal) return "failed to write fault injection file"; } } return NULL; } static void execute_one(void); #define WAIT_FLAGS __WALL static void loop(void) { int iter = 0; for (;; iter++) { int pid = fork(); if (pid < 0) exit(1); if (pid == 0) { setup_test(); execute_one(); exit(0); } int status = 0; uint64_t start = current_time_ms(); for (;;) { sleep_ms(10); if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid) break; if (current_time_ms() - start < 5000) continue; kill_and_wait(pid, &status); break; } } } uint64_t r[1] = {0xffffffffffffffff}; void execute_one(void) { intptr_t res = 0; if (write(1, "executing program\n", sizeof("executing program\n") - 1)) { } *(uint32_t*)0x200000c0 = 0x11; *(uint32_t*)0x200000c4 = 0xb; *(uint64_t*)0x200000c8 = 0x20000640; memcpy((void*)0x20000640, "\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" "\x01\x00\x00\x20\x20\x70\x25\x00\x00\x00\x00\x00\x20\x20\x20\x7b\x1a" "\xf8\xff\x00\x00\x00\x00\xbf\xa1\x00\x00\x00\x00\x00\x00\x07\x01\x00" "\x00\xf8\xff\xff\xff\xb7\x02\x00\x00\x04\x00\x00\x00\xb7\x03\x00\x00" "\x00\x00\x00\x00\x85\x00\x00\x00\x72\x00\x00\x00\x95", 81); *(uint64_t*)0x200000d0 = 0x20000200; memcpy((void*)0x20000200, "GPL\000", 4); *(uint32_t*)0x200000d8 = 0; *(uint32_t*)0x200000dc = 0; *(uint64_t*)0x200000e0 = 0; *(uint32_t*)0x200000e8 = 0; *(uint32_t*)0x200000ec = 0; memset((void*)0x200000f0, 0, 16); *(uint32_t*)0x20000100 = 0; *(uint32_t*)0x20000104 = 0; *(uint32_t*)0x20000108 = -1; *(uint32_t*)0x2000010c = 0; *(uint64_t*)0x20000110 = 0; *(uint32_t*)0x20000118 = 0; *(uint32_t*)0x2000011c = 0; *(uint64_t*)0x20000120 = 0; *(uint32_t*)0x20000128 = 0; *(uint32_t*)0x2000012c = 0; *(uint32_t*)0x20000130 = 0; *(uint32_t*)0x20000134 = 0; *(uint64_t*)0x20000138 = 0; *(uint64_t*)0x20000140 = 0; *(uint32_t*)0x20000148 = 0; *(uint32_t*)0x2000014c = 0; *(uint32_t*)0x20000150 = 0; res = syscall(__NR_bpf, /*cmd=*/5ul, /*arg=*/0x200000c0ul, /*size=*/0x90ul); if (res != -1) r[0] = res; *(uint64_t*)0x20000400 = 0x20000040; memcpy((void*)0x20000040, "sched_switch\000", 13); *(uint32_t*)0x20000408 = r[0]; *(uint32_t*)0x2000040c = 0; *(uint64_t*)0x20000410 = 0; syscall(__NR_bpf, /*cmd=*/0x11ul, /*arg=*/0x20000400ul, /*size=*/0x10ul); inject_fault(1); syscall(__NR_bpf, /*cmd=*/0ul, /*arg=*/0ul, /*size=*/0ul); } int main(void) { syscall(__NR_mmap, /*addr=*/0x1ffff000ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1, /*offset=*/0ul); syscall(__NR_mmap, /*addr=*/0x20000000ul, /*len=*/0x1000000ul, /*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1, /*offset=*/0ul); syscall(__NR_mmap, /*addr=*/0x21000000ul, /*len=*/0x1000ul, /*prot=*/0ul, /*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/-1, /*offset=*/0ul); const char* reason; (void)reason; if ((reason = setup_fault())) printf("the reproducer may not work as expected: fault injection setup " "failed: %s\n", reason); loop(); return 0; } Best, Bonan