Hi all, I apologize for waking up the futex demons (and replying to my own email), but ... Gratian Crisan writes: > > Brandon and I have been debugging a nasty race that leads to > BUG_ON(!newowner) in fixup_pi_state_owner() in futex.c. So far > we've only been able to reproduce the issue on 4.9.y-rt kernels. > We are still testing if this is a problem for later RT branches. I was able to reproduce the BUG_ON(!newowner) in fixup_pi_state_owner() with a 5.10.0-rc1-rt1 kernel (currently testing 5.10.0-rc2-rt4). The kernel branch I had set up for this test is available here: https://github.com/gratian/linux/commits/devel-rt/v5.10-rc1-rt1-ni-serial It has a minimal set of commits on top of devel-rt/5.10.0-rc1-rt1 to enable the UART on this hardware and add a kgdb breakpoint (so I can extract more information before the BUG_ON). I've captured the reproducer, boot messages, OOPS, full event ftrace dump and some kgdb info here: https://github.com/gratian/traces The abbreviated OOPS (w/o the ftrace dump since it's too large to include inline; see link[1]): [15812.238958] ------------[ cut here ]------------ [15812.238963] kernel BUG at kernel/futex.c:2399! [15812.238974] invalid opcode: 0000 [#1] PREEMPT_RT SMP PTI [15812.261428] CPU: 1 PID: 1972 Comm: f_waiter Tainted: G U W 5.10.0-rc1-rt1-00015-g10c3af983f2e #1 [15812.271341] Hardware name: National Instruments NI cRIO-9035/NI cRIO-9035, BIOS 1.3.0f1 07/18/2016 [15812.280298] RIP: 0010:fixup_pi_state_owner.isra.0+0x2d7/0x380 [15812.286047] Code: 0f 85 a5 fe ff ff 48 8b 7d b8 e8 e4 fe 6b 00 85 c0 0f 85 94 fe ff ff 4d 8b 75 28 49 83 e6 fe 0f 85 bd fd ff ff e8 f9 f2 02 00 <0f> 0b 48 8b 45 a8 48 8b 38 e8 ab 1a 6c 00 48 8b 7d b8 e8 52 19 6c [15812.304817] RSP: 0018:ffffc90001877b88 EFLAGS: 00010046 [15812.310043] RAX: 0000000000000000 RBX: ffff888006c52640 RCX: 0000000000000001 [15812.317177] RDX: 0000000000000078 RSI: 000000000000005c RDI: ffff888003e7f028 [15812.324310] RBP: ffffc90001877c08 R08: 0000000000000001 R09: ffff888003e7f010 [15812.331443] R10: 00000000000001d8 R11: 0000000000000000 R12: 000056469b8770e4 [15812.338576] R13: ffff888003e7f000 R14: 0000000000000000 R15: ffff88801f281708 [15812.345713] FS: 00007f0d0f4e1700(0000) GS:ffff888037b00000(0000) knlGS:0000000000000000 [15812.353802] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [15812.359547] CR2: 00007f2f6e5830e1 CR3: 0000000008a36000 CR4: 00000000001006e0 [15812.366682] Call Trace: [15812.369131] fixup_owner+0x6a/0x70 [15812.372534] futex_wait_requeue_pi.constprop.0+0x5a1/0x6b0 [15812.378024] ? __hrtimer_init_sleeper+0x60/0x60 [15812.382562] do_futex+0x515/0xc90 [15812.385879] ? trace_buffer_unlock_commit_regs+0x40/0x1f0 [15812.391278] ? trace_event_buffer_commit+0x7b/0x260 [15812.396158] ? trace_event_raw_event_sys_enter+0x9a/0x100 [15812.401558] ? _copy_from_user+0x2b/0x60 [15812.405484] __x64_sys_futex+0x144/0x180 [15812.409410] do_syscall_64+0x38/0x50 [15812.412986] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [15812.418037] RIP: 0033:0x7f0d0ec6b4c9 [15812.421613] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 9f 39 2b 00 f7 d8 64 89 01 48 [15812.440383] RSP: 002b:00007f0d0f4e0ec8 EFLAGS: 00000216 ORIG_RAX: 00000000000000ca [15812.447952] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0d0ec6b4c9 [15812.455085] RDX: 00000000001a560b RSI: 000000000000008b RDI: 000056469b8770e0 [15812.462219] RBP: 00007f0d0f4e0f10 R08: 000056469b8770e4 R09: 0000000000000000 [15812.469352] R10: 00007f0d0f4e0f30 R11: 0000000000000216 R12: 0000000000000000 [15812.476485] R13: 00007ffdd55cb86f R14: 0000000000000000 R15: 00007f0d0f56e040 [15812.483622] Modules linked in: g_ether u_ether libcomposite udc_core ipv6 i915 mousedev hid_multitouch intel_gtt coretemp drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops aesni_intel glue_helper crypto_simd igb drm i2c_i801 lpc_ich mfd_core i2c_algo_bit i2c_smbus agpgart dwc3_pci video backlight button [last unloaded: rng_core] [15812.513719] Dumping ftrace buffer: [15812.517121] --------------------------------- <snip> see: [1] https://github.com/gratian/traces/blob/main/oops_ftrace-5.10.0-rc1-rt1-00015-g10c3af983f2e-dump1.txt [16492.155124] --------------------------------- [16492.159486] ---[ end trace 0000000000000003 ]--- [16492.159489] printk: enabled sync mode [16492.167762] RIP: 0010:fixup_pi_state_owner.isra.0+0x2d7/0x380 [16492.173513] Code: 0f 85 a5 fe ff ff 48 8b 7d b8 e8 e4 fe 6b 00 85 c0 0f 85 94 fe ff ff 4d 8b 75 28 49 83 e6 fe 0f 85 bd fd ff ff e8 f9 f2 02 00 <0f> 0b 48 8b 45 a8 48 8b 38 e8 ab 1a 6c 00 48 8b 7d b8 e8 52 19 6c [16492.192282] RSP: 0018:ffffc90001877b88 EFLAGS: 00010046 [16492.197508] RAX: 0000000000000000 RBX: ffff888006c52640 RCX: 0000000000000001 [16492.204642] RDX: 0000000000000078 RSI: 000000000000005c RDI: ffff888003e7f028 [16492.211776] RBP: ffffc90001877c08 R08: 0000000000000001 R09: ffff888003e7f010 [16492.218910] R10: 00000000000001d8 R11: 0000000000000000 R12: 000056469b8770e4 [16492.226043] R13: ffff888003e7f000 R14: 0000000000000000 R15: ffff88801f281708 [16492.233180] FS: 00007f0d0f4e1700(0000) GS:ffff888037b00000(0000) knlGS:0000000000000000 [16492.241270] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [16492.247016] CR2: 00007f2f6e5830e1 CR3: 0000000008a36000 CR4: 00000000001006e0 [16492.254150] Kernel panic - not syncing: Fatal exception [16493.395619] Shutting down cpus with NMI [16493.399460] Dumping ftrace buffer: [16493.402861] (ftrace buffer empty) [16493.406436] Kernel Offset: disabled [16493.409923] Rebooting in 30 seconds.. [16523.417644] ACPI MEMORY or I/O RESET_REG. I've also included the modified reproducer that triggers the issue below. Ideas? Thanks, Gratian --8<---------------cut here---------------start------------->8--- /* * fbomb_v2.c * Reproducer for BUG_ON(!newowner) in fixup_pi_state_owner() * <linux>/kernel/futex.c */ #define _GNU_SOURCE #include <stdlib.h> #include <stdint.h> #include <stdbool.h> #include <stdarg.h> #include <string.h> #include <stdio.h> #include <unistd.h> #include <pthread.h> #include <err.h> #include <errno.h> #include <sched.h> #include <time.h> #include <sys/types.h> #include <sys/syscall.h> #include <sys/sysinfo.h> #include <linux/futex.h> #include <limits.h> #define TEST_CPU 1 #define NSEC_PER_USEC 1000ULL #define NSEC_PER_SEC 1000000000ULL #define NOISE_THREADS 64 #define FUTEX_TID_MASK 0x3fffffff typedef uint32_t futex_t; static futex_t cond; static futex_t lock1 = 0; static futex_t lock2 = 0; static pthread_barrier_t start_barrier; static int futex_lock_pi(futex_t *futex) { int ret; pid_t tid; futex_t zero = 0; if (!futex) return EINVAL; tid = syscall(SYS_gettid); if (tid == (*futex & FUTEX_TID_MASK)) return EDEADLOCK; ret = __atomic_compare_exchange_n(futex, &zero, tid, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); /* no pending waiters; we got the lock */ if (ret) return 0; ret = syscall(SYS_futex, futex, FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG, 0, NULL, NULL, 0); return (ret) ? errno : 0; } static int futex_unlock_pi(futex_t *futex) { int ret; pid_t tid; if (!futex) return EINVAL; tid = syscall(SYS_gettid); if (tid != (*futex & FUTEX_TID_MASK)) return EPERM; ret = __atomic_compare_exchange_n(futex, &tid, 0, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); if (ret) return 0; ret = syscall(SYS_futex, futex, FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG, 0, NULL, NULL, 0); return (ret) ? errno : 0; } static int futex_wait_requeue_pi(futex_t *cond, futex_t *futex, const struct timespec *to) { int ret; (*cond)++; futex_unlock_pi(futex); ret = syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG, *cond, to, futex, 0); if (ret) { ret = errno; /* on error, the kernel didn't acquire the lock for us */ futex_lock_pi(futex); } return ret; } static int futex_cmp_requeue_pi(futex_t *cond, futex_t *futex) { int ret; (*cond)++; ret = syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG, 1, INT_MAX, futex, *cond); return (ret) ? errno : 0; } static void tsnorm(struct timespec *ts) { while (ts->tv_nsec >= NSEC_PER_SEC) { ts->tv_nsec -= NSEC_PER_SEC; ts->tv_sec++; } } static unsigned long long tsdiff(struct timespec *start, struct timespec *end) { unsigned long long t1 = (unsigned long long)(start->tv_sec) * NSEC_PER_SEC + start->tv_nsec; unsigned long long t2 = (unsigned long long)(end->tv_sec) * NSEC_PER_SEC + end->tv_nsec; return t2 - t1; } static void set_cpu_affinity(int cpu) { cpu_set_t mask; CPU_ZERO(&mask); CPU_SET(cpu, &mask); if (sched_setaffinity(0, sizeof(mask), &mask) < 0) err(1, "Failed to set the CPU affinity"); } static void clr_cpu_affinity() { cpu_set_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < get_nprocs(); i++) CPU_SET(i, &mask); if (sched_setaffinity(0, sizeof(mask), &mask) < 0) err(1, "Failed to clear the CPU affinity"); } static void set_fifo_priority(int prio) { struct sched_param schedp; memset(&schedp, 0, sizeof(schedp)); schedp.sched_priority = prio; if (sched_setscheduler(0, SCHED_FIFO, &schedp) < 0) err(1, "Failed to set the test thread priority"); } static void do_busy_work(unsigned long long nsec) { struct timespec start_ts; struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &start_ts); do { clock_gettime(CLOCK_MONOTONIC, &ts); } while (tsdiff(&start_ts, &ts) < nsec); } static void* boosted_thread(void *param) { struct timespec sleep_ts = {.tv_sec = 0, .tv_nsec = 600000ULL}; while (1) { set_cpu_affinity(TEST_CPU); pthread_barrier_wait(&start_barrier); clr_cpu_affinity(); futex_lock_pi(&lock2); clock_nanosleep(CLOCK_MONOTONIC, 0, &sleep_ts, NULL); futex_lock_pi(&lock1); futex_unlock_pi(&lock1); futex_unlock_pi(&lock2); } } #define SWEEP_START 500000ULL #define SWEEP_END 900000ULL #define SWEEP_STEP 100 static void* rt_thread(void *param) { struct timespec sleep_ts; unsigned long long nsec; set_cpu_affinity(TEST_CPU); set_fifo_priority(7); nsec = SWEEP_START; while(1) { pthread_barrier_wait(&start_barrier); nsec += SWEEP_STEP; if (nsec > SWEEP_END) nsec = SWEEP_START; sleep_ts.tv_sec = 0; sleep_ts.tv_nsec = nsec; clock_nanosleep(CLOCK_MONOTONIC, 0, &sleep_ts, NULL); /* preempt the waiter thread right after it got * signaled and allow the boosted_thread to block on * lock1 after taking lock2 */ do_busy_work(2000000ULL); /* this should boost the boosted_thread and migrate it */ futex_lock_pi(&lock2); futex_unlock_pi(&lock2); } } static void* waiter_thread(void *param) { struct timespec ts; set_cpu_affinity(TEST_CPU); while(1) { pthread_barrier_wait(&start_barrier); futex_lock_pi(&lock1); clock_gettime(CLOCK_MONOTONIC, &ts); ts.tv_nsec += 800000ULL; tsnorm(&ts); futex_wait_requeue_pi(&cond, &lock1, &ts); futex_unlock_pi(&lock1); } } static void* waker_thread(void *param) { struct timespec sleep_ts = {.tv_sec = 0, .tv_nsec = 500000ULL}; set_cpu_affinity(TEST_CPU); while(1) { pthread_barrier_wait(&start_barrier); clock_nanosleep(CLOCK_MONOTONIC, 0, &sleep_ts, NULL); futex_lock_pi(&lock1); futex_cmp_requeue_pi(&cond, &lock1); futex_unlock_pi(&lock1); } } static void* noise_thread(void *param) { struct timespec ts; set_cpu_affinity(0); while(1) { ts.tv_sec = 0; ts.tv_nsec = random() % 1000000; clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL); } } int main(int argc, char *argv[]) { pthread_t waiter; pthread_t waker; pthread_t rt; pthread_t boosted; pthread_t noise[NOISE_THREADS]; int i; if (pthread_barrier_init(&start_barrier, NULL, 4)) err(1, "Failed to create start_barrier"); if (pthread_create(&waker, NULL, waker_thread, NULL) != 0) err(1, "Failed to create waker thread"); pthread_setname_np(waker, "f_waker"); if (pthread_create(&waiter, NULL, waiter_thread, NULL) != 0) err(1, "Failed to create waiter thread"); pthread_setname_np(waiter, "f_waiter"); if (pthread_create(&rt, NULL, rt_thread, NULL) != 0) err(1, "Failed to create rt thread"); pthread_setname_np(rt, "f_rt"); if (pthread_create(&boosted, NULL, boosted_thread, NULL) != 0) err(1, "Failed to create boosted thread"); pthread_setname_np(boosted, "f_boosted"); for (i = 0; i < NOISE_THREADS; i++) { if (pthread_create(&noise[i], NULL, noise_thread, NULL) != 0) err(1, "Failed to create noise thread"); pthread_setname_np(noise[i], "f_noise"); } /* block here */ pthread_join(waker, NULL); pthread_join(waiter, NULL); pthread_join(rt, NULL); pthread_join(boosted, NULL); } --8<---------------cut here---------------end--------------->8---