Hello, kernel test robot noticed "WARNING:at_kernel/smp.c:#smp_call_function_single" on: commit: 9139f93209d1ffd7f489ab19dee01b7c3a1a43d2 ("rcu/nocb: Fix RT throttling hrtimer armed from offline CPU") https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master [test failed on linus/master 1868f9d0260e9afaf7c6436d14923ae12eaea465] [test failed on linux-next/master 62f92d634458a1e308bb699986b9147a6d670457] in testcase: rcutorture version: with the following parameters: runtime: 300s test: cpuhotplug torture_type: rcu compiler: gcc-12 test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G (please refer to attached dmesg/kmsg for entire log/backtrace) We noticed that the issue does not always happen: it reproduced in 70 out of 200 runs, as shown below, while the parent commit stays clean. 1fcb932c8b5ce862 9139f93209d1ffd7f489ab19dee ---------------- --------------------------- fail:runs %reproduction fail:runs | | | :200 35% 70:200 dmesg.RIP:multi_cpu_stop :200 35% 70:200 dmesg.RIP:smp_call_function_single :200 35% 70:200 dmesg.WARNING:at_kernel/smp.c:#smp_call_function_single If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <oliver.sang@xxxxxxxxx> | Closes: https://lore.kernel.org/oe-lkp/202409231644.4c55582d-lkp@xxxxxxxxx [ 174.242695][ C1] ------------[ cut here ]------------ [ 174.243292][ C1] WARNING: CPU: 1 PID: 26 at kernel/smp.c:633 smp_call_function_single (kernel/smp.c:633 (discriminator 1)) [ 174.243960][ C1] Modules linked in: rcutorture torture [ 174.244381][ C1] CPU: 1 UID: 0 PID: 26 Comm: migration/1 Not tainted 6.11.0-rc1-00012-g9139f93209d1 #1 [ 174.245082][ C1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 174.245867][ C1] Stopper: multi_cpu_stop+0x0/0x320 <- __stop_cpus+0xd0/0x120 [ 174.246506][ C1] RIP: 0010:smp_call_function_single (kernel/smp.c:633 (discriminator 1)) [ 174.246978][ C1] Code: d0 7c 08 84 d2 0f 85 a8 00 00 00 8b 05 74 42 fd 0a 85 c0 0f 85 51 fe ff ff 0f 0b e9 4a fe ff ff 0f 1f 44 00 00 e9 60 ff ff ff <0f> 0b e9 4b fe ff ff 48 89 74 24 28 e8 ca 15 37 00 48 8b 74 24 28 All code ======== 0: d0 7c 08 84 sarb -0x7c(%rax,%rcx,1) 4: d2 0f rorb %cl,(%rdi) 6: 85 a8 00 00 00 8b test %ebp,-0x75000000(%rax) c: 05 74 42 fd 0a add $0xafd4274,%eax 11: 85 c0 test %eax,%eax 13: 0f 85 51 fe ff ff jne 0xfffffffffffffe6a 19: 0f 0b ud2 1b: e9 4a fe ff ff jmp 0xfffffffffffffe6a 20: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 25: e9 60 ff ff ff jmp 0xffffffffffffff8a 2a:* 0f 0b ud2 <-- trapping instruction 2c: e9 4b fe ff ff jmp 0xfffffffffffffe7c 31: 48 89 74 24 28 mov %rsi,0x28(%rsp) 36: e8 ca 15 37 00 call 0x371605 3b: 48 8b 74 24 28 mov 0x28(%rsp),%rsi Code starting with the faulting instruction =========================================== 0: 0f 0b ud2 2: e9 4b fe ff ff jmp 0xfffffffffffffe52 7: 48 89 74 24 28 mov %rsi,0x28(%rsp) c: e8 ca 15 37 00 call 0x3715db 11: 48 8b 74 24 28 mov 0x28(%rsp),%rsi [ 174.248359][ C1] RSP: 0000:ffff8883ae709a60 EFLAGS: 00010006 [ 174.252935][ C1] RAX: 0000000080000103 RBX: 
1ffff11075ce1354 RCX: ffffffff814a8d90 [ 174.253513][ C1] RDX: fffffbfff14b9c52 RSI: 0000000000000008 RDI: ffffffff8a5ce288 [ 174.254094][ C1] RBP: ffff8883ae709b38 R08: 0000000000000000 R09: fffffbfff14b9c51 [ 174.254670][ C1] R10: ffffffff8a5ce28f R11: ffff8881000406c8 R12: dffffc0000000000 [ 174.255274][ C1] R13: 0000000000000001 R14: ffffffff814048b0 R15: 0000000000000000 [ 174.255853][ C1] FS: 0000000000000000(0000) GS:ffff8883ae700000(0000) knlGS:0000000000000000 [ 174.256669][ C1] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 174.257150][ C1] CR2: 0000000000000000 CR3: 0000000008af1000 CR4: 00000000000406b0 [ 174.257727][ C1] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 174.258325][ C1] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 174.258921][ C1] Call Trace: [ 174.259177][ C1] <IRQ> [ 174.259399][ C1] ? __warn (kernel/panic.c:735) [ 174.259714][ C1] ? smp_call_function_single (kernel/smp.c:633 (discriminator 1)) [ 174.260143][ C1] ? report_bug (lib/bug.c:180 lib/bug.c:219) [ 174.260504][ C1] ? handle_bug (arch/x86/kernel/traps.c:239) [ 174.260821][ C1] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) [ 174.261168][ C1] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) [ 174.261532][ C1] ? check_slow_task (kernel/rcu/tree.c:1054) [ 174.261903][ C1] ? smp_call_function_single (arch/x86/include/asm/bitops.h:227 arch/x86/include/asm/bitops.h:239 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/cpumask.h:562 include/linux/cpumask.h:1105 kernel/smp.c:624) [ 174.262319][ C1] ? smp_call_function_single (kernel/smp.c:633 (discriminator 1)) [ 174.262736][ C1] ? reacquire_held_locks (kernel/locking/lockdep.c:5410) [ 174.263131][ C1] ? 
do_raw_spin_unlock (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/atomic/atomic-instrumented.h:33 include/asm-generic/qspinlock.h:57 kernel/locking/spinlock_debug.c:101 kernel/locking/spinlock_debug.c:141) [ 174.263508][ C1] ? generic_exec_single (kernel/smp.c:604) [ 174.263897][ C1] ? trace_rcu_nocb_wake (arch/x86/include/asm/bitops.h:227 (discriminator 41) arch/x86/include/asm/bitops.h:239 (discriminator 41) include/asm-generic/bitops/instrumented-non-atomic.h:142 (discriminator 41) include/linux/cpumask.h:562 (discriminator 41) include/linux/cpumask.h:1105 (discriminator 41) include/trace/events/rcu.h:284 (discriminator 41)) [ 174.264285][ C1] swake_up_one_online (arch/x86/include/asm/preempt.h:94 kernel/rcu/tree.c:1078) [ 174.264662][ C1] __call_rcu_nocb_wake (kernel/rcu/tree_nocb.h:564) [ 174.265048][ C1] ? rcu_advance_cbs_nowake (kernel/rcu/tree_nocb.h:532) [ 174.265460][ C1] ? rcu_segcblist_enqueue (arch/x86/include/asm/atomic64_64.h:25 include/linux/atomic/atomic-arch-fallback.h:2672 include/linux/atomic/atomic-long.h:121 include/linux/atomic/atomic-instrumented.h:3261 kernel/rcu/rcu_segcblist.c:214 kernel/rcu/rcu_segcblist.c:231 kernel/rcu/rcu_segcblist.c:332) [ 174.265860][ C1] ? rcu_torture_reader_do_mbchk (kernel/rcu/rcutorture.c:1726) rcutorture [ 174.266394][ C1] __call_rcu_common (kernel/rcu/tree_nocb.h:606 kernel/rcu/tree.c:3094) [ 174.266770][ C1] ? dyntick_save_progress_counter (kernel/rcu/tree.c:3051) [ 174.267225][ C1] ? kasan_addr_to_slab (arch/x86/include/asm/page_64.h:26 include/linux/mm.h:1283 mm/kasan/../slab.h:206 mm/kasan/common.c:38) [ 174.267599][ C1] ? __kasan_kmalloc (mm/kasan/common.c:370 mm/kasan/common.c:387) [ 174.267953][ C1] ? 
rcu_torture_one_read (kernel/rcu/rcutorture.c:2073) rcutorture [ 174.268444][ C1] call_timer_fn (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/jump_label.h:261 include/linux/jump_label.h:273 include/trace/events/timer.h:127 kernel/time/timer.c:1793) [ 174.268789][ C1] ? try_to_del_timer_sync (kernel/time/timer.c:1769) [ 174.269190][ C1] __run_timers (kernel/time/timer.c:1844 kernel/time/timer.c:2417) [ 174.269522][ C1] ? rcu_torture_one_read (kernel/rcu/rcutorture.c:2073) rcutorture [ 174.270000][ C1] ? call_timer_fn (kernel/time/timer.c:2388) [ 174.270347][ C1] ? run_timer_softirq (kernel/time/timer.c:2428 kernel/time/timer.c:2437 kernel/time/timer.c:2445) [ 174.270711][ C1] ? lock_sync (kernel/locking/lockdep.c:5727) [ 174.271046][ C1] ? spin_bug (kernel/locking/spinlock_debug.c:114) [ 174.271366][ C1] run_timer_softirq (kernel/time/timer.c:2429 kernel/time/timer.c:2437 kernel/time/timer.c:2445) [ 174.271720][ C1] ? __run_timers (kernel/time/timer.c:2444) [ 174.272072][ C1] ? lockdep_hardirqs_on_prepare (kernel/locking/lockdep.c:4291 kernel/locking/lockdep.c:4358) [ 174.272515][ C1] handle_softirqs (arch/x86/include/asm/atomic.h:23 include/linux/atomic/atomic-arch-fallback.h:457 include/linux/jump_label.h:261 include/linux/jump_label.h:273 include/trace/events/irq.h:142 kernel/softirq.c:555) [ 174.272752][ C1] ? _local_bh_enable (kernel/softirq.c:512) [ 174.272985][ C1] ? 
tick_handle_periodic (kernel/time/tick-common.c:132) [ 174.273234][ C1] irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637 kernel/softirq.c:627 kernel/softirq.c:649) [ 174.273442][ C1] sysvec_apic_timer_interrupt (arch/x86/kernel/apic/apic.c:1043 arch/x86/kernel/apic/apic.c:1043) [ 174.273709][ C1] </IRQ> [ 174.273847][ C1] <TASK> [ 174.273984][ C1] asm_sysvec_apic_timer_interrupt (arch/x86/include/asm/idtentry.h:702) [ 174.274260][ C1] RIP: 0010:multi_cpu_stop (kernel/stop_machine.c:259) [ 174.274513][ C1] Code: 8b 44 24 0c 41 89 47 20 e8 67 a2 f3 ff 83 fb 04 0f 85 0f ff ff ff 48 8b 5c 24 20 80 e7 02 74 06 e8 df 8c 09 00 fb 8b 44 24 14 <48> 83 c4 30 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 c7 c0 88 e2 5c 8a All code ======== 0: 8b 44 24 0c mov 0xc(%rsp),%eax 4: 41 89 47 20 mov %eax,0x20(%r15) 8: e8 67 a2 f3 ff call 0xfffffffffff3a274 d: 83 fb 04 cmp $0x4,%ebx 10: 0f 85 0f ff ff ff jne 0xffffffffffffff25 16: 48 8b 5c 24 20 mov 0x20(%rsp),%rbx 1b: 80 e7 02 and $0x2,%bh 1e: 74 06 je 0x26 20: e8 df 8c 09 00 call 0x98d04 25: fb sti 26: 8b 44 24 14 mov 0x14(%rsp),%eax 2a:* 48 83 c4 30 add $0x30,%rsp <-- trapping instruction 2e: 5b pop %rbx 2f: 5d pop %rbp 30: 41 5c pop %r12 32: 41 5d pop %r13 34: 41 5e pop %r14 36: 41 5f pop %r15 38: c3 ret 39: 48 c7 c0 88 e2 5c 8a mov $0xffffffff8a5ce288,%rax Code starting with the faulting instruction =========================================== 0: 48 83 c4 30 add $0x30,%rsp 4: 5b pop %rbx 5: 5d pop %rbp 6: 41 5c pop %r12 8: 41 5d pop %r13 a: 41 5e pop %r14 c: 41 5f pop %r15 e: c3 ret f: 48 c7 c0 88 e2 5c 8a mov $0xffffffff8a5ce288,%rax The kernel config and materials to reproduce are available at: https://download.01.org/0day-ci/archive/20240923/202409231644.4c55582d-lkp@xxxxxxxxx -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki