Hello John, You asked me to test the upstream kernel with a printk torture (console_blash.sh) with the different scheduler settings on a very large core system. Wanted to post to the mailing list to show what I tested with. As far as I can tell; all the scheduler settings correctly get to and trip the sysrq trigger to make it crash at the end of the script. But, with every state, except for fully pre-emeptive, I got an NMI call trace before the sysrq trigger that expectedly crashes the system. This is on a Fedora 39 Server installed on a 2x56 core, 224 thread machine in Red Hat's lab with the lscpu below with 6.7.0-rt6. I have only decoded and attached NoForcedPreemption below. Please ask if you want additional ones but they all look similar when I checked. CCing Derek who is handling the printk backport and my personal email for kernel development. NMI Call Trace for NoForcedPreemption --- task:cpuhp/222 state:S stack:0 pid:1349 tgid:1349 ppid:2 flags:0x00004000 Call Trace: <TASK> __schedule (kernel/sched/core.c:5396 kernel/sched/core.c:6708) ? try_to_wake_up (./arch/x86/include/asm/preempt.h:104 ./include/linux/preempt.h:484 ./include/linux/preempt.h:484 kernel/sched/core.c:4217) ? __pfx_smpboot_thread_fn (kernel/smpboot.c:107) schedule (kernel/sched/core.c:6784 kernel/sched/core.c:6798) smpboot_thread_fn (kernel/smpboot.c:160) kthread (kernel/kthread.c:388) ? __pfx_kthread (kernel/kthread.c:341) ret_from_fork (arch/x86/kernel/process.c:147) ? __pfx_kthread (kernel/kthread.c:341) ret_from_fork_asm (arch/x86/entry/entry_64.S:250) </TASK> sysrq: Trigger a crash Kernel panic - not syncing: sysrq triggered crash NMI backtrace for cpu 56 Hardware name: Intel Corporation D50DNP1SBB/D50DNP1SBB, BIOS SE5C7411.86B.9409.D04.2212261349 12/26/2022 RIP: 0010:io_serial_in (arch/x86/kernel/early_printk.c:100) Code: cc 0f 1f 40 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 0f b6 8f c1 00 00 00 0f b7 57 08 d3 e6 01 f2 ec <0f> b6 c0 c3 cc cc cc cc 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 All code ======== 0: cc int3 1: 0f 1f 40 00 nopl 0x0(%rax) 5: 90 nop 6: 90 nop 7: 90 nop 8: 90 nop 9: 90 nop a: 90 nop b: 90 nop c: 90 nop d: 90 nop e: 90 nop f: 90 nop 10: 90 nop 11: 90 nop 12: 90 nop 13: 90 nop 14: 90 nop 15: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 1a: 0f b6 8f c1 00 00 00 movzbl 0xc1(%rdi),%ecx 21: 0f b7 57 08 movzwl 0x8(%rdi),%edx 25: d3 e6 shl %cl,%esi 27: 01 f2 add %esi,%edx 29: ec in (%dx),%al 2a:* 0f b6 c0 movzbl %al,%eax <-- trapping instruction 2d: c3 ret 2e: cc int3 2f: cc int3 30: cc int3 31: cc int3 32: 0f 1f 00 nopl (%rax) 35: 90 nop 36: 90 nop 37: 90 nop 38: 90 nop 39: 90 nop 3a: 90 nop 3b: 90 nop 3c: 90 nop 3d: 90 nop 3e: 90 nop 3f: 90 nop Code starting with the faulting instruction =========================================== 0: 0f b6 c0 movzbl %al,%eax 3: c3 ret 4: cc int3 5: cc int3 6: cc int3 7: cc int3 8: 0f 1f 00 nopl (%rax) b: 90 nop c: 90 nop d: 90 nop e: 90 nop f: 90 nop 10: 90 nop 11: 90 nop 12: 90 nop 13: 90 nop 14: 90 nop 15: 90 nop RSP: 0018:ff4f21361ec27b28 EFLAGS: 00000002 RAX: ffffffff863d6e00 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 00000000000003fd RSI: 0000000000000005 RDI: ffffffff88ad49c0 RBP: 000000000000270d R08: ffffffff889c73ab R09: 0000000000ffff0a R10: 0000000000000005 R11: 0000000000000002 R12: 0000000000000020 R13: ffffffff88ad49c0 R14: 0000000000000000 R15: 0000000000000000 FS: 00007faac4483740(0000) GS:ff4f03763f600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007faac465f650 CR3: 0000002085d0e006 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: <NMI> ? nmi_cpu_backtrace (lib/nmi_backtrace.c:115) ? nmi_cpu_backtrace_handler (arch/x86/kernel/apic/hw_nmi.c:47 (discriminator 1)) ? nmi_handle (arch/x86/kernel/nmi.c:150) ? io_serial_in (arch/x86/kernel/early_printk.c:100) ? default_do_nmi (arch/x86/kernel/nmi.c:351) ? exc_nmi (arch/x86/kernel/nmi.c:545) ? end_repeat_nmi (arch/x86/entry/entry_64.S:1394) ? mem16_serial_out (drivers/tty/serial/8250/8250_port.c:375) ? io_serial_in (arch/x86/kernel/early_printk.c:100) ? io_serial_in (arch/x86/kernel/early_printk.c:100) ? io_serial_in (arch/x86/kernel/early_printk.c:100) </NMI> <TASK> wait_for_xmitr (drivers/tty/serial/8250/8250.h:117 drivers/tty/serial/8250/8250.h:139 drivers/tty/serial/8250/8250_port.c:2098 drivers/tty/serial/8250/8250_port.c:2116) serial8250_console_putchar (./include/linux/serial_core.h:703 drivers/tty/serial/8250/8250_port.c:3339) ? __pfx_serial8250_console_putchar (drivers/tty/serial/8250/8250_port.c:3335) uart_console_write (drivers/tty/serial/serial_core.c:2105) serial8250_console_write_atomic (drivers/tty/serial/8250/8250_port.c:3620) nbcon_emit_next_record (kernel/printk/nbcon.c:940) __nbcon_atomic_flush_all (kernel/printk/nbcon.c:1192 (discriminator 1) kernel/printk/nbcon.c:1326 (discriminator 1)) vprintk_emit (kernel/printk/printk.c:2413) _printk (kernel/printk/printk.c:2473) panic (./arch/x86/include/asm/bitops.h:206 ./arch/x86/include/asm/bitops.h:238 ./include/asm-generic/bitops/instrumented-non-atomic.h:142 kernel/panic.c:532 kernel/panic.c:343) ? _printk (kernel/printk/printk.c:2473) sysrq_handle_crash (drivers/tty/sysrq.c:154) __handle_sysrq (drivers/tty/sysrq.c:601) write_sysrq_trigger (drivers/tty/sysrq.c:1165) proc_reg_write (fs/proc/inode.c:337 fs/proc/inode.c:349) ? preempt_count_add (./include/linux/ftrace.h:974 (discriminator 1) kernel/sched/core.c:5839 (discriminator 1) kernel/sched/core.c:5836 (discriminator 1) kernel/sched/core.c:5864 (discriminator 1)) vfs_write (fs/read_write.c:582) ? __do_sys_newfstatat (fs/stat.c:466 (discriminator 1)) ksys_write (fs/read_write.c:637) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) ? syscall_exit_to_user_mode (kernel/entry/common.c:299) ? do_syscall_64 (arch/x86/entry/common.c:102) ? exc_page_fault (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:72 arch/x86/mm/fault.c:1515 arch/x86/mm/fault.c:1563) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7faac458ec74 Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d f5 76 0d 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 All code ======== 0: c7 00 16 00 00 00 movl $0x16,(%rax) 6: b8 ff ff ff ff mov $0xffffffff,%eax b: c3 ret c: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 13: 00 00 00 16: f3 0f 1e fa endbr64 1a: 80 3d f5 76 0d 00 00 cmpb $0x0,0xd76f5(%rip) # 0xd7716 21: 74 13 je 0x36 23: b8 01 00 00 00 mov $0x1,%eax 28: 0f 05 syscall 2a:* 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax <-- trapping instruction 30: 77 54 ja 0x86 32: c3 ret 33: 0f 1f 00 nopl (%rax) 36: 55 push %rbp 37: 48 89 e5 mov %rsp,%rbp 3a: 48 83 ec 20 sub $0x20,%rsp 3e: 48 rex.W 3f: 89 .byte 0x89 Code starting with the faulting instruction =========================================== 0: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax 6: 77 54 ja 0x5c 8: c3 ret 9: 0f 1f 00 nopl (%rax) c: 55 push %rbp d: 48 89 e5 mov %rsp,%rbp 10: 48 83 ec 20 sub $0x20,%rsp 14: 48 rex.W 15: 89 .byte 0x89 RSP: 002b:00007ffd2e5539b8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007faac458ec74 RDX: 0000000000000002 RSI: 00005580d5ed8490 RDI: 0000000000000001 RBP: 00007ffd2e5539e0 R08: 0000000000000410 R09: 0000000000000001 R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000002 R13: 00005580d5ed8490 R14: 00007faac465f5c0 R15: 00007faac465cf20 </TASK> Hardware name: Intel Corporation D50DNP1SBB/D50DNP1SBB, BIOS SE5C7411.86B.9409.D04.2212261349 12/26/2022 Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:107) panic (kernel/panic.c:344) ? _printk (kernel/printk/printk.c:2473) sysrq_handle_crash (drivers/tty/sysrq.c:154) __handle_sysrq (drivers/tty/sysrq.c:601) write_sysrq_trigger (drivers/tty/sysrq.c:1165) proc_reg_write (fs/proc/inode.c:337 fs/proc/inode.c:349) ? preempt_count_add (./include/linux/ftrace.h:974 (discriminator 1) kernel/sched/core.c:5839 (discriminator 1) kernel/sched/core.c:5836 (discriminator 1) kernel/sched/core.c:5864 (discriminator 1)) vfs_write (fs/read_write.c:582) ? __do_sys_newfstatat (fs/stat.c:466 (discriminator 1)) ksys_write (fs/read_write.c:637) do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) ? syscall_exit_to_user_mode (kernel/entry/common.c:299) ? do_syscall_64 (arch/x86/entry/common.c:102) ? exc_page_fault (./arch/x86/include/asm/irqflags.h:37 ./arch/x86/include/asm/irqflags.h:72 arch/x86/mm/fault.c:1515 arch/x86/mm/fault.c:1563) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) RIP: 0033:0x7faac458ec74 Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d f5 76 0d 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 All code ======== 0: c7 00 16 00 00 00 movl $0x16,(%rax) 6: b8 ff ff ff ff mov $0xffffffff,%eax b: c3 ret c: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 13: 00 00 00 16: f3 0f 1e fa endbr64 1a: 80 3d f5 76 0d 00 00 cmpb $0x0,0xd76f5(%rip) # 0xd7716 21: 74 13 je 0x36 23: b8 01 00 00 00 mov $0x1,%eax 28: 0f 05 syscall 2a:* 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax <-- trapping instruction 30: 77 54 ja 0x86 32: c3 ret 33: 0f 1f 00 nopl (%rax) 36: 55 push %rbp 37: 48 89 e5 mov %rsp,%rbp 3a: 48 83 ec 20 sub $0x20,%rsp 3e: 48 rex.W 3f: 89 .byte 0x89 Code starting with the faulting instruction =========================================== 0: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax 6: 77 54 ja 0x5c 8: c3 ret 9: 0f 1f 00 nopl (%rax) c: 55 push %rbp d: 48 89 e5 mov %rsp,%rbp 10: 48 83 ec 20 sub $0x20,%rsp 14: 48 rex.W 15: 89 .byte 0x89 RSP: 002b:00007ffd2e5539b8 EFLAGS: 00000202 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007faac458ec74 RDX: 0000000000000002 RSI: 00005580d5ed8490 RDI: 0000000000000001 RBP: 00007ffd2e5539e0 R08: 0000000000000410 R09: 0000000000000001 R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000002 R13: 00005580d5ed8490 R14: 00007faac465f5c0 R15: 00007faac465cf20 </TASK> Shutting down cpus with NMI Kernel Offset: 0x4c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: sysrq triggered crash ]--- --- lscpu --- Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Address sizes: 52 bits physical, 57 bits virtual Byte Order: Little Endian CPU(s): 224 On-line CPU(s) list: 0-223 Vendor ID: GenuineIntel BIOS Vendor ID: Intel(R) Corporation Model name: Intel (R) Xeon (R) CPU Max 9480 BIOS Model name: Intel (R) Xeon (R) CPU Max 9480 CPU @ 1.9GHz BIOS CPU family: 179 CPU family: 6 Model: 143 Thread(s) per core: 2 Core(s) per socket: 56 Socket(s): 2 Stepping: 8 CPU(s) scaling MHz: 23% CPU max MHz: 3500.0000 CPU min MHz: 800.0000 BogoMIPS: 3800.00 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xt pr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 intel_ ppin cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx 512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mb m_local split_lock_detect user_shstk avx_vnni avx512_bf16 wbnoinvd dtherm ida arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req vnmi avx512vbmi umip pku ospke wait pkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq la57 rdpid bus_lock_detect cldemote movdiri movdir64b enqcmd fsrm md_clear seria lize tsxldtrk pconfig arch_lbr ibt amx_bf16 avx512_fp16 amx_tile amx_int8 flush_l1d arch_capabilities Virtualization features: Virtualization: VT-x Caches (sum of all): L1d: 5.3 MiB (112 instances) L1i: 3.5 MiB (112 instances) L2: 224 MiB (112 instances) L3: 225 MiB (2 instances) NUMA: NUMA node(s): 16 NUMA node0 CPU(s): 0-13,112-125 NUMA node1 CPU(s): 14-27,126-139 NUMA node2 CPU(s): 28-41,140-153 NUMA node3 CPU(s): 42-55,154-167 NUMA node4 CPU(s): 56-69,168-181 NUMA node5 CPU(s): 70-83,182-195 NUMA node6 CPU(s): 84-97,196-209 NUMA node7 CPU(s): 98-111,210-223 NUMA node8 CPU(s): NUMA node9 CPU(s): NUMA node10 CPU(s): NUMA node11 CPU(s): NUMA node12 CPU(s): NUMA node13 CPU(s): NUMA node14 CPU(s): NUMA node15 CPU(s): Vulnerabilities: Gather data sampling: Not affected Itlb multihit: Not affected L1tf: Not affected Mds: Not affected Meltdown: Not affected Mmio stale data: Not affected Retbleed: Not affected Spec rstack overflow: Not affected Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Spectre v2: Mitigation; Enhanced / Automatic IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence Srbds: Not affected Tsx async abort: Not affected --- console_blast.sh --- #!/bin/sh set -e # This script is intended to be started with no arguments. # # It then calls itself to create a pinned process for each CPU. Those # child processes will run in infinite loops of show-task-states via # /proc/sysrq-trigger. This generates lots of contention on the console. # # Finally, it waits 10 seconds before crashing the system. if [ $# -eq 0 ]; then cpus=$(($(nproc) - 1)) for i in $(seq 0 $cpus); do $0 $i & done sleep 15 echo c > /proc/sysrq-trigger exit 0 fi # Setup the CPU mask for the provided CPU number. The real taskset(1) # could simply use -c, but the busybox(1) variant does not support this. mask=$(printf "0x%x\n" $((1 << $1))) taskset -p $mask $$ # Wait a moment before beginning blast. sleep 2 while true; do echo t > /proc/sysrq-trigger done ---