Hi Maciej W. Rozycki, Greeting! We tested intel internal Linux next kernel by syzkaller fuzz testing and met "CPU soft lockup in __run_timers" problem in guest. Bisect and find that the following patch is the first bad commit. Check the commit content is same as following patch. All detailed info: https://github.com/xupengfe/syzkaller_logs/tree/main/240313_174939___run_timers Syzkaller reproduced code: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/repro.c Syzkaller reproduced steps: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/repro.prog Bisect info: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/bisect_info.log Issue dmesg: https://github.com/xupengfe/syzkaller_logs/blob/main/240313_174939___run_timers/0a53950322bc80aeebf56f5a9d38c847186a082a_dmesg.log bzImage: https://github.com/xupengfe/syzkaller_logs/raw/main/240313_174939___run_timers/bzImage_v6.8_problem.tar.gz " [ 34.599994] hrtimer: interrupt took 63789 ns [ 102.443957] clocksource: Long readout interval, skipping watchdog check: cs_nsec: 3343956725 wd_nsec: 3343956627 [ 393.043326] watchdog: BUG: soft lockup - CPU#1 stuck for 27s! [repro:2653] [ 393.043704] Modules linked in: [ 393.043856] irq event stamp: 17486443 [ 393.044019] hardirqs last enabled at (17486442): [<ffffffff855b0ebe>] irqentry_exit+0x3e/0xa0 [ 393.044462] hardirqs last disabled at (17486443): [<ffffffff855aed94>] sysvec_apic_timer_interrupt+0x14/0xc0 [ 393.044925] softirqs last enabled at (14336234): [<ffffffff8126c828>] __irq_exit_rcu+0xa8/0x110 [ 393.045332] softirqs last disabled at (14336237): [<ffffffff8126c828>] __irq_exit_rcu+0xa8/0x110 [ 393.045740] CPU: 1 PID: 2653 Comm: repro Not tainted 6.8.0-0a53950322bc+ #1 [ 393.046062] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 393.046618] RIP: 0010:preempt_count_sub+0x6c/0x160 [ 393.046856] Code: ed 07 85 c9 75 1b 65 8b 05 d9 7b d2 7e 89 c2 81 e2 ff ff ff 7f 39 da 7c 1b 81 fb fe 00 00 00 76 6b f7 db 65 01 1d bc 7b d2 7e <48> 8b 5d f8 c9 c3 cc cc cc cc e8 d5 36 9d 01 85 c0 74 ed 48 c7 c0 [ 393.047696] RSP: 0018:ffff88806cd09da0 EFLAGS: 00000213 [ 393.047950] RAX: 0000000000000102 RBX: 00000000ffffffff RCX: 0000000000000000 [ 393.048281] RDX: 0000000000000102 RSI: 0000000000000102 RDI: 0000000000000001 [ 393.048611] RBP: ffff88806cd09da8 R08: 0000000000000001 R09: fffffbfff11caa46 [ 393.048943] R10: ffffffff88e55237 R11: 0000000000000001 R12: ffff88806cd32a00 [ 393.049277] R13: ffffffff81516e90 R14: dffffc0000000000 R15: ffff88806cd32a00 [ 393.049611] FS: 0000000000000000(0000) GS:ffff88806cd00000(0000) knlGS:0000000000000000 [ 393.049987] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 393.050261] CR2: 00007febf5bfa838 CR3: 000000000bda4006 CR4: 0000000000770ef0 [ 393.050596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 393.050928] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 [ 393.051262] PKRU: 55555554 [ 393.051396] Call Trace: [ 393.051521] <IRQ> [ 393.051630] ? show_regs+0xa9/0xc0 [ 393.051805] ? watchdog_timer_fn+0x531/0x6b0 [ 393.052019] ? __pfx_watchdog_timer_fn+0x10/0x10 [ 393.052255] ? __hrtimer_run_queues+0x1a4/0xc00 [ 393.052505] ? __pfx___hrtimer_run_queues+0x10/0x10 [ 393.052770] ? hrtimer_interrupt+0x324/0x7a0 [ 393.052994] ? __sysvec_apic_timer_interrupt+0x105/0x3c0 [ 393.053254] ? sysvec_apic_timer_interrupt+0x4b/0xc0 [ 393.053495] ? asm_sysvec_apic_timer_interrupt+0x1f/0x30 [ 393.053757] ? __pfx_clocksource_watchdog+0x10/0x10 [ 393.054002] ? preempt_count_sub+0x6c/0x160 [ 393.054213] _raw_spin_unlock_irq+0x3c/0x60 [ 393.054423] __run_timers.part.0+0x6c9/0xa40 [ 393.054647] ? __pfx___run_timers.part.0+0x10/0x10 [ 393.054883] ? sysvec_irq_work+0x8b/0x100 [ 393.055087] ? asm_sysvec_irq_work+0x1f/0x30 [ 393.055314] run_timer_softirq+0xbc/0x1c0 [ 393.055517] __do_softirq+0x1cb/0x84a [ 393.055709] __irq_exit_rcu+0xa8/0x110 [ 393.055899] irq_exit_rcu+0x12/0x30 [ 393.056078] sysvec_apic_timer_interrupt+0xa5/0xc0 [ 393.056314] </IRQ> [ 393.056422] <TASK> [ 393.056532] asm_sysvec_apic_timer_interrupt+0x1f/0x30 [ 393.056781] RIP: 0010:__schedule+0xbe/0x30c0 [ 393.056993] Code: c0 e8 e6 a8 fd ff 48 98 48 8d 3c c5 a0 dc 75 86 48 89 fa 48 c1 ea 03 42 80 3c 2a 00 0f 85 81 28 00 00 4c 03 24 c5 a0 dc 75 86 <49> 8d 84 24 58 0a 00 00 48 89 c2 48 89 85 e0 fe ff ff 48 b8 00 00 [ 393.057847] RSP: 0018:ffff888024f3f6e8 EFLAGS: 00000282 [ 393.058101] RAX: 0000000000000001 RBX: 0000000000046380 RCX: 0000000000000001 [ 393.058437] RDX: 1ffffffff0cebb95 RSI: 0000000000000001 RDI: ffffffff8675dca8 [ 393.058771] RBP: ffff888024f3f820 R08: 0000000000000001 R09: 0000000000000002 [ 393.059107] R10: 0000000000406000 R11: 0000000000000001 R12: ffff88806cd46380 [ 393.059442] R13: dffffc0000000000 R14: ffffffff855da674 R15: 0000000000406000 [ 393.059779] ? __cond_resched+0x24/0x30 [ 393.059982] ? __schedule+0x9a/0x30c0 [ 393.060173] ? lockdep_hardirqs_on+0x8a/0x110 [ 393.060408] ? trace_hardirqs_on+0x26/0x120 [ 393.060637] ? __pfx___schedule+0x10/0x10 [ 393.060836] ? __this_cpu_preempt_check+0x21/0x30 [ 393.061068] ? lock_release+0x417/0x7e0 [ 393.061267] ? __this_cpu_preempt_check+0x21/0x30 [ 393.061497] ? lock_is_held_type+0xf0/0x150 [ 393.061705] ? __cond_resched+0x24/0x30 [ 393.061895] preempt_schedule_common+0x4a/0xd0 [ 393.062118] __cond_resched+0x24/0x30 [ 393.062302] unmap_page_range+0xab4/0x3690 [ 393.062530] ? __pfx_unmap_page_range+0x10/0x10 [ 393.062752] ? __this_cpu_preempt_check+0x21/0x30 [ 393.062989] ? uprobe_munmap+0xb0/0x590 [ 393.063186] unmap_single_vma+0x1ac/0x2d0 [ 393.063390] unmap_vmas+0x210/0x470 [ 393.063572] ? __pfx_unmap_vmas+0x10/0x10 [ 393.063769] ? __pfx_lock_release+0x10/0x10 [ 393.063977] ? lock_release+0x417/0x7e0 [ 393.064167] ? __pfx_folio_batch_move_lru+0x10/0x10 [ 393.064410] ? __pfx_lock_release+0x10/0x10 [ 393.064624] ? mlock_drain_local+0x281/0x4b0 [ 393.064844] exit_mmap+0x19b/0xac0 [ 393.065018] ? mark_lock.part.0+0xf3/0x17a0 [ 393.065233] ? __pfx_exit_mmap+0x10/0x10 [ 393.065427] ? __kasan_check_write+0x18/0x20 [ 393.065644] ? __pfx___mutex_unlock_slowpath+0x10/0x10 [ 393.065909] ? mutex_unlock+0x16/0x20 [ 393.066096] __mmput+0xde/0x3e0 [ 393.066256] mmput+0x74/0x90 [ 393.066407] do_exit+0xa59/0x28c0 [ 393.066580] ? lock_release+0x417/0x7e0 [ 393.066778] ? __pfx_lock_release+0x10/0x10 [ 393.066989] ? __pfx_do_exit+0x10/0x10 [ 393.067179] ? __this_cpu_preempt_check+0x21/0x30 [ 393.067410] ? _raw_spin_unlock_irq+0x2c/0x60 [ 393.067630] ? lockdep_hardirqs_on+0x8a/0x110 [ 393.067845] ? _raw_spin_unlock_irq+0x2c/0x60 [ 393.068064] ? trace_hardirqs_on+0x26/0x120 [ 393.068287] do_group_exit+0xe5/0x2c0 [ 393.068489] __x64_sys_exit_group+0x4d/0x60 [ 393.068707] do_syscall_64+0x73/0x150 [ 393.068894] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 393.069140] RIP: 0033:0x7febf5b18a4d [ 393.069319] Code: Unable to access opcode bytes at 0x7febf5b18a23. " --- If you don't need the following environment to reproduce the problem or if you already have one reproduced environment, please ignore the following information. How to reproduce: git clone https://gitlab.com/xupengfe/repro_vm_env.git cd repro_vm_env tar -xvf repro_vm_env.tar.gz cd repro_vm_env; ./start3.sh // it needs qemu-system-x86_64 and I used v7.1.0 // start3.sh will load bzImage_2241ab53cbb5cdb08a6b2d4688feb13971058f65 v6.2-rc5 kernel // You could change the bzImage_xxx as you want // Maybe you need to remove line "-drive if=pflash,format=raw,readonly=on,file=./OVMF_CODE.fd \" for different qemu version You could use below command to log in, there is no password for root. ssh -p 10023 root@localhost After login vm(virtual machine) successfully, you could transfer reproduced binary to the vm by below way, and reproduce the problem in vm: gcc -pthread -o repro repro.c scp -P 10023 repro root@localhost:/root/ Get the bzImage for target kernel: Please use target kconfig and copy it to kernel_src/.config make olddefconfig make -jx bzImage //x should equal or less than cpu num your pc has Fill the bzImage file into above start3.sh to load the target kernel in vm. Tips: If you already have qemu-system-x86_64, please ignore below info. If you want to install qemu v7.1.0 version: git clone https://github.com/qemu/qemu.git cd qemu git checkout -f v7.1.0 mkdir build cd build yum install -y ninja-build.x86_64 yum -y install libslirp-devel.x86_64 ../configure --target-list=x86_64-softmmu --enable-kvm --enable-vnc --enable-gtk --enable-sdl --enable-usb-redir --enable-slirp make make install --- Best Regards, Thanks! On 2024-02-10 at 01:43:50 +0000, Maciej W. Rozycki wrote: > Only return successful completion status from `pcie_failed_link_retrain' > if retraining has actually been done, preventing excessive delays from > being triggered at call sites in a hope that communication will finally > be established with the downstream device where in fact nothing has been > done about the link in question that would justify such a hope. > > Fixes: a89c82249c37 ("PCI: Work around PCIe link training failures") > Reported-by: Ilpo Järvinen <ilpo.jarvinen@xxxxxxxxxxxxxxx> > Link: https://lore.kernel.org/r/aa2d1c4e-9961-d54a-00c7-ddf8e858a9b0@xxxxxxxxxxxxxxx/ > Signed-off-by: Maciej W. Rozycki <macro@xxxxxxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx # v6.5+ > --- > drivers/pci/quirks.c | 18 +++++++++++------- > 1 file changed, 11 insertions(+), 7 deletions(-) > > linux-pcie-failed-link-retrain-status-fix.diff > Index: linux-macro/drivers/pci/quirks.c > =================================================================== > --- linux-macro.orig/drivers/pci/quirks.c > +++ linux-macro/drivers/pci/quirks.c > @@ -74,7 +74,8 @@ > * firmware may have already arranged and lift it with ports that already > * report their data link being up. > * > - * Return TRUE if the link has been successfully retrained, otherwise FALSE. > + * Return TRUE if the link has been successfully retrained, otherwise FALSE, > + * also when retraining was not needed in the first place. > */ > bool pcie_failed_link_retrain(struct pci_dev *dev) > { > @@ -83,10 +84,11 @@ bool pcie_failed_link_retrain(struct pci > {} > }; > u16 lnksta, lnkctl2; > + bool ret = false; > > if (!pci_is_pcie(dev) || !pcie_downstream_port(dev) || > !pcie_cap_has_lnkctl2(dev) || !dev->link_active_reporting) > - return false; > + return ret; > > pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &lnkctl2); > pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); > @@ -98,9 +100,10 @@ bool pcie_failed_link_retrain(struct pci > lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT; > pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2); > > - if (pcie_retrain_link(dev, false)) { > + ret = pcie_retrain_link(dev, false) == 0; > + if (!ret) { > pci_info(dev, "retraining failed\n"); > - return false; > + return ret; > } > > pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta); > @@ -117,13 +120,14 @@ bool pcie_failed_link_retrain(struct pci > lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS; > pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2); > > - if (pcie_retrain_link(dev, false)) { > + ret = pcie_retrain_link(dev, false) == 0; > + if (!ret) { > pci_info(dev, "retraining failed\n"); > - return false; > + return ret; > } > } > > - return true; > + return ret; > } > > static ktime_t fixup_debug_start(struct pci_dev *dev,