A hard lockup has been observed with long running KVM guests that use
periodic timers:

UPTIME: 723 days, 17:47:41
PID: 514319  TASK: ffff99ed8d709070  CPU: 62  COMMAND: "CPU 0/KVM"
 #0 [ffff9a663fb08980] machine_kexec at ffffffff84065b24
 #1 [ffff9a663fb089e0] __crash_kexec at ffffffff84122342
 #2 [ffff9a663fb08ab0] panic at ffffffff84774972
 #3 [ffff9a663fb08b30] nmi_panic at ffffffff8409b6af
 #4 [ffff9a663fb08b40] watchdog_overflow_callback at ffffffff8414eb81
 #5 [ffff9a663fb08b58] __perf_event_overflow at ffffffff841a8277
 #6 [ffff9a663fb08b90] perf_event_overflow at ffffffff841b1a14
 #7 [ffff9a663fb08ba0] handle_pmi_common at ffffffff8400ac70
 #8 [ffff9a663fb08de0] intel_pmu_handle_irq at ffffffff8400af4f
 #9 [ffff9a663fb08e38] perf_event_nmi_handler at ffffffff84784031
#10 [ffff9a663fb08e58] nmi_handle at ffffffff8478593c
#11 [ffff9a663fb08eb0] do_nmi at ffffffff84785b5d
#12 [ffff9a663fb08ef0] end_repeat_nmi at ffffffff84784d9c
    [exception RIP: rb_erase+849]
    RIP: ffffffff84389a91  RSP: ffff9a663fb03ec8  RFLAGS: 00000046
    RAX: ffff9a663fb15f40  RBX: ffff9a8e3daa5610  RCX: 0000000000000000
    RDX: ffff9a663fb15f40  RSI: ffff9a663fb159b0  RDI: ffff9a8e3daa5610
    RBP: ffff9a663fb03ec8   R8: 0000000000000000   R9: 0000000000000000
    R10: 00000000000000e0  R11: 0000000000000000  R12: ffff9a663fb159b0
    R13: ffff9a663fb15960  R14: 0000000000000000  R15: ffff9a663fb15a98
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
#13 [ffff9a663fb03ec8] rb_erase at ffffffff84389a91
#14 [ffff9a663fb03ed0] timerqueue_del at ffffffff8438bfe4
#15 [ffff9a663fb03ef0] __remove_hrtimer at ffffffff840ca08f
#16 [ffff9a663fb03f20] __hrtimer_run_queues at ffffffff840ca5c5
#17 [ffff9a663fb03f78] hrtimer_interrupt at ffffffff840cab4f
#18 [ffff9a663fb03fc0] local_apic_timer_interrupt at ffffffff8405c60b
#19 [ffff9a663fb03fd8] smp_apic_timer_interrupt at ffffffff847929d3
#20 [ffff9a663fb03ff0] apic_timer_interrupt at ffffffff8478eefa

apic_timer_fn rearms itself enough times to cause the hardlockup
detector to trigger.

The problem comes from how the hrtimer expiration time is advanced,
which does not match how the interrupts are separated in time:
Timer interrupt handlers get to execute a few microseconds after
their deadline, while the exact period is added to the hrtimer
expiration time. With large uptimes, this causes the hrtimer
expiration time to become much smaller than the actual
timer interrupt base:

phys time       0         P         2P        3P ...
                | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
timer_int       0.1       P+0.2     P+0.3
hrtimer_exp     P         2P        3P

At timer_int at point 0.1:   hrtimer_exp = hrtimer_exp + period = P
At timer_int at point P+0.2: hrtimer_exp = hrtimer_exp + period = 2P
At timer_int at point P+0.3: hrtimer_exp = hrtimer_exp + period = 3P

To confirm, printing the deltas of timer interrupt and timer expiration:

timer_int       hrtimer_exp
1.0e6           1.00006e6
1.00009e6       1.00006e6
1.00002e6       1.00006e6
1.0001e6        1.00006e6
1.00013e6       1.00006e6
1.0005e6        1.00006e6
1.01135e6       1.00006e6
991227.0        1.00006e6
1.00046e6       1.00006e6
1.00388e6       1.00006e6
1.01399e6       1.00006e6
...

To fix this, for the hrtimer expire value, force a minimum of "now - period".

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index baca9fa37a91..52ba848d6828 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2442,8 +2442,19 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
 	apic_timer_expired(apic, true);
 
 	if (lapic_is_periodic(apic)) {
+		ktime_t now, expires, leftlimit;
+
 		advance_periodic_target_expiration(apic);
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+
+		/* Advance timer if its far behind */
+		now = ktime_get();
+		expires = hrtimer_get_expires(&ktimer->timer);
+		leftlimit = ktime_sub_ns(now, ktimer->period);
+
+		if (ktimer->period > 0 && ktime_before(expires, leftlimit))
+			hrtimer_set_expires(&ktimer->timer, leftlimit);
+
 		return HRTIMER_RESTART;
 	} else
 		return HRTIMER_NORESTART;