irq_work's use of the DEC SPR is racy with guest<->host switch and guest entry which flips the DEC interrupt to guest, which could lose a host work interrupt. This patch closes one race, and attempts to comment another class of races. Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx> --- arch/powerpc/include/asm/time.h | 12 ++++++++++++ arch/powerpc/kernel/time.c | 10 ---------- arch/powerpc/kvm/book3s_hv.c | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 8dd3cdb25338..8c2c3dd4ddba 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -97,6 +97,18 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low, extern void secondary_cpu_time_init(void); extern void __init time_init(void); +#ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) +{ + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; +} +#endif + DECLARE_PER_CPU(u64, decrementers_next_tb); /* Convert timebase ticks to nanoseconds */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index b67d93a609a2..da995c5fb97d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -508,16 +508,6 @@ EXPORT_SYMBOL(profile_pc); * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... */ #ifdef CONFIG_PPC64 -static inline unsigned long test_irq_work_pending(void) -{ - unsigned long x; - - asm volatile("lbz %0,%1(13)" - : "=r" (x) - : "i" (offsetof(struct paca_struct, irq_work_pending))); - return x; -} - static inline void set_irq_work_pending_flag(void) { asm volatile("stb %0,%1(13)" : : diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 48df339affdf..bdc24e9cb096 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3708,6 +3708,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (!(vcpu->arch.ctrl & 1)) mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + /* + * When setting DEC, we must always deal with irq_work_raise via NMI vs + * setting DEC. The problem occurs right as we switch into guest mode + * if a NMI hits and sets pending work and sets DEC, then that will + * apply to the guest and not bring us back to the host. + * + * irq_work_raise could check a flag (or possibly LPCR[HDICE] for + * example) and set HDEC to 1? That wouldn't solve the nested hv + * case which needs to abort the hcall or zero the time limit. + * + * XXX: Another day's problem. + */ mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); if (kvmhv_on_pseries()) { @@ -3822,6 +3834,9 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->in_guest = 0; mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + /* We may have raced with new irq work */ + if (test_irq_work_pending()) + set_dec(1); mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu(); -- 2.23.0