On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote: > Hyper-V supports Live Migration notification. This is supposed to be used > in conjunction with TSC emulation: when we are migrated to a host with > different TSC frequency for some short period host emulates our accesses > to TSC and sends us an interrupt to notify about the event. When we're > done updating everything we can disable TSC emulation and everything will > start working fast again. > > We didn't need these notifications before as Hyper-V guests are not > supposed to use TSC as a clocksource: in Linux we even mark it as unstable > on boot. Guests normally use 'tsc page' clocksouce and host updates its > values on migrations automatically. > > Things change when we want to run nested virtualization: even when we pass > through PV clocksources (kvm-clock or tsc page) to our guests we need to > know TSC frequency and when it changes. > > Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies > EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The > right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that > the fix in on the way. > > Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> > --- > RFC -> v1: > - #include <asm/apic.h> [kbuild test robot] > - use div64_u64() [kbuild test robot] > - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug > in Hyper-V hypervisor and disabling emulation after receiving interrupt > may screw TSC counters. Looks kinda fragile... > --- > arch/x86/entry/entry_64.S | 4 +++ > arch/x86/hyperv/hv_init.c | 71 ++++++++++++++++++++++++++++++++++++++ > arch/x86/include/asm/entry_arch.h | 4 +++ > arch/x86/include/asm/hw_irq.h | 1 + > arch/x86/include/asm/irq_vectors.h | 7 +++- > arch/x86/include/asm/mshyperv.h | 8 +++++ > arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++ > arch/x86/kernel/idt.c | 3 ++ > 8 files changed, 124 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S > index f81d50d7ceac..a32730b260bc 100644 > --- a/arch/x86/entry/entry_64.S > +++ b/arch/x86/entry/entry_64.S > @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt > apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt > #endif > > +#if IS_ENABLED(CONFIG_HYPERV) > +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR hyperv_reenlightenment_intr smp_hyperv_reenlightenment_intr > +#endif > + > /* > * Exception entry points. > */ > diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c > index 1a6c63f721bc..bb62839ede81 100644 > --- a/arch/x86/hyperv/hv_init.c > +++ b/arch/x86/hyperv/hv_init.c > @@ -18,6 +18,7 @@ > */ > > #include <linux/types.h> > +#include <asm/apic.h> > #include <asm/hypervisor.h> > #include <asm/hyperv.h> > #include <asm/mshyperv.h> > @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu) > return 0; > } > > +static void (*hv_reenlightenment_cb)(void); > + > +static void hv_reenlightenment_notify(struct work_struct *dummy) > +{ > + if (hv_reenlightenment_cb) > + hv_reenlightenment_cb(); > +} > +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); > + > +void hyperv_stop_tsc_emulation(void) > +{ > + u64 freq; > + struct hv_tsc_emulation_status emu_status; > + > + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); > + emu_status.inprogress = 0; > + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); > + > + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); IIRC the availability of this msr is not guaranteed (I guess in reality it's present if the reenlightenment is supported, but I'd rather check). > + tsc_khz = div64_u64(freq, 1000); > +} > +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); > + > +void register_hv_tsc_update(void (*cb)(void)) > +{ The function name seems unfortunate. IMHO such a name suggests registering a callback on a notifier chain (rather than unconditionally replacing the old callback), and having no other side effects. > + struct hv_reenlightenment_control re_ctrl = { > + .vector = HYPERV_REENLIGHTENMENT_VECTOR, > + .enabled = 1, > + .target_vp = hv_vp_index[smp_processor_id()] > + }; > + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; > + > + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)) > + return; What happens then? L2 guests keep running with their clocks ticking at a different speed? > + > + hv_reenlightenment_cb = cb; > + > + /* Make sure callback is registered before we write to MSRs */ > + wmb(); > + > + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); > + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); > +} > +EXPORT_SYMBOL_GPL(register_hv_tsc_update); > + > +void unregister_hv_tsc_update(void) > +{ > + struct hv_reenlightenment_control re_ctrl; > + > + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)) > + return; > + > + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); > + re_ctrl.enabled = 0; > + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); > + > + hv_reenlightenment_cb = NULL; > +} > +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update); > + > +asmlinkage __visible void > +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs) > +{ > + entering_ack_irq(); > + > + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); > + > + exiting_irq(); > +} > + > /* > * This function is to be invoked early in the boot sequence after the > * hypervisor has been detected. > diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h > index 416422762845..eb936cc49b62 100644 > --- a/arch/x86/include/asm/entry_arch.h > +++ b/arch/x86/include/asm/entry_arch.h > @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) > BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) > #endif > #endif > + > +#if IS_ENABLED(CONFIG_HYPERV) > +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR) > +#endif > diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h > index 2851077b6051..c65193dac7d9 100644 > --- a/arch/x86/include/asm/hw_irq.h > +++ b/arch/x86/include/asm/hw_irq.h > @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); > extern asmlinkage void kvm_posted_intr_nested_ipi(void); > extern asmlinkage void error_interrupt(void); > extern asmlinkage void irq_work_interrupt(void); > +extern asmlinkage void hyperv_reenlightenment_intr(void); > > extern asmlinkage void spurious_interrupt(void); > extern asmlinkage void thermal_interrupt(void); > diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h > index 67421f649cfa..e71c1120426b 100644 > --- a/arch/x86/include/asm/irq_vectors.h > +++ b/arch/x86/include/asm/irq_vectors.h > @@ -103,7 +103,12 @@ > #endif > > #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef > -#define LOCAL_TIMER_VECTOR 0xee > + > +#if IS_ENABLED(CONFIG_HYPERV) > +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee > +#endif > + > +#define LOCAL_TIMER_VECTOR 0xed > > #define NR_VECTORS 256 > > diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h > index a0b34994f453..43164b097585 100644 > --- a/arch/x86/include/asm/mshyperv.h > +++ b/arch/x86/include/asm/mshyperv.h > @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void); > void hyperv_report_panic(struct pt_regs *regs, long err); > bool hv_is_hypercall_page_setup(void); > void hyperv_cleanup(void); > + > +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs); > +void register_hv_tsc_update(void (*cb)(void)); > +void unregister_hv_tsc_update(void); > +void hyperv_stop_tsc_emulation(void); > #else /* CONFIG_HYPERV */ > static inline void hyperv_init(void) {} > static inline bool hv_is_hypercall_page_setup(void) { return false; } > static inline void hyperv_cleanup(void) {} > static inline void hyperv_setup_mmu_ops(void) {} > +static inline void register_hv_tsc_update(void (*cb)(void)) {} > +static inline void unregister_hv_tsc_update(void) {} > +static inline void hyperv_stop_tsc_emulation(void) {}; > #endif /* CONFIG_HYPERV */ > > #ifdef CONFIG_HYPERV_TSCPAGE > diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h > index 1a5bfead93b4..197c2e6c7376 100644 > --- a/arch/x86/include/uapi/asm/hyperv.h > +++ b/arch/x86/include/uapi/asm/hyperv.h > @@ -40,6 +40,9 @@ > */ > #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) > > +/* AccessReenlightenmentControls privilege */ > +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) > + > /* > * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM > * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available > @@ -234,6 +237,30 @@ > #define HV_X64_MSR_CRASH_PARAMS \ > (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) > > +/* TSC emulation after migration */ > +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 > + > +struct hv_reenlightenment_control { > + u64 vector:8; > + u64 reserved1:8; > + u64 enabled:1; > + u64 reserved2:15; > + u64 target_vp:32; > +}; > + > +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 > +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 > + > +struct hv_tsc_emulation_control { > + u64 enabled:1; > + u64 reserved:63; > +}; > + > +struct hv_tsc_emulation_status { > + u64 inprogress:1; > + u64 reserved:63; > +}; > + > #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 > #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 > #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ > diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c > index d985cef3984f..5b8512d48aa3 100644 > --- a/arch/x86/kernel/idt.c > +++ b/arch/x86/kernel/idt.c > @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = { > # ifdef CONFIG_IRQ_WORK > INTG(IRQ_WORK_VECTOR, irq_work_interrupt), > # endif > +#if IS_ENABLED(CONFIG_HYPERV) > + INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr), > +#endif > INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), > INTG(ERROR_APIC_VECTOR, error_interrupt), > #endif Roman.