Roman Kagan <rkagan@xxxxxxxxxxxxx> writes: > On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote: >> Hyper-V supports Live Migration notification. This is supposed to be used >> in conjunction with TSC emulation: when we are migrated to a host with >> different TSC frequency for some short period host emulates our accesses >> to TSC and sends us an interrupt to notify about the event. When we're >> done updating everything we can disable TSC emulation and everything will >> start working fast again. >> >> We didn't need these notifications before as Hyper-V guests are not >> supposed to use TSC as a clocksource: in Linux we even mark it as unstable >> on boot. Guests normally use 'tsc page' clocksource and host updates its >> values on migrations automatically. >> >> Things change when we want to run nested virtualization: even when we pass >> through PV clocksources (kvm-clock or tsc page) to our guests we need to >> know TSC frequency and when it changes. >> >> Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies >> EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The >> right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that >> the fix is on the way. >> >> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx> >> --- >> RFC -> v1: >> - #include <asm/apic.h> [kbuild test robot] >> - use div64_u64() [kbuild test robot] >> - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug >> in Hyper-V hypervisor and disabling emulation after receiving interrupt >> may screw TSC counters. > > Looks kinda fragile... I believe this is temporary and Microsoft will fix things up on the host side. 
> >> --- >> arch/x86/entry/entry_64.S | 4 +++ >> arch/x86/hyperv/hv_init.c | 71 ++++++++++++++++++++++++++++++++++++++ >> arch/x86/include/asm/entry_arch.h | 4 +++ >> arch/x86/include/asm/hw_irq.h | 1 + >> arch/x86/include/asm/irq_vectors.h | 7 +++- >> arch/x86/include/asm/mshyperv.h | 8 +++++ >> arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++ >> arch/x86/kernel/idt.c | 3 ++ >> 8 files changed, 124 insertions(+), 1 deletion(-) >> >> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S >> index f81d50d7ceac..a32730b260bc 100644 >> --- a/arch/x86/entry/entry_64.S >> +++ b/arch/x86/entry/entry_64.S >> @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt >> apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt >> #endif >> >> +#if IS_ENABLED(CONFIG_HYPERV) >> +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR hyperv_reenlightenment_intr smp_hyperv_reenlightenment_intr >> +#endif >> + >> /* >> * Exception entry points. 
>> */ >> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c >> index 1a6c63f721bc..bb62839ede81 100644 >> --- a/arch/x86/hyperv/hv_init.c >> +++ b/arch/x86/hyperv/hv_init.c >> @@ -18,6 +18,7 @@ >> */ >> >> #include <linux/types.h> >> +#include <asm/apic.h> >> #include <asm/hypervisor.h> >> #include <asm/hyperv.h> >> #include <asm/mshyperv.h> >> @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu) >> return 0; >> } >> >> +static void (*hv_reenlightenment_cb)(void); >> + >> +static void hv_reenlightenment_notify(struct work_struct *dummy) >> +{ >> + if (hv_reenlightenment_cb) >> + hv_reenlightenment_cb(); >> +} >> +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); >> + >> +void hyperv_stop_tsc_emulation(void) >> +{ >> + u64 freq; >> + struct hv_tsc_emulation_status emu_status; >> + >> + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); >> + emu_status.inprogress = 0; >> + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); >> + >> + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); > > IIRC the availability of this msr is not guaranteed (I guess in reality > it's present if the reenlightenment is supported, but I'd rather check). > Will do. >> + tsc_khz = div64_u64(freq, 1000); >> +} >> +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); >> + >> +void register_hv_tsc_update(void (*cb)(void)) >> +{ > > The function name seems unfortunate. IMHO such a name suggests > registering a callback on a notifier chain (rather than unconditionally > replacing the old callback), and having no other side effects. I see, any suggestion? register_hv_reenlightenment_cb? register_hv_tscchange_cb? 
> >> + struct hv_reenlightenment_control re_ctrl = { >> + .vector = HYPERV_REENLIGHTENMENT_VECTOR, >> + .enabled = 1, >> + .target_vp = hv_vp_index[smp_processor_id()] >> + }; >> + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; >> + >> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)) >> + return; > > What happens then? L2 guests keep running with their clocks ticking at > a different speed? > In reality this never happens -- in case nested virtualization is supported, reenlightenment is also available. In theory, L0 can emulate TSC accesses forever after migration. >> + >> + hv_reenlightenment_cb = cb; >> + >> + /* Make sure callback is registered before we write to MSRs */ >> + wmb(); >> + >> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); >> + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); >> +} >> +EXPORT_SYMBOL_GPL(register_hv_tsc_update); >> + >> +void unregister_hv_tsc_update(void) >> +{ >> + struct hv_reenlightenment_control re_ctrl; >> + >> + if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)) >> + return; >> + >> + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); >> + re_ctrl.enabled = 0; >> + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); >> + >> + hv_reenlightenment_cb = NULL; >> +} >> +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update); >> + >> +asmlinkage __visible void >> +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs) >> +{ >> + entering_ack_irq(); >> + >> + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); >> + >> + exiting_irq(); >> +} >> + >> /* >> * This function is to be invoked early in the boot sequence after the >> * hypervisor has been detected. 
>> diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h >> index 416422762845..eb936cc49b62 100644 >> --- a/arch/x86/include/asm/entry_arch.h >> +++ b/arch/x86/include/asm/entry_arch.h >> @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) >> BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR) >> #endif >> #endif >> + >> +#if IS_ENABLED(CONFIG_HYPERV) >> +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR) >> +#endif >> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h >> index 2851077b6051..c65193dac7d9 100644 >> --- a/arch/x86/include/asm/hw_irq.h >> +++ b/arch/x86/include/asm/hw_irq.h >> @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void); >> extern asmlinkage void kvm_posted_intr_nested_ipi(void); >> extern asmlinkage void error_interrupt(void); >> extern asmlinkage void irq_work_interrupt(void); >> +extern asmlinkage void hyperv_reenlightenment_intr(void); >> >> extern asmlinkage void spurious_interrupt(void); >> extern asmlinkage void thermal_interrupt(void); >> diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h >> index 67421f649cfa..e71c1120426b 100644 >> --- a/arch/x86/include/asm/irq_vectors.h >> +++ b/arch/x86/include/asm/irq_vectors.h >> @@ -103,7 +103,12 @@ >> #endif >> >> #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef >> -#define LOCAL_TIMER_VECTOR 0xee >> + >> +#if IS_ENABLED(CONFIG_HYPERV) >> +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee >> +#endif >> + >> +#define LOCAL_TIMER_VECTOR 0xed >> >> #define NR_VECTORS 256 >> >> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h >> index a0b34994f453..43164b097585 100644 >> --- a/arch/x86/include/asm/mshyperv.h >> +++ b/arch/x86/include/asm/mshyperv.h >> @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void); >> void hyperv_report_panic(struct pt_regs *regs, long err); >> bool hv_is_hypercall_page_setup(void); >> void 
hyperv_cleanup(void); >> + >> +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs); >> +void register_hv_tsc_update(void (*cb)(void)); >> +void unregister_hv_tsc_update(void); >> +void hyperv_stop_tsc_emulation(void); >> #else /* CONFIG_HYPERV */ >> static inline void hyperv_init(void) {} >> static inline bool hv_is_hypercall_page_setup(void) { return false; } >> static inline void hyperv_cleanup(void) {} >> static inline void hyperv_setup_mmu_ops(void) {} >> +static inline void register_hv_tsc_update(void (*cb)(void)) {} >> +static inline void unregister_hv_tsc_update(void) {} >> +static inline void hyperv_stop_tsc_emulation(void) {}; >> #endif /* CONFIG_HYPERV */ >> >> #ifdef CONFIG_HYPERV_TSCPAGE >> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h >> index 1a5bfead93b4..197c2e6c7376 100644 >> --- a/arch/x86/include/uapi/asm/hyperv.h >> +++ b/arch/x86/include/uapi/asm/hyperv.h >> @@ -40,6 +40,9 @@ >> */ >> #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) >> >> +/* AccessReenlightenmentControls privilege */ >> +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) >> + >> /* >> * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM >> * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available >> @@ -234,6 +237,30 @@ >> #define HV_X64_MSR_CRASH_PARAMS \ >> (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) >> >> +/* TSC emulation after migration */ >> +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 >> + >> +struct hv_reenlightenment_control { >> + u64 vector:8; >> + u64 reserved1:8; >> + u64 enabled:1; >> + u64 reserved2:15; >> + u64 target_vp:32; >> +}; >> + >> +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 >> +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 >> + >> +struct hv_tsc_emulation_control { >> + u64 enabled:1; >> + u64 reserved:63; >> +}; >> + >> +struct hv_tsc_emulation_status { >> + u64 inprogress:1; >> + u64 reserved:63; >> +}; >> + >> #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 >> 
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 >> #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ >> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c >> index d985cef3984f..5b8512d48aa3 100644 >> --- a/arch/x86/kernel/idt.c >> +++ b/arch/x86/kernel/idt.c >> @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = { >> # ifdef CONFIG_IRQ_WORK >> INTG(IRQ_WORK_VECTOR, irq_work_interrupt), >> # endif >> +#if IS_ENABLED(CONFIG_HYPERV) >> + INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr), >> +#endif >> INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), >> INTG(ERROR_APIC_VECTOR, error_interrupt), >> #endif > > Roman. -- Vitaly