Re: [PATCH 3/6] x86/hyper-v: reenlightenment notifications support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Dec 08, 2017 at 11:49:57AM +0100, Vitaly Kuznetsov wrote:
> Hyper-V supports Live Migration notification. This is supposed to be used
> in conjunction with TSC emulation: when we are migrated to a host with a
> different TSC frequency, for some short period the host emulates our
> accesses to TSC and sends us an interrupt to notify about the event. When we're
> done updating everything we can disable TSC emulation and everything will
> start working fast again.
> 
> We didn't need these notifications before as Hyper-V guests are not
> supposed to use TSC as a clocksource: in Linux we even mark it as unstable
> on boot. Guests normally use 'tsc page' clocksource and the host updates its
> values on migrations automatically.
> 
> Things change when we want to run nested virtualization: even when we pass
> through PV clocksources (kvm-clock or tsc page) to our guests we need to
> know TSC frequency and when it changes.
> 
> Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies
> EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The
> right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that
> the fix is on the way.
> 
> Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
> ---
> RFC -> v1:
> - #include <asm/apic.h> [kbuild test robot]
> - use div64_u64() [kbuild test robot]
> - DECLARE_WORK -> DECLARE_DELAYED_WORK as testing showed there's some bug
>   in Hyper-V hypervisor and disabling emulation after receiving interrupt
>   may screw TSC counters.

Looks kinda fragile...

> ---
>  arch/x86/entry/entry_64.S          |  4 +++
>  arch/x86/hyperv/hv_init.c          | 71 ++++++++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/entry_arch.h  |  4 +++
>  arch/x86/include/asm/hw_irq.h      |  1 +
>  arch/x86/include/asm/irq_vectors.h |  7 +++-
>  arch/x86/include/asm/mshyperv.h    |  8 +++++
>  arch/x86/include/uapi/asm/hyperv.h | 27 +++++++++++++++
>  arch/x86/kernel/idt.c              |  3 ++
>  8 files changed, 124 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f81d50d7ceac..a32730b260bc 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -826,6 +826,10 @@ apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt
>  apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
>  #endif
>  
> +#if IS_ENABLED(CONFIG_HYPERV)
> +apicinterrupt HYPERV_REENLIGHTENMENT_VECTOR	hyperv_reenlightenment_intr	smp_hyperv_reenlightenment_intr
> +#endif
> +
>  /*
>   * Exception entry points.
>   */
> diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
> index 1a6c63f721bc..bb62839ede81 100644
> --- a/arch/x86/hyperv/hv_init.c
> +++ b/arch/x86/hyperv/hv_init.c
> @@ -18,6 +18,7 @@
>   */
>  
>  #include <linux/types.h>
> +#include <asm/apic.h>
>  #include <asm/hypervisor.h>
>  #include <asm/hyperv.h>
>  #include <asm/mshyperv.h>
> @@ -102,6 +103,76 @@ static int hv_cpu_init(unsigned int cpu)
>  	return 0;
>  }
>  
> +static void (*hv_reenlightenment_cb)(void);
> +
> +static void hv_reenlightenment_notify(struct work_struct *dummy)
> +{
> +	if (hv_reenlightenment_cb)
> +		hv_reenlightenment_cb();
> +}
> +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify);
> +
> +void hyperv_stop_tsc_emulation(void)
> +{
> +	u64 freq;
> +	struct hv_tsc_emulation_status emu_status;
> +
> +	rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
> +	emu_status.inprogress = 0;
> +	wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status);
> +
> +	rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq);

IIRC the availability of this MSR is not guaranteed (I guess in reality
it's present if reenlightenment is supported, but I'd rather check).

> +	tsc_khz = div64_u64(freq, 1000);
> +}
> +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
> +
> +void register_hv_tsc_update(void (*cb)(void))
> +{

The function name seems unfortunate.  IMHO such a name suggests
registering a callback on a notifier chain (rather than unconditionally
replacing the old callback), and having no other side effects.

> +	struct hv_reenlightenment_control re_ctrl = {
> +		.vector = HYPERV_REENLIGHTENMENT_VECTOR,
> +		.enabled = 1,
> +		.target_vp = hv_vp_index[smp_processor_id()]
> +	};
> +	struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1};
> +
> +	if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
> +		return;

What happens then?  L2 guests keep running with their clocks ticking at
a different speed?

> +
> +	hv_reenlightenment_cb = cb;
> +
> +	/* Make sure callback is registered before we write to MSRs */
> +	wmb();
> +
> +	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl));
> +	wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl));
> +}
> +EXPORT_SYMBOL_GPL(register_hv_tsc_update);
> +
> +void unregister_hv_tsc_update(void)
> +{
> +	struct hv_reenlightenment_control re_ctrl;
> +
> +	if (!(ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT))
> +		return;
> +
> +	rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
> +	re_ctrl.enabled = 0;
> +	wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl);
> +
> +	hv_reenlightenment_cb = NULL;
> +}
> +EXPORT_SYMBOL_GPL(unregister_hv_tsc_update);
> +
> +asmlinkage __visible void
> +__irq_entry smp_hyperv_reenlightenment_intr(struct pt_regs *regs)
> +{
> +	entering_ack_irq();
> +
> +	schedule_delayed_work(&hv_reenlightenment_work, HZ/10);
> +
> +	exiting_irq();
> +}
> +
>  /*
>   * This function is to be invoked early in the boot sequence after the
>   * hypervisor has been detected.
> diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
> index 416422762845..eb936cc49b62 100644
> --- a/arch/x86/include/asm/entry_arch.h
> +++ b/arch/x86/include/asm/entry_arch.h
> @@ -54,3 +54,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
>  BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
>  #endif
>  #endif
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> +BUILD_INTERRUPT(hyperv_reenlightenment_intr, HYPERV_REENLIGHTENMENT_VECTOR)
> +#endif
> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
> index 2851077b6051..c65193dac7d9 100644
> --- a/arch/x86/include/asm/hw_irq.h
> +++ b/arch/x86/include/asm/hw_irq.h
> @@ -36,6 +36,7 @@ extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
>  extern asmlinkage void kvm_posted_intr_nested_ipi(void);
>  extern asmlinkage void error_interrupt(void);
>  extern asmlinkage void irq_work_interrupt(void);
> +extern asmlinkage void hyperv_reenlightenment_intr(void);
>  
>  extern asmlinkage void spurious_interrupt(void);
>  extern asmlinkage void thermal_interrupt(void);
> diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
> index 67421f649cfa..e71c1120426b 100644
> --- a/arch/x86/include/asm/irq_vectors.h
> +++ b/arch/x86/include/asm/irq_vectors.h
> @@ -103,7 +103,12 @@
>  #endif
>  
>  #define MANAGED_IRQ_SHUTDOWN_VECTOR	0xef
> -#define LOCAL_TIMER_VECTOR		0xee
> +
> +#if IS_ENABLED(CONFIG_HYPERV)
> +#define HYPERV_REENLIGHTENMENT_VECTOR	0xee
> +#endif
> +
> +#define LOCAL_TIMER_VECTOR		0xed
>  
>  #define NR_VECTORS			 256
>  
> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
> index a0b34994f453..43164b097585 100644
> --- a/arch/x86/include/asm/mshyperv.h
> +++ b/arch/x86/include/asm/mshyperv.h
> @@ -314,11 +314,19 @@ void hyper_alloc_mmu(void);
>  void hyperv_report_panic(struct pt_regs *regs, long err);
>  bool hv_is_hypercall_page_setup(void);
>  void hyperv_cleanup(void);
> +
> +asmlinkage void smp_hyperv_reenlightenment_intr(struct pt_regs *regs);
> +void register_hv_tsc_update(void (*cb)(void));
> +void unregister_hv_tsc_update(void);
> +void hyperv_stop_tsc_emulation(void);
>  #else /* CONFIG_HYPERV */
>  static inline void hyperv_init(void) {}
>  static inline bool hv_is_hypercall_page_setup(void) { return false; }
>  static inline void hyperv_cleanup(void) {}
>  static inline void hyperv_setup_mmu_ops(void) {}
> +static inline void register_hv_tsc_update(void (*cb)(void)) {}
> +static inline void unregister_hv_tsc_update(void) {}
> +static inline void hyperv_stop_tsc_emulation(void) {};
>  #endif /* CONFIG_HYPERV */
>  
>  #ifdef CONFIG_HYPERV_TSCPAGE
> diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
> index 1a5bfead93b4..197c2e6c7376 100644
> --- a/arch/x86/include/uapi/asm/hyperv.h
> +++ b/arch/x86/include/uapi/asm/hyperv.h
> @@ -40,6 +40,9 @@
>   */
>  #define HV_X64_ACCESS_FREQUENCY_MSRS		(1 << 11)
>  
> +/* AccessReenlightenmentControls privilege */
> +#define HV_X64_ACCESS_REENLIGHTENMENT		BIT(13)
> +
>  /*
>   * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
>   * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
> @@ -234,6 +237,30 @@
>  #define HV_X64_MSR_CRASH_PARAMS		\
>  		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
>  
> +/* TSC emulation after migration */
> +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL	0x40000106
> +
> +struct hv_reenlightenment_control {
> +	u64 vector:8;
> +	u64 reserved1:8;
> +	u64 enabled:1;
> +	u64 reserved2:15;
> +	u64 target_vp:32;
> +};
> +
> +#define HV_X64_MSR_TSC_EMULATION_CONTROL	0x40000107
> +#define HV_X64_MSR_TSC_EMULATION_STATUS		0x40000108
> +
> +struct hv_tsc_emulation_control {
> +	u64 enabled:1;
> +	u64 reserved:63;
> +};
> +
> +struct hv_tsc_emulation_status {
> +	u64 inprogress:1;
> +	u64 reserved:63;
> +};
> +
>  #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
>  #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
>  #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index d985cef3984f..5b8512d48aa3 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -140,6 +140,9 @@ static const __initdata struct idt_data apic_idts[] = {
>  # ifdef CONFIG_IRQ_WORK
>  	INTG(IRQ_WORK_VECTOR,		irq_work_interrupt),
>  # endif
> +#if IS_ENABLED(CONFIG_HYPERV)
> +	INTG(HYPERV_REENLIGHTENMENT_VECTOR, hyperv_reenlightenment_intr),
> +#endif
>  	INTG(SPURIOUS_APIC_VECTOR,	spurious_interrupt),
>  	INTG(ERROR_APIC_VECTOR,		error_interrupt),
>  #endif


Roman.



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux