Re: [PATCH v5 untested] kvm: better MWAIT emulation for guests

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Thu, 16 Mar 2017 18:45:02 +0200

On Thu, Mar 16, 2017 at 12:16:13PM -0400, Gabriel L. Somlo wrote:
> On Thu, Mar 16, 2017 at 04:35:18PM +0100, Radim Krčmář wrote:
> > 2017-03-16 10:58-0400, Gabriel L. Somlo:
> > > On Thu, Mar 16, 2017 at 04:04:12PM +0200, Michael S. Tsirkin wrote:
> > > > On Thu, Mar 16, 2017 at 09:24:27AM -0400, Gabriel L. Somlo wrote:
> > > > > After studying your patch a bit more carefully (sorry, it's crazy
> > > > > around here right now :) ) I realized you're simply trying to
> > > > > (selectively) decide when to exit L1 and emulate as NOP vs. when to
> > > > > just allow L1 to execute MONITOR & MWAIT natively.
> > > > > 
> > > > > Is that right ? Because if so, the issues I saw on my MacPro1,1 are
> > > > > weird and inexplicable, given that allowing L>=1 to run MONITOR/MWAIT
> > > > > natively was one of the options Alex Graf and Rene Rebe used back in
> > > > > the very early days of OS X on QEMU, at the time I got involved with
> > > > > that project. Here's part of an out of tree patch against 3.4 which did
> > > > > just that, and worked as far as I remember on *any* MWAIT capable
> > > > > intel chip I had access to back in 2010:
> > > > > 
> > > > > ##############################################################################
> > > > > # 99-mwait.patch.kvm-kmod (Rene Rebe <rene@xxxxxxxxxxxx>) 2010-04-27
> > > > > ##############################################################################
> > > > > diff -pNarU5 linux-3.4/arch/x86/kvm/cpuid.c linux-3.4-mac/arch/x86/kvm/cpuid.c
> > > > > --- linux-3.4/arch/x86/kvm/cpuid.c	2012-05-20 18:29:13.000000000 -0400
> > > > > +++ linux-3.4-mac/arch/x86/kvm/cpuid.c	2012-10-09 11:42:59.921215750 -0400
> > > > > @@ -222,11 +222,11 @@ static int do_cpuid_ent(struct kvm_cpuid
> > > > >  		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
> > > > >  		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
> > > > >  		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
> > > > >  	/* cpuid 1.ecx */
> > > > >  	const u32 kvm_supported_word4_x86_features =
> > > > > -		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
> > > > > +		F(XMM3) | F(PCLMULQDQ) | F(MWAIT) /* DTES64, MONITOR */ |
> > > > >  		0 /* DS-CPL, VMX, SMX, EST */ |
> > > > >  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
> > > > >  		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
> > > > >  		0 /* Reserved, DCA */ | F(XMM4_1) |
> > > > >  		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
> > > > > diff -pNarU5 linux-3.4/arch/x86/kvm/svm.c linux-3.4-mac/arch/x86/kvm/svm.c
> > > > > --- linux-3.4/arch/x86/kvm/svm.c	2012-05-20 18:29:13.000000000 -0400
> > > > > +++ linux-3.4-mac/arch/x86/kvm/svm.c	2012-10-09 11:44:41.598997481 -0400
> > > > > @@ -1102,12 +1102,10 @@ static void init_vmcb(struct vcpu_svm *s
> > > > >  	set_intercept(svm, INTERCEPT_VMSAVE);
> > > > >  	set_intercept(svm, INTERCEPT_STGI);
> > > > >  	set_intercept(svm, INTERCEPT_CLGI);
> > > > >  	set_intercept(svm, INTERCEPT_SKINIT);
> > > > >  	set_intercept(svm, INTERCEPT_WBINVD);
> > > > > -	set_intercept(svm, INTERCEPT_MONITOR);
> > > > > -	set_intercept(svm, INTERCEPT_MWAIT);
> > > > >  	set_intercept(svm, INTERCEPT_XSETBV);
> > > > >  
> > > > >  	control->iopm_base_pa = iopm_base;
> > > > >  	control->msrpm_base_pa = __pa(svm->msrpm);
> > > > >  	control->int_ctl = V_INTR_MASKING_MASK;
> > > > > diff -pNarU5 linux-3.4/arch/x86/kvm/vmx.c linux-3.4-mac/arch/x86/kvm/vmx.c
> > > > > --- linux-3.4/arch/x86/kvm/vmx.c	2012-05-20 18:29:13.000000000 -0400
> > > > > +++ linux-3.4-mac/arch/x86/kvm/vmx.c	2012-10-09 11:42:59.925215977 -0400
> > > > > @@ -1938,11 +1938,11 @@ static __init void nested_vmx_setup_ctls
> > > > >  		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
> > > > >  	nested_vmx_procbased_ctls_low = 0;
> > > > >  	nested_vmx_procbased_ctls_high &=
> > > > >  		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
> > > > >  		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
> > > > > -		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
> > > > > +		CPU_BASED_CR3_LOAD_EXITING |
> > > > >  		CPU_BASED_CR3_STORE_EXITING |
> > > > >  #ifdef CONFIG_X86_64
> > > > >  		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
> > > > >  #endif
> > > > >  		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
> > > > > @@ -2404,12 +2404,10 @@ static __init int setup_vmcs_config(stru
> > > > >  	      CPU_BASED_CR3_LOAD_EXITING |
> > > > >  	      CPU_BASED_CR3_STORE_EXITING |
> > > > >  	      CPU_BASED_USE_IO_BITMAPS |
> > > > >  	      CPU_BASED_MOV_DR_EXITING |
> > > > >  	      CPU_BASED_USE_TSC_OFFSETING |
> > > > > -	      CPU_BASED_MWAIT_EXITING |
> > > > > -	      CPU_BASED_MONITOR_EXITING |
> > > > >  	      CPU_BASED_INVLPG_EXITING |
> > > > >  	      CPU_BASED_RDPMC_EXITING;
> > > > >  
> > > > >  	opt = CPU_BASED_TPR_SHADOW |
> > > > >  	      CPU_BASED_USE_MSR_BITMAPS |
> > > > > 
> > > > > If all you're trying to do is (selectively) revert to this behavior,
> > > > > that "shouldn't" mess it up for the MacPro either, so I'm thoroughly
> > > > > confused at this point :)
> > > > 
> > > > Yes.  Me too. Want to try that other patch and see what happens?
> > > 
> > > You mean the old 3.4 patch against current KVM ? I'll try to do that,
> > > might take me a while :)
> > 
> > Michael's patch already did most of that, you just need to add
> > 
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> > index efde6cc50875..b12f07d4ce17 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
> >  	const u32 kvm_cpuid_1_ecx_x86_features =
> >  		/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
> >  		 * but *not* advertised to guests via CPUID ! */
> > -		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
> > +		F(XMM3) | F(PCLMULQDQ) | F(MWAIT) /* DTES64, MONITOR */ |
> >  		0 /* DS-CPL, VMX, SMX, EST */ |
> >  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
> >  		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
> > 
> > Note: this will never be upstream, because mwait isn't what we want by
> > default. :)
> 
> But since OS X doesn't check CPUID and simply runs MONITOR & MWAIT
> assuming they're present, the above one-liner would make no
> difference. If everything else in the old patch I quoted is identical
> to what Michael does, then I don't know -- maybe the MacPro1,1 has
> really broken L>=1 MWAIT, and it only ever worked with vmexit and
> emulation on the host side.

I think I have an idea. It is probably one of the monitor bugs
on this host.

X86_BUG_CLFLUSH_MONITOR or X86_BUG_MONITOR.

If you tell the guest that it has a CPU that does not need the workaround,
but the host CPU does need it, then mwait will not work.

        if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
            (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
                set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);

        if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
                ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
                set_cpu_bug(c, X86_BUG_MONITOR);

what did you say your host model is?

> > >> > Back in 2010, running MWAIT in L>=1  behaved 100% exactly like a NOP,
> > >> > didn't power down the physical CPU, just immediately moved on to the
> > >> > next instruction. As such, there was no power saving and no
> > >> > opportunity to yield to another L0 thread either, unlike with NOP
> > >> > emulation at L0.
> > >> > 
> > >> > Did that change on newer Intel chips (i.e., is guest-mode MWAIT now
> > >> > doing something smarter than just acting as a guest-mode NOP) ?
> > >> > 
> > >> > Thanks,
> > >> > --Gabriel
> > >> 
> > >> Interesting.  What it seems to say is this:
> > >> 
> > >> MWAIT. Behavior of the MWAIT instruction (which always causes an invalid-
> > >> opcode exception—#UD—if CPL > 0) is determined by the setting of the “MWAIT
> > >> exiting” VM-execution control:
> > >> — If the “MWAIT exiting” VM-execution control is 1, MWAIT causes a VM exit
> > >> (see Section 22.1.3).
> > >> — If the “MWAIT exiting” VM-execution control is 0, MWAIT operates normally if
> > >> any of the following is true: (1) the “interrupt-window exiting” VM-execution
> > >> control is 0; (2) ECX[0] is 0; or (3) RFLAGS.IF = 1.
> > >> — If the “MWAIT exiting” VM-execution control is 0, the “interrupt-window
> > >> exiting” VM-execution control is 1, ECX[0] = 1, and RFLAGS.IF = 0, MWAIT
> > >> does not cause the processor to enter an implementation-dependent
> > >> optimized state; instead, control passes to the instruction following the
> > >> MWAIT instruction.
> > >> 
> > >> 
> > >> And since interrupt-window exiting is 0 most of the time for KVM,
> > >> I would expect MWAIT to behave normally.
> > > 
> > > The intel manual said the same thing back in 2010 as well. However,
> > > regardless of how any flags were set, interrupt-window exiting or not,
> > > "normal" L1 MWAIT behavior was that it woke up immediately regardless.
> > > Remember, never going to sleep is still correct ("normal" ?) behavior
> > > per the ISA definition of MWAIT :)
> > 
> > I'll write a simple kvm-unit-test to better understand why it is broken
> > for you ...
> > 
> > > Also, when I tested your patch on the macbook air (where it worked),
> > > not only was the host reporting 400% CPU for qemu (which is to be
> > > expected), but the thermal fan/cooling thing also shifted up into high
> > > gear, which means the physical CPU got hot, which it shouldn't have if
> > > the guest-mode MWAIT actually did put the host CPU into low power.
> > 
> > I tested MWAIT with basically the same kernel patch and the qemu patch
> > with Linux guest on Haswell and Nehalem.  Running the guest took 100% of
> > the host CPUs, but it still had the same temperature as when the host
> > was idle.
> > 
> > That reminds me that you need to pass '-cpu host' for QEMU reasons.
> 
> For OS X to boot, one needs '-cpu core2duo' for <= 10.11, and
> '-cpu Penryn' for 10.12. I never managed to get it working with any
> other settings.
> 
> So I'm ready to write off the MacPro1,1 (unless you want me to run more
> tests and report back for you, which I'm happy to do in any case).
> 
> But please please, so at least I walk away from this having learned
> something :) help me understand the use case:
> 
> 	- By careful setting of vmx flags, and/or on newer, sanely
> 	  built Intel hardware, L1 MWAIT actually powers down the
> 	  physical host core (while I couldn't get it to stay cool
> 	  on my end, I totally believe you managed to pull it off)
> 
> 	- We never admit to supporting MWAIT to guests, but when they
> 	  do anyway (either because they're old/grumpy/careless OS X
> 	  versions, or some newfangled custom-built Linux kernel which
> 	  is hacked to ignore CPUID on purpose), we now allow the
> 	  guest to:
> 		- keep its allotted time slice
> 		- but "waste" it by powering down the host CPU 
> 	  instead of
> 		- vmexit to the host OS at L0
> 		- yield the host core to another L0 runnable thread

NOP doesn't yield automatically, does it? CPU stays runnable,
it just makes it a bit cheaper to switch to another thread
as you don't need to exit.

> Since newer OS X actually checks CPUID, I don't have a major stake in
> one way vs. the other, but I'm really really curious:
> 
> Are we trying to save power assuming the host is unlikely to have
> enough runnable L0 threads for when the L0-emulated NOP yields? So
> we're better off letting the guest keep the CPU but also keep it cool
> while at it (assuming the guest isn't totally hostile and didn't pick
> a setting where L1 MWAIT actually works as L1 NOP, in which case we
> don't even get to stay cool)?
> 
> Man, I wish I had the cycles to resurrect my attempt at actually emulating
> MWAIT with something like a condition queue (below, just for reference).
> 
> Thanks much,
> --Gabriel
> 
> 
> ##############################################################################
> # kvm-mwait-emu.patch (Gabriel Somlo <somlo@xxxxxxx> 2014/02/05)
> #   -- based on an idea suggested by Alex Graf --
> # GLS: emulate MONITOR and MWAIT at page-level granularity by write-protecting
> #       the page containing a monitored location and appropriately handling
> #       subsequent write faults.
> #       After debugging the SMP issue, we'll need a way to trigger a
> #       periodic cleanup that will switch write-protected monitored pages
> #       back to read-write, once they've stayed unused for "long enough"
> ##############################################################################
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fdf83af..7ca9b51 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -337,6 +337,16 @@ struct kvm_pmu {
>  	u64 reprogram_pmi;
>  };
>  
> +/*
> + * mwait-monitored page list element type
> + */
> +struct kvm_mwait_pg {
> +	gpa_t gpa;
> +	struct list_head vcpu_list; /* VCPUs monitoring (armed on) this page */
> +	struct list_head link;      /* links mwait-pages within a KVM */
> +	unsigned accessed;
> +};
> +
>  struct kvm_vcpu_arch {
>  	/*
>  	 * rip and regs accesses must go through
> @@ -528,6 +538,10 @@ struct kvm_vcpu_arch {
>  	struct {
>  		bool pv_unhalted;
>  	} pv;
> +
> +	/* MONITOR/MWAIT support */
> +	struct kvm_mwait_pg *mwp;	/* page monitored by this VCPU */
> +	struct list_head mw_link;	/* all VCPUs monitoring the same page */
>  };
>  
>  struct kvm_lpage_info {
> @@ -607,6 +621,10 @@ struct kvm_arch {
>  	u64 hv_hypercall;
>  	u64 hv_tsc_page;
>  
> +	/* MONITOR/MWAIT support */
> +	struct mutex mwait_lock;
> +	struct list_head mwait_pg_list;	/* monitored pages within this KVM */
> +
>  	#ifdef CONFIG_KVM_MMU_AUDIT
>  	int audit_point;
>  	#endif
> @@ -854,6 +872,8 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
>  void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
>  int kvm_emulate_halt(struct kvm_vcpu *vcpu);
>  int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
> +int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
> +int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
>  
>  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
>  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
> @@ -915,6 +935,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>  		       const u8 *new, int bytes);
>  int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
>  int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
> +int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn);
>  void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
>  int kvm_mmu_load(struct kvm_vcpu *vcpu);
>  void kvm_mmu_unload(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index c697625..7d4f1ca 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -279,6 +279,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
>  		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
>  	/* cpuid 1.ecx */
>  	const u32 kvm_supported_word4_x86_features =
> +		/* OS X does not check CPUID before using MONITOR/MWAIT from its
> +		 * power-optimized idle loop (AppleIntelPowerManagement.kext).
> +		 * For now, we don't advertise MWAIT support below, but attempt
> +		 * to emulate them instead of issuing an invalid opcode fault
> +		 * if a misbehaving guest calls them anyway. Removing the above
> +		 * mentioned kext from OS X will cause it to fall back to a
> +		 * HLT-based idle loop, as an optional guest optimization step.
> +		 */
>  		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
>  		0 /* DS-CPL, VMX, SMX, EST */ |
>  		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e50425d..bc02ebd 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2283,6 +2283,20 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
>  
> +int kvm_mmu_protect_page(struct kvm *kvm, gfn_t gfn)
> +{
> +	int r;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	r = rmap_write_protect(kvm, gfn);
> +	if (r)
> +		kvm_flush_remote_tlbs(kvm);
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_protect_page);
> +
>  /*
>   * The function is based on mtrr_type_lookup() in
>   * arch/x86/kernel/cpu/mtrr/generic.c
> @@ -4146,12 +4160,68 @@ static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
>  	return vcpu_match_mmio_gva(vcpu, addr);
>  }
>  
> +// try to handle fault caused by write to monitored (mwait) page
> +// FIXME: aim for better integration between this and FNAME(page_fault)() and
> +// kvm_mmu_page_fault() below. For now, this is proof-of-concept code.
> +static bool handle_mwait_write_fault(struct kvm_vcpu *vcpu, gva_t gva,
> +					void *in, int in_len)
> +{
> +	gpa_t gpa;
> +	struct kvm_mwait_pg *p, *mwp = NULL;
> +	struct kvm_vcpu_arch *v, *u;
> +	bool r = false;
> +
> +	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
> +	if (gpa == UNMAPPED_GVA)
> +		goto ul_out;
> +
> +	mutex_lock(&vcpu->kvm->arch.mwait_lock);
> +
> +	/* is gpa matching a monitored (mwait) page? */
> +	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
> +		if (p->gpa == gpa) {
> +			mwp = p;
> +			break;
> +		}
> +	if (mwp == NULL)
> +		goto out;
> +
> +	mwp->accessed = 1;
> +
> +	if (x86_emulate_instruction(vcpu, gva,
> +				    EMULTYPE_RETRY, in, in_len) != EMULATE_DONE)
> +		goto out;
> +
> +	/* disarm all VCPUs monitoring this page, waking them if needed */
> +	list_for_each_entry_safe(v, u, &mwp->vcpu_list, mw_link) {
> +		list_del(&v->mw_link);
> +		v->mwp = NULL;
> +		if (v->mp_state == KVM_MP_STATE_MWAIT)
> +			v->mp_state = KVM_MP_STATE_RUNNABLE;
> +	}
> +
> +	// What if the mwait is woken up by an interrupt instead of a write ?
> +	// It might remain "armed" on its old mwait page, but any subsequent
> +	// MONITOR instruction would replace that, so I don't think we need
> +	// to worry about it...
> +
> +	r = true;
> +out:
> +	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
> +ul_out:
> +	return r;
> +}
> +
>  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
>  		       void *insn, int insn_len)
>  {
>  	int r, emulation_type = EMULTYPE_RETRY;
>  	enum emulation_result er;
>  
> +	/* writing to MONITORed memory area ? */
> +	if (handle_mwait_write_fault(vcpu, cr2, insn, insn_len))
> +		return 1;
> +
>  	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
>  	if (r < 0)
>  		goto out;
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index e81df8f..638704c 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -3262,6 +3262,18 @@ static int pause_interception(struct vcpu_svm *svm)
>  	return 1;
>  }
>  
> +static int monitor_interception(struct vcpu_svm *svm)
> +{
> +	skip_emulated_instruction(&(svm->vcpu));
> +	return kvm_emulate_monitor(&(svm->vcpu));
> +}
> +
> +static int mwait_interception(struct vcpu_svm *svm)
> +{
> +	skip_emulated_instruction(&(svm->vcpu));
> +	return kvm_emulate_mwait(&(svm->vcpu));
> +}
> +
>  static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
>  	[SVM_EXIT_READ_CR0]			= cr_interception,
>  	[SVM_EXIT_READ_CR3]			= cr_interception,
> @@ -3319,8 +3331,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
>  	[SVM_EXIT_CLGI]				= clgi_interception,
>  	[SVM_EXIT_SKINIT]			= skinit_interception,
>  	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
> -	[SVM_EXIT_MONITOR]			= invalid_op_interception,
> -	[SVM_EXIT_MWAIT]			= invalid_op_interception,
> +	[SVM_EXIT_MONITOR]			= monitor_interception,
> +	[SVM_EXIT_MWAIT]			= mwait_interception,
>  	[SVM_EXIT_XSETBV]			= xsetbv_interception,
>  	[SVM_EXIT_NPF]				= pf_interception,
>  };
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index a06f101..a7382e1 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5603,6 +5603,18 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static int handle_monitor(struct kvm_vcpu *vcpu)
> +{
> +	skip_emulated_instruction(vcpu);
> +	return kvm_emulate_monitor(vcpu);
> +}
> +
> +static int handle_mwait(struct kvm_vcpu *vcpu)
> +{
> +	skip_emulated_instruction(vcpu);
> +	return kvm_emulate_mwait(vcpu);
> +}
> +
>  /*
>   * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
>   * We could reuse a single VMCS for all the L2 guests, but we also want the
> @@ -6483,8 +6495,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
>  	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
>  	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
> -	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_invalid_op,
> -	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
> +	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
> +	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
>  	[EXIT_REASON_INVEPT]                  = handle_invept,
>  };
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 39c28f09..8edc1be 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5592,6 +5592,70 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_emulate_halt);
>  
> +int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
> +{
> +	gva_t gva;
> +	gpa_t gpa;
> +	struct kvm_mwait_pg *p;
> +
> +	/* emulate as NOP if no-kvm-irqchip */
> +	if (!irqchip_in_kernel(vcpu->kvm))
> +		return 1;
> +
> +	mutex_lock(&vcpu->kvm->arch.mwait_lock);
> +
> +	/* relinquish any previously monitored mwait page */
> +	if (vcpu->arch.mwp != NULL) {
> +		list_del(&vcpu->arch.mw_link);
> +		vcpu->arch.mwp->accessed = 1;
> +		vcpu->arch.mwp = NULL;
> +	}
> +
> +	gva = kvm_register_read(vcpu, VCPU_REGS_RAX);
> +	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
> +	if (gpa == UNMAPPED_GVA)
> +		goto out;       /* let some write op map the page first */
> +
> +	/* does the mwait page we're looking for already exist? */
> +	list_for_each_entry(p, &vcpu->kvm->arch.mwait_pg_list, link)
> +		if (p->gpa == gpa) {
> +			vcpu->arch.mwp = p;
> +			break;
> +		}
> +	if (vcpu->arch.mwp == NULL) { /* no, add new mwait page */
> +		if (!kvm_mmu_protect_page(vcpu->kvm, gpa_to_gfn(gpa)))
> +			goto out;
> +		p = kmalloc(sizeof(struct kvm_mwait_pg), GFP_KERNEL);
> +		p->gpa = gpa;
> +		INIT_LIST_HEAD(&p->vcpu_list);
> +		list_add(&p->link, &vcpu->kvm->arch.mwait_pg_list);
> +
> +		vcpu->arch.mwp = p;
> +	}
> +
> +	/* link this VCPU into list of VCPUs monitoring this mwait page */
> +	list_add(&vcpu->arch.mw_link, &vcpu->arch.mwp->vcpu_list);
> +
> +out:
> +	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
> +	return 1;
> +}
> +EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
> +
> +int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
> +{
> +	/* emulate as NOP if no-kvm-irqchip */
> +	if (!irqchip_in_kernel(vcpu->kvm))
> +		return 1;
> +
> +	mutex_lock(&vcpu->kvm->arch.mwait_lock);
> +	if (vcpu->arch.mwp != NULL)
> +		vcpu->arch.mp_state = KVM_MP_STATE_MWAIT;
> +	mutex_unlock(&vcpu->kvm->arch.mwait_lock);
> +	return 1;
> +}
> +EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
> +
>  int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
>  {
>  	u64 param, ingpa, outgpa, ret;
> @@ -6077,6 +6141,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>  			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
>  				kvm_apic_accept_events(vcpu);
>  				switch(vcpu->arch.mp_state) {
> +				case KVM_MP_STATE_MWAIT:
>  				case KVM_MP_STATE_HALTED:
>  					vcpu->arch.pv.pv_unhalted = false;
>  					vcpu->arch.mp_state =
> @@ -6961,6 +7026,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>  	kvm_async_pf_hash_reset(vcpu);
>  	kvm_pmu_init(vcpu);
>  
> +	vcpu->arch.mwp = NULL;
> +
>  	return 0;
>  fail_free_wbinvd_dirty_mask:
>  	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
> @@ -7013,6 +7080,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  
>  	pvclock_update_vm_gtod_copy(kvm);
>  
> +	mutex_init(&kvm->arch.mwait_lock);
> +	INIT_LIST_HEAD(&kvm->arch.mwait_pg_list);
> +
>  	return 0;
>  }
>  
> @@ -7254,8 +7324,10 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
>  		|| kvm_apic_has_events(vcpu)
>  		|| vcpu->arch.pv.pv_unhalted
>  		|| atomic_read(&vcpu->arch.nmi_queued) ||
> -		(kvm_arch_interrupt_allowed(vcpu) &&
> -		 kvm_cpu_has_interrupt(vcpu));
> +		(kvm_cpu_has_interrupt(vcpu) &&
> +		 (kvm_arch_interrupt_allowed(vcpu) ||
> +		  (vcpu->arch.mp_state == KVM_MP_STATE_MWAIT &&
> +		   kvm_register_read(vcpu, VCPU_REGS_RCX) & 0x01)));
>  }
>  
>  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 932d7f2..a4925fc 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -398,6 +398,7 @@ struct kvm_vapic_addr {
>  #define KVM_MP_STATE_INIT_RECEIVED     2
>  #define KVM_MP_STATE_HALTED            3
>  #define KVM_MP_STATE_SIPI_RECEIVED     4
> +#define KVM_MP_STATE_MWAIT             5
>  
>  struct kvm_mp_state {
>  	__u32 mp_state;