Allowing a guest to execute MWAIT without interception enables a guest
to put a (physical) CPU into a power saving state from which it may
take longer to return than the host desires.

Don't give a guest that power over the host by default. (Especially
since nothing prevents a guest from using MWAIT even when it is not
advertised via CPUID.)

This restores the behavior from before Linux 4.12 commit 668fffa3f838
("kvm: better MWAIT emulation for guests") but keeps the option to
enable MWAIT in the guest for individual VMs.

Suggested-by: KarimAllah Ahmed <karahmed@xxxxxxxxx>
Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx>
---
Note: AMD code paths are only compile tested
---
 Documentation/virtual/kvm/api.txt | 20 ++++++++++--------
 arch/x86/include/asm/kvm_host.h   |  2 ++
 arch/x86/kvm/svm.c                |  2 +-
 arch/x86/kvm/vmx.c                |  9 ++++----
 arch/x86/kvm/x86.c                | 44 ++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.h                | 35 ++-----------------------------
 6 files changed, 64 insertions(+), 48 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index f670e4b..0ee812c 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4163,6 +4163,17 @@ enables QEMU to build error log and branch to guest kernel registered
 machine check handling routine. Without this capability KVM will
 branch to guests' 0x200 interrupt vector.
 
+7.13 KVM_CAP_X86_GUEST_MWAIT
+
+Architectures: x86
+Parameters: none
+Returns: 0 on success
+
+This capability indicates that a guest using memory monitoring instructions
+(MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time
+spent while a virtual CPU is halted in this way will then be accounted for as
+guest running time on the host (as opposed to e.g. HLT).
+
 8. Other capabilities.
 ----------------------
 
@@ -4275,15 +4286,6 @@ reserved.
 Both registers and addresses are 64-bits wide.
 It will be possible to run 64-bit or 32-bit guest code.
 
-8.8 KVM_CAP_X86_GUEST_MWAIT
-
-Architectures: x86
-
-This capability indicates that guest using memory monotoring instructions
-(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time
-spent while virtual CPU is halted in this way will then be accounted for as
-guest running time on the host (as opposed to e.g. HLT).
-
 8.9 KVM_CAP_ARM_USER_IRQ
 
 Architectures: arm, arm64
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b97726e..f7bcfaa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,6 +781,8 @@ struct kvm_arch {
 
 	gpa_t wall_clock;
 
+	bool mwait_in_guest;
+
 	bool ept_identity_pagetable_done;
 	gpa_t ept_identity_map_addr;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f3e7f2..ef1b320 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1253,7 +1253,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_WBINVD);
 	set_intercept(svm, INTERCEPT_XSETBV);
 
-	if (!kvm_mwait_in_guest()) {
+	if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
 		set_intercept(svm, INTERCEPT_MONITOR);
 		set_intercept(svm, INTERCEPT_MWAIT);
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1eb7053..a067735 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3635,13 +3635,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_IO_BITMAPS |
 	      CPU_BASED_MOV_DR_EXITING |
 	      CPU_BASED_USE_TSC_OFFSETING |
+	      CPU_BASED_MWAIT_EXITING |
+	      CPU_BASED_MONITOR_EXITING |
 	      CPU_BASED_INVLPG_EXITING |
 	      CPU_BASED_RDPMC_EXITING;
 
-	if (!kvm_mwait_in_guest())
-		min |= CPU_BASED_MWAIT_EXITING |
-			CPU_BASED_MONITOR_EXITING;
-
 	opt = CPU_BASED_TPR_SHADOW |
 	      CPU_BASED_USE_MSR_BITMAPS |
 	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -5297,6 +5295,9 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 		exec_control |= CPU_BASED_CR3_STORE_EXITING |
 				CPU_BASED_CR3_LOAD_EXITING |
 				CPU_BASED_INVLPG_EXITING;
+	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
+		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
+				  CPU_BASED_MONITOR_EXITING);
 	return exec_control;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 985a305..fe6627a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -67,6 +67,7 @@
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/irq_remapping.h>
+#include <asm/mwait.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -2672,6 +2673,40 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 	return r;
 }
 
+static bool kvm_mwait_in_guest_possible(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
+		return false;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		/* All AMD CPUs have a working MWAIT implementation */
+		return true;
+	case X86_VENDOR_INTEL:
+		/* Handle Intel below */
+		break;
+	default:
+		return false;
+	}
+
+	/*
+	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
+	 * they would allow guest to stop the CPU completely by disabling
+	 * interrupts then invoking MWAIT.
+	 */
+	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+		return false;
+
+	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+	if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
+		return false;
+
+	return true;
+}
+
 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 {
 	int r;
@@ -2726,7 +2761,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_CLOCK_TSC_STABLE;
 		break;
 	case KVM_CAP_X86_GUEST_MWAIT:
-		r = kvm_mwait_in_guest();
+		r = kvm_mwait_in_guest_possible();
 		break;
 	case KVM_CAP_X86_SMM:
 		/* SMBASE is usually relocated above 1M on modern chipsets,
@@ -4026,6 +4061,13 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		r = 0;
 		break;
 
+	case KVM_CAP_X86_GUEST_MWAIT:
+		r = -EINVAL;
+		if (kvm_mwait_in_guest_possible()) {
+			kvm->arch.mwait_in_guest = true;
+			r = 0;
+		}
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d0b95b7..ed8e150 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,8 +2,6 @@
 #ifndef ARCH_X86_KVM_X86_H
 #define ARCH_X86_KVM_X86_H
 
-#include <asm/processor.h>
-#include <asm/mwait.h>
 #include <linux/kvm_host.h>
 #include <asm/pvclock.h>
 #include "kvm_cache_regs.h"
@@ -263,38 +261,9 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
 	    __rem;						\
 	 })
 
-static inline bool kvm_mwait_in_guest(void)
+static inline bool kvm_mwait_in_guest(struct kvm *kvm)
 {
-	unsigned int eax, ebx, ecx, edx;
-
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT))
-		return false;
-
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		/* All AMD CPUs have a working MWAIT implementation */
-		return true;
-	case X86_VENDOR_INTEL:
-		/* Handle Intel below */
-		break;
-	default:
-		return false;
-	}
-
-	/*
-	 * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as
-	 * they would allow guest to stop the CPU completely by disabling
-	 * interrupts then invoking MWAIT.
-	 */
-	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
-		return false;
-
-	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
-
-	if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
-		return false;
-
-	return true;
+	return kvm->arch.mwait_in_guest;
 }
 
 #endif
-- 
2.3.1.dirty
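
P.S.: For anyone wiring this up in userspace, here is a minimal sketch (not
part of the patch, untested) of how a VMM could opt a single VM in to
unintercepted MWAIT using the standard KVM_CHECK_EXTENSION and
KVM_ENABLE_CAP ioctls from <linux/kvm.h>. Error handling and the messages
are placeholders only; a VM that never issues this KVM_ENABLE_CAP keeps the
default behavior (MWAIT/MONITOR exits stay enabled).

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	int kvm_fd, vm_fd;

	kvm_fd = open("/dev/kvm", O_RDWR);
	vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);

	/*
	 * With this patch, KVM_CHECK_EXTENSION reports whether the host
	 * could leave MWAIT unintercepted (kvm_mwait_in_guest_possible()),
	 * not whether it currently does so for any VM.
	 */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_GUEST_MWAIT) <= 0) {
		printf("MWAIT-in-guest not available on this host\n");
		return 0;
	}

	/* Opt this particular VM in; all other VMs keep MWAIT intercepted. */
	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_GUEST_MWAIT;
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) == 0)
		printf("MWAIT will not be intercepted for this VM\n");

	return 0;
}

The enable-cap must be issued before vCPUs are created, since the VMX exec
controls / SVM intercepts are set up from kvm->arch.mwait_in_guest at vCPU
initialization time.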