On Mon, Nov 27, 2017 at 09:52:24PM +0100, Paolo Bonzini wrote: > On 27/11/2017 21:50, Michael S. Tsirkin wrote: > > On Sat, Nov 25, 2017 at 02:09:31PM +0100, Jan H. Schönherr wrote: > >> Allowing a guest to execute MWAIT without interception enables a guest > >> to put a (physical) CPU into a power saving state, where it takes > >> longer to return from than what may be desired by the host. > >> > >> Don't give a guest that power over a host by default. (Especially, > >> since nothing prevents a guest from using MWAIT even when it is not > >> advertised via CPUID.) > >> > >> This restores the behavior from before Linux 4.12 commit 668fffa3f838 > >> ("kvm: better MWAIT emulation for guests") but keeps the option to > >> enable MWAIT in guest for individual VMs. > >> > >> Suggested-by: KarimAllah Ahmed <karahmed@xxxxxxxxx> > >> Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx> > > > > I don't think we should play with the defaults like this, userspace has > > no way to detect what is supported and what is the > > actual value. > > > > I think we should either leave the default alone, or remove this > > capability and add a new one. > Are there any users of KVM_CHECK_EXTENSION(KVM_CAP_X86_GUEST_MWAIT)? Not that I know of, though there's no sure way to find out. > Because if not it's all academic... > > Paolo Presumably there will be. How will they know what is the actual default value? 
> > > > > > > >> --- > >> Note: AMD code paths are only compile tested > >> --- > >> Documentation/virtual/kvm/api.txt | 20 ++++++++++-------- > >> arch/x86/include/asm/kvm_host.h | 2 ++ > >> arch/x86/kvm/svm.c | 2 +- > >> arch/x86/kvm/vmx.c | 9 ++++---- > >> arch/x86/kvm/x86.c | 44 ++++++++++++++++++++++++++++++++++++++- > >> arch/x86/kvm/x86.h | 35 ++----------------------------- > >> 6 files changed, 64 insertions(+), 48 deletions(-) > >> > >> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > >> index f670e4b..0ee812c 100644 > >> --- a/Documentation/virtual/kvm/api.txt > >> +++ b/Documentation/virtual/kvm/api.txt > >> @@ -4163,6 +4163,17 @@ enables QEMU to build error log and branch to guest kernel registered > >> machine check handling routine. Without this capability KVM will > >> branch to guests' 0x200 interrupt vector. > >> > >> +7.13 KVM_CAP_X86_GUEST_MWAIT > >> + > >> +Architectures: x86 > >> +Parameters: none > >> +Returns: 0 on success > >> + > >> +This capability indicates that a guest using memory monitoring instructions > >> +(MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time > >> +spent while a virtual CPU is halted in this way will then be accounted for as > >> +guest running time on the host (as opposed to e.g. HLT). > >> + > >> 8. Other capabilities. > >> ---------------------- > >> > >> @@ -4275,15 +4286,6 @@ reserved. > >> Both registers and addresses are 64-bits wide. > >> It will be possible to run 64-bit or 32-bit guest code. > >> > >> -8.8 KVM_CAP_X86_GUEST_MWAIT > >> - > >> -Architectures: x86 > >> - > >> -This capability indicates that guest using memory monotoring instructions > >> -(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time > >> -spent while virtual CPU is halted in this way will then be accounted for as > >> -guest running time on the host (as opposed to e.g. HLT). 
> >> - > >> 8.9 KVM_CAP_ARM_USER_IRQ > >> > >> Architectures: arm, arm64 > >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > >> index b97726e..f7bcfaa 100644 > >> --- a/arch/x86/include/asm/kvm_host.h > >> +++ b/arch/x86/include/asm/kvm_host.h > >> @@ -781,6 +781,8 @@ struct kvm_arch { > >> > >> gpa_t wall_clock; > >> > >> + bool mwait_in_guest; > >> + > >> bool ept_identity_pagetable_done; > >> gpa_t ept_identity_map_addr; > >> > >> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > >> index 1f3e7f2..ef1b320 100644 > >> --- a/arch/x86/kvm/svm.c > >> +++ b/arch/x86/kvm/svm.c > >> @@ -1253,7 +1253,7 @@ static void init_vmcb(struct vcpu_svm *svm) > >> set_intercept(svm, INTERCEPT_WBINVD); > >> set_intercept(svm, INTERCEPT_XSETBV); > >> > >> - if (!kvm_mwait_in_guest()) { > >> + if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { > >> set_intercept(svm, INTERCEPT_MONITOR); > >> set_intercept(svm, INTERCEPT_MWAIT); > >> } > >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >> index 1eb7053..a067735 100644 > >> --- a/arch/x86/kvm/vmx.c > >> +++ b/arch/x86/kvm/vmx.c > >> @@ -3635,13 +3635,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) > >> CPU_BASED_USE_IO_BITMAPS | > >> CPU_BASED_MOV_DR_EXITING | > >> CPU_BASED_USE_TSC_OFFSETING | > >> + CPU_BASED_MWAIT_EXITING | > >> + CPU_BASED_MONITOR_EXITING | > >> CPU_BASED_INVLPG_EXITING | > >> CPU_BASED_RDPMC_EXITING; > >> > >> - if (!kvm_mwait_in_guest()) > >> - min |= CPU_BASED_MWAIT_EXITING | > >> - CPU_BASED_MONITOR_EXITING; > >> - > >> opt = CPU_BASED_TPR_SHADOW | > >> CPU_BASED_USE_MSR_BITMAPS | > >> CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > >> @@ -5297,6 +5295,9 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) > >> exec_control |= CPU_BASED_CR3_STORE_EXITING | > >> CPU_BASED_CR3_LOAD_EXITING | > >> CPU_BASED_INVLPG_EXITING; > >> + if (kvm_mwait_in_guest(vmx->vcpu.kvm)) > >> + exec_control &= ~(CPU_BASED_MWAIT_EXITING | > >> + CPU_BASED_MONITOR_EXITING); > >> 
return exec_control; > >> } > >> > >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >> index 985a305..fe6627a 100644 > >> --- a/arch/x86/kvm/x86.c > >> +++ b/arch/x86/kvm/x86.c > >> @@ -67,6 +67,7 @@ > >> #include <asm/pvclock.h> > >> #include <asm/div64.h> > >> #include <asm/irq_remapping.h> > >> +#include <asm/mwait.h> > >> > >> #define CREATE_TRACE_POINTS > >> #include "trace.h" > >> @@ -2672,6 +2673,40 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, > >> return r; > >> } > >> > >> +static bool kvm_mwait_in_guest_possible(void) > >> +{ > >> + unsigned int eax, ebx, ecx, edx; > >> + > >> + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) > >> + return false; > >> + > >> + switch (boot_cpu_data.x86_vendor) { > >> + case X86_VENDOR_AMD: > >> + /* All AMD CPUs have a working MWAIT implementation */ > >> + return true; > >> + case X86_VENDOR_INTEL: > >> + /* Handle Intel below */ > >> + break; > >> + default: > >> + return false; > >> + } > >> + > >> + /* > >> + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as > >> + * they would allow guest to stop the CPU completely by disabling > >> + * interrupts then invoking MWAIT. 
> >> + */ > >> + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) > >> + return false; > >> + > >> + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); > >> + > >> + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) > >> + return false; > >> + > >> + return true; > >> +} > >> + > >> int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > >> { > >> int r; > >> @@ -2726,7 +2761,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > >> r = KVM_CLOCK_TSC_STABLE; > >> break; > >> case KVM_CAP_X86_GUEST_MWAIT: > >> - r = kvm_mwait_in_guest(); > >> + r = kvm_mwait_in_guest_possible(); > >> break; > >> case KVM_CAP_X86_SMM: > >> /* SMBASE is usually relocated above 1M on modern chipsets, > >> @@ -4026,6 +4061,13 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, > >> > >> r = 0; > >> break; > >> + case KVM_CAP_X86_GUEST_MWAIT: > >> + r = -EINVAL; > >> + if (kvm_mwait_in_guest_possible()) { > >> + kvm->arch.mwait_in_guest = true; > >> + r = 0; > >> + } > >> + break; > >> default: > >> r = -EINVAL; > >> break; > >> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > >> index d0b95b7..ed8e150 100644 > >> --- a/arch/x86/kvm/x86.h > >> +++ b/arch/x86/kvm/x86.h > >> @@ -2,8 +2,6 @@ > >> #ifndef ARCH_X86_KVM_X86_H > >> #define ARCH_X86_KVM_X86_H > >> > >> -#include <asm/processor.h> > >> -#include <asm/mwait.h> > >> #include <linux/kvm_host.h> > >> #include <asm/pvclock.h> > >> #include "kvm_cache_regs.h" > >> @@ -263,38 +261,9 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) > >> __rem; \ > >> }) > >> > >> -static inline bool kvm_mwait_in_guest(void) > >> +static inline bool kvm_mwait_in_guest(struct kvm *kvm) > >> { > >> - unsigned int eax, ebx, ecx, edx; > >> - > >> - if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) > >> - return false; > >> - > >> - switch (boot_cpu_data.x86_vendor) { > >> - case X86_VENDOR_AMD: > >> - /* All AMD CPUs have a working MWAIT implementation */ > >> - return true; > >> - case X86_VENDOR_INTEL: > >> - /* Handle 
Intel below */ > >> - break; > >> - default: > >> - return false; > >> - } > >> - > >> - /* > >> - * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as > >> - * they would allow guest to stop the CPU completely by disabling > >> - * interrupts then invoking MWAIT. > >> - */ > >> - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) > >> - return false; > >> - > >> - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); > >> - > >> - if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) > >> - return false; > >> - > >> - return true; > >> + return kvm->arch.mwait_in_guest; > >> } > >> > >> #endif > >> -- > >> 2.3.1.dirty