On Tue, Nov 28, 2017 at 09:34:56AM +0800, Longpeng (Mike) wrote: > > > On 2017/11/28 4:55, Michael S. Tsirkin wrote: > > > On Mon, Nov 27, 2017 at 09:51:27PM +0100, Paolo Bonzini wrote: > >> On 27/11/2017 21:45, Michael S. Tsirkin wrote: > >>> On Sat, Nov 25, 2017 at 02:09:32PM +0100, Jan H. Schönherr wrote: > >>>> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT, > >>>> reducing the wake-up latency on posted interrupts. > >>>> > >>>> This reintroduces a feature that has been there at some point -- > >>>> see Linux 3.4 commit 10166744b80a ("KVM: VMX: remove yield_on_hlt") > >>>> for the removal -- but with the additional ability to enable it only > >>>> for selected VMs (and supporting SVM as well). > >>>> > >>>> Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx> > >>> > >>> > >>> If you are going to do this, why not expose mwait > >>> in the cpuid thus making guests use mwait to halt? > >>> > >>> What are the advantages of doing this using halt specifically? > >> > >> Not all guests use MWAIT, I suppose. > >> > >> Paolo > > > > In that case, it would be nice to document which guests of interest > > don't. E.g. I don't think there are still supported versions of RHEL > > that don't use MWAIT. > > > > > Some old kernels, E.g. my kernel is 3.10.0-514 based on RHEL 7.3, don't use > MWAIT if the idle-driver is not supported (we can see > "/sys/devices/system/cpu/cpuidle/current_driver" in guest is "none"). This is exactly what I was saying. MWAIT is supported by this guest - the reason the idle driver is not used is that QEMU does not enable the MWAIT flag in CPUID. You just need a QEMU patch to expose that flag in CPUID. However, if MWAIT is deliberately disabled to avoid the guest entering low-power (PM) states, then maybe it makes sense to allow a non-exiting HLT instead. Please add that reasoning to the commit log as the motivation. > So the > idle routine will use the kernel's default routine. 
> > The default idle routine is selected when starting, > > old kernel: > ''' > select_idle_routine() > if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { > x86_idle = amd_e400_idle; > } else > x86_idle = default_idle; > ''' > > newer kernel: > ''' > select_idle_routine() > if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { > pr_info("using AMD E400 aware idle routine\n"); > x86_idle = amd_e400_idle; > } else if (prefer_mwait_c1_over_halt(c)) { > pr_info("using mwait in idle threads\n"); > x86_idle = mwait_idle; > } else > x86_idle = default_idle; > ''' > > So, some old guests don't use MWAIT as default idle routine. > > > > > > >>> > >>>> --- > >>>> Note: AMD code paths are only compile tested > >>>> --- > >>>> Documentation/virtual/kvm/api.txt | 12 +++++++++++- > >>>> arch/x86/include/asm/kvm_host.h | 1 + > >>>> arch/x86/kvm/svm.c | 3 ++- > >>>> arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++++++++------ > >>>> arch/x86/kvm/x86.c | 5 +++++ > >>>> arch/x86/kvm/x86.h | 5 +++++ > >>>> include/uapi/linux/kvm.h | 1 + > >>>> 7 files changed, 52 insertions(+), 8 deletions(-) > >>>> > >>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > >>>> index 0ee812c..c06bb41 100644 > >>>> --- a/Documentation/virtual/kvm/api.txt > >>>> +++ b/Documentation/virtual/kvm/api.txt > >>>> @@ -4172,7 +4172,17 @@ Returns: 0 on success > >>>> This capability indicates that a guest using memory monitoring instructions > >>>> (MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time > >>>> spent while a virtual CPU is halted in this way will then be accounted for as > >>>> -guest running time on the host (as opposed to e.g. HLT). > >>>> +guest running time on the host. > >>>> + > >>>> +7.14 KVM_CAP_X86_GUEST_HLT > >>>> + > >>>> +Architectures: x86 > >>>> +Parameters: none > >>>> +Returns: 0 on success > >>>> + > >>>> +This capability indicates that a guest using HLT to stop a virtual CPU will not > >>>> +cause a VM exit. 
As such, time spent while a virtual CPU is halted in this way > >>>> +will then be accounted for as guest running time on the host. > >>>> > >>>> 8. Other capabilities. > >>>> ---------------------- > >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > >>>> index f7bcfaa..3197c2d 100644 > >>>> --- a/arch/x86/include/asm/kvm_host.h > >>>> +++ b/arch/x86/include/asm/kvm_host.h > >>>> @@ -781,6 +781,7 @@ struct kvm_arch { > >>>> > >>>> gpa_t wall_clock; > >>>> > >>>> + bool hlt_in_guest; > >>>> bool mwait_in_guest; > >>>> > >>>> bool ept_identity_pagetable_done; > >>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c > >>>> index ef1b320..c135b98 100644 > >>>> --- a/arch/x86/kvm/svm.c > >>>> +++ b/arch/x86/kvm/svm.c > >>>> @@ -1236,7 +1236,6 @@ static void init_vmcb(struct vcpu_svm *svm) > >>>> set_intercept(svm, INTERCEPT_RDPMC); > >>>> set_intercept(svm, INTERCEPT_CPUID); > >>>> set_intercept(svm, INTERCEPT_INVD); > >>>> - set_intercept(svm, INTERCEPT_HLT); > >>>> set_intercept(svm, INTERCEPT_INVLPG); > >>>> set_intercept(svm, INTERCEPT_INVLPGA); > >>>> set_intercept(svm, INTERCEPT_IOIO_PROT); > >>>> @@ -1257,6 +1256,8 @@ static void init_vmcb(struct vcpu_svm *svm) > >>>> set_intercept(svm, INTERCEPT_MONITOR); > >>>> set_intercept(svm, INTERCEPT_MWAIT); > >>>> } > >>>> + if (!kvm_hlt_in_guest(svm->vcpu.kvm)) > >>>> + set_intercept(svm, INTERCEPT_HLT); > >>>> > >>>> control->iopm_base_pa = __sme_set(iopm_base); > >>>> control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); > >>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > >>>> index a067735..1b67433 100644 > >>>> --- a/arch/x86/kvm/vmx.c > >>>> +++ b/arch/x86/kvm/vmx.c > >>>> @@ -2446,6 +2446,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) > >>>> vmx_set_interrupt_shadow(vcpu, 0); > >>>> } > >>>> > >>>> +static void vmx_set_intr_info(struct kvm_vcpu *vcpu, u32 intr) > >>>> +{ > >>>> + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); > >>>> + > >>>> + /* 
> >>>> + * Ensure that we clear the HLT state in the VMCS. We don't need to > >>>> + * explicitly skip the instruction because if the HLT state is set, then > >>>> + * the instruction is already executing and RIP has already been > >>>> + * advanced. > >>>> + */ > >>>> + if (!kvm_hlt_in_guest(vcpu->kvm) || !(intr & INTR_INFO_VALID_MASK)) > >>>> + return; > >>>> + if (is_external_interrupt(intr) || is_nmi(intr)) > >>>> + return; > >>>> + if (vmcs_read32(GUEST_ACTIVITY_STATE) != GUEST_ACTIVITY_HLT) > >>>> + return; > >>>> + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); > >>>> +} > >>>> + > >>>> static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, > >>>> unsigned long exit_qual) > >>>> { > >>>> @@ -2540,7 +2559,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) > >>>> } else > >>>> intr_info |= INTR_TYPE_HARD_EXCEPTION; > >>>> > >>>> - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); > >>>> + vmx_set_intr_info(vcpu, intr_info); > >>>> } > >>>> > >>>> static bool vmx_rdtscp_supported(void) > >>>> @@ -5298,6 +5317,8 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) > >>>> if (kvm_mwait_in_guest(vmx->vcpu.kvm)) > >>>> exec_control &= ~(CPU_BASED_MWAIT_EXITING | > >>>> CPU_BASED_MONITOR_EXITING); > >>>> + if (kvm_hlt_in_guest(vmx->vcpu.kvm)) > >>>> + exec_control &= ~CPU_BASED_HLT_EXITING; > >>>> return exec_control; > >>>> } > >>>> > >>>> @@ -5635,7 +5656,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > >>>> > >>>> setup_msrs(vmx); > >>>> > >>>> - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ > >>>> + vmx_set_intr_info(vcpu, 0); /* 22.2.1 */ > >>>> > >>>> if (cpu_has_vmx_tpr_shadow() && !init_event) { > >>>> vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); > >>>> @@ -5729,7 +5750,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) > >>>> vmx->vcpu.arch.event_exit_inst_len); > >>>> } else > >>>> intr |= INTR_TYPE_EXT_INTR; > >>>> - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); > >>>> + 
vmx_set_intr_info(vcpu, intr); > >>>> } > >>>> > >>>> static void vmx_inject_nmi(struct kvm_vcpu *vcpu) > >>>> @@ -5758,8 +5779,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) > >>>> return; > >>>> } > >>>> > >>>> - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, > >>>> - INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); > >>>> + vmx_set_intr_info(vcpu, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | > >>>> + NMI_VECTOR); > >>>> } > >>>> > >>>> static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) > >>>> @@ -9301,7 +9322,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) > >>>> VM_ENTRY_INSTRUCTION_LEN, > >>>> VM_ENTRY_EXCEPTION_ERROR_CODE); > >>>> > >>>> - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); > >>>> + vmx_set_intr_info(vcpu, 0); > >>>> } > >>>> > >>>> static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) > >>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > >>>> index fe6627a..f17c520 100644 > >>>> --- a/arch/x86/kvm/x86.c > >>>> +++ b/arch/x86/kvm/x86.c > >>>> @@ -2755,6 +2755,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > >>>> case KVM_CAP_SET_BOOT_CPU_ID: > >>>> case KVM_CAP_SPLIT_IRQCHIP: > >>>> case KVM_CAP_IMMEDIATE_EXIT: > >>>> + case KVM_CAP_X86_GUEST_HLT: > >>>> r = 1; > >>>> break; > >>>> case KVM_CAP_ADJUST_CLOCK: > >>>> @@ -4068,6 +4069,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, > >>>> r = 0; > >>>> } > >>>> break; > >>>> + case KVM_CAP_X86_GUEST_HLT: > >>>> + kvm->arch.hlt_in_guest = true; > >>>> + r = 0; > >>>> + break; > >>>> default: > >>>> r = -EINVAL; > >>>> break; > >>>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > >>>> index ed8e150..b2066aa 100644 > >>>> --- a/arch/x86/kvm/x86.h > >>>> +++ b/arch/x86/kvm/x86.h > >>>> @@ -266,4 +266,9 @@ static inline bool kvm_mwait_in_guest(struct kvm *kvm) > >>>> return kvm->arch.mwait_in_guest; > >>>> } > >>>> > >>>> +static inline bool kvm_hlt_in_guest(struct kvm *kvm) > >>>> +{ > >>>> + return kvm->arch.hlt_in_guest; > >>>> +} > >>>> + 
> >>>> #endif > >>>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > >>>> index 282d7613..ff8f266 100644 > >>>> --- a/include/uapi/linux/kvm.h > >>>> +++ b/include/uapi/linux/kvm.h > >>>> @@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt { > >>>> #define KVM_CAP_HYPERV_SYNIC2 148 > >>>> #define KVM_CAP_HYPERV_VP_INDEX 149 > >>>> #define KVM_CAP_S390_AIS_MIGRATION 150 > >>>> +#define KVM_CAP_X86_GUEST_HLT 151 > >>>> > >>>> #ifdef KVM_CAP_IRQ_ROUTING > >>>> > >>>> -- > >>>> 2.3.1.dirty > > > > . > > > > > -- > Regards, > Longpeng(Mike)