Cc Eduardo,

2017-11-27 9:32 GMT+08:00 Wanpeng Li <kernellwp@xxxxxxxxx>:
> 2017-11-25 21:09 GMT+08:00 Jan H. Schönherr <jschoenh@xxxxxxxxx>:
>> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT,
>> reducing the wake-up latency on posted interrupts.
>>
>> This reintroduces a feature that has been there at some point --
>> see Linux 3.4 commit 10166744b80a ("KVM: VMX: remove yield_on_hlt")
>> for the removal -- but with the additional ability to enable it only
>> for selected VMs (and supporting SVM as well).
>>
>> Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx>
>
> This patch results in guest hang during boot. Just the same when I
> tried to avoid VM exits on HLT several months ago.
> http://www.spinics.net/lists/kvm/msg152397.html

https://lkml.org/lkml/2017/11/6/863

With PV_DEDICATED, the guest keeps using qspinlocks instead of switching
to pvspinlocks, and it targets the same scenario as this patch ("host
CPUs are dedicated to a VM"). I think this can work around the issue I
mentioned. (See the two sketches at the end of this mail: one for
enabling the new capability from userspace, one for the PV_DEDICATED
idea.)

>
> Regards,
> Wanpeng Li
>
>> ---
>> Note: AMD code paths are only compile tested
>> ---
>>  Documentation/virtual/kvm/api.txt | 12 +++++++++++-
>>  arch/x86/include/asm/kvm_host.h   |  1 +
>>  arch/x86/kvm/svm.c                |  3 ++-
>>  arch/x86/kvm/vmx.c                | 33 +++++++++++++++++++++++++++------
>>  arch/x86/kvm/x86.c                |  5 +++++
>>  arch/x86/kvm/x86.h                |  5 +++++
>>  include/uapi/linux/kvm.h          |  1 +
>>  7 files changed, 52 insertions(+), 8 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 0ee812c..c06bb41 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -4172,7 +4172,17 @@ Returns: 0 on success
>>  This capability indicates that a guest using memory monitoring instructions
>>  (MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time
>>  spent while a virtual CPU is halted in this way will then be accounted for as
>> -guest running time on the host (as opposed to e.g. HLT).
>> +guest running time on the host.
>> +
>> +7.14 KVM_CAP_X86_GUEST_HLT
>> +
>> +Architectures: x86
>> +Parameters: none
>> +Returns: 0 on success
>> +
>> +This capability indicates that a guest using HLT to stop a virtual CPU will not
>> +cause a VM exit. As such, time spent while a virtual CPU is halted in this way
>> +will then be accounted for as guest running time on the host.
>>
>>  8. Other capabilities.
>>  ----------------------
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index f7bcfaa..3197c2d 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -781,6 +781,7 @@ struct kvm_arch {
>>
>>  	gpa_t wall_clock;
>>
>> +	bool hlt_in_guest;
>>  	bool mwait_in_guest;
>>
>>  	bool ept_identity_pagetable_done;
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index ef1b320..c135b98 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -1236,7 +1236,6 @@ static void init_vmcb(struct vcpu_svm *svm)
>>  	set_intercept(svm, INTERCEPT_RDPMC);
>>  	set_intercept(svm, INTERCEPT_CPUID);
>>  	set_intercept(svm, INTERCEPT_INVD);
>> -	set_intercept(svm, INTERCEPT_HLT);
>>  	set_intercept(svm, INTERCEPT_INVLPG);
>>  	set_intercept(svm, INTERCEPT_INVLPGA);
>>  	set_intercept(svm, INTERCEPT_IOIO_PROT);
>> @@ -1257,6 +1256,8 @@ static void init_vmcb(struct vcpu_svm *svm)
>>  		set_intercept(svm, INTERCEPT_MONITOR);
>>  		set_intercept(svm, INTERCEPT_MWAIT);
>>  	}
>> +	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
>> +		set_intercept(svm, INTERCEPT_HLT);
>>
>>  	control->iopm_base_pa = __sme_set(iopm_base);
>>  	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a067735..1b67433 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2446,6 +2446,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
>>  	vmx_set_interrupt_shadow(vcpu, 0);
>>  }
>>
>> +static void vmx_set_intr_info(struct kvm_vcpu *vcpu, u32 intr)
>> +{
>> +	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
>> +
>> +	/*
>> +	 * Ensure that we clear the HLT state in the VMCS. We don't need to
>> +	 * explicitly skip the instruction because if the HLT state is set, then
>> +	 * the instruction is already executing and RIP has already been
>> +	 * advanced.
>> +	 */
>> +	if (!kvm_hlt_in_guest(vcpu->kvm) || !(intr & INTR_INFO_VALID_MASK))
>> +		return;
>> +	if (is_external_interrupt(intr) || is_nmi(intr))
>> +		return;
>> +	if (vmcs_read32(GUEST_ACTIVITY_STATE) != GUEST_ACTIVITY_HLT)
>> +		return;
>> +	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
>> +}
>> +
>>  static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
>>  					       unsigned long exit_qual)
>>  {
>> @@ -2540,7 +2559,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
>>  	} else
>>  		intr_info |= INTR_TYPE_HARD_EXCEPTION;
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
>> +	vmx_set_intr_info(vcpu, intr_info);
>>  }
>>
>>  static bool vmx_rdtscp_supported(void)
>> @@ -5298,6 +5317,8 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
>>  	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
>>  		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
>>  				  CPU_BASED_MONITOR_EXITING);
>> +	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
>> +		exec_control &= ~CPU_BASED_HLT_EXITING;
>>  	return exec_control;
>>  }
>>
>> @@ -5635,7 +5656,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>
>>  	setup_msrs(vmx);
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>> +	vmx_set_intr_info(vcpu, 0);  /* 22.2.1 */
>>
>>  	if (cpu_has_vmx_tpr_shadow() && !init_event) {
>>  		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
>> @@ -5729,7 +5750,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
>>  			     vmx->vcpu.arch.event_exit_inst_len);
>>  	} else
>>  		intr |= INTR_TYPE_EXT_INTR;
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
>> +	vmx_set_intr_info(vcpu, intr);
>>  }
>>
>>  static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>> @@ -5758,8 +5779,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>>  		return;
>>  	}
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
>> -		     INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
>> +	vmx_set_intr_info(vcpu, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK |
>> +			  NMI_VECTOR);
>>  }
>>
>>  static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
>> @@ -9301,7 +9322,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>>  			   VM_ENTRY_INSTRUCTION_LEN,
>>  			   VM_ENTRY_EXCEPTION_ERROR_CODE);
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
>> +	vmx_set_intr_info(vcpu, 0);
>>  }
>>
>>  static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index fe6627a..f17c520 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2755,6 +2755,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  	case KVM_CAP_SET_BOOT_CPU_ID:
>>  	case KVM_CAP_SPLIT_IRQCHIP:
>>  	case KVM_CAP_IMMEDIATE_EXIT:
>> +	case KVM_CAP_X86_GUEST_HLT:
>>  		r = 1;
>>  		break;
>>  	case KVM_CAP_ADJUST_CLOCK:
>> @@ -4068,6 +4069,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>>  			r = 0;
>>  		}
>>  		break;
>> +	case KVM_CAP_X86_GUEST_HLT:
>> +		kvm->arch.hlt_in_guest = true;
>> +		r = 0;
>> +		break;
>>  	default:
>>  		r = -EINVAL;
>>  		break;
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index ed8e150..b2066aa 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -266,4 +266,9 @@ static inline bool kvm_mwait_in_guest(struct kvm *kvm)
>>  	return kvm->arch.mwait_in_guest;
>>  }
>>
>> +static inline bool kvm_hlt_in_guest(struct kvm *kvm)
>> +{
>> +	return kvm->arch.hlt_in_guest;
>> +}
>> +
>>  #endif
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 282d7613..ff8f266 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt {
>>  #define KVM_CAP_HYPERV_SYNIC2 148
>>  #define KVM_CAP_HYPERV_VP_INDEX 149
>>  #define KVM_CAP_S390_AIS_MIGRATION 150
>> +#define KVM_CAP_X86_GUEST_HLT 151
>>
>>  #ifdef KVM_CAP_IRQ_ROUTING
>>
>> --
>> 2.3.1.dirty
>>
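For reference, a minimal, untested userspace sketch of how a VMM would
turn the new capability on. It only uses ioctls that already exist
(KVM_CREATE_VM, KVM_CHECK_EXTENSION, KVM_ENABLE_CAP) plus the
KVM_CAP_X86_GUEST_HLT number from this patch. Note that it has to run
before KVM_CREATE_VCPU, because init_vmcb()/vmx_exec_control() sample
kvm_hlt_in_guest() when a vCPU is set up:

/*
 * Untested sketch: enable HLT-in-guest for a VM (assumes this patch
 * is applied, so linux/kvm.h defines KVM_CAP_X86_GUEST_HLT).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	int kvm, vm;

	kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}

	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	/* Only enable the capability if this kernel advertises it. */
	if (ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_X86_GUEST_HLT) <= 0) {
		fprintf(stderr, "KVM_CAP_X86_GUEST_HLT not supported\n");
		return 1;
	}

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_GUEST_HLT;
	if (ioctl(vm, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP");
		return 1;
	}

	/*
	 * vCPUs created from here on get their intercepts configured
	 * with hlt_in_guest == true, so HLT no longer causes a VM exit.
	 */
	return 0;
}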
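And a rough guest-side sketch of the PV_DEDICATED idea from the thread
linked above: at boot, the guest checks a KVM feature bit and keeps the
native qspinlock instead of switching to the pvspinlock slowpath.
KVM_FEATURE_PV_DEDICATED is only proposed in that RFC, so treat the name
as hypothetical; kvm_para_available()/kvm_para_has_feature() and
KVM_FEATURE_PV_UNHALT are the existing pieces in arch/x86/kernel/kvm.c:

/*
 * Hypothetical sketch only, following the PV_DEDICATED RFC: with
 * dedicated pCPUs, vCPUs are never preempted, so the native qspinlock
 * performs better than the PV slowpath.
 */
static void __init kvm_spinlock_init(void)
{
	if (!kvm_para_available())
		return;
	/* Dedicated pCPUs: stay on the native qspinlock. */
	if (kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED))
		return;
	/* No host support for PV unhalt: nothing to switch to. */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;
	/* ... otherwise hook up the pvspinlock slowpath as today ... */
}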