Cc Eduardo,

2017-11-27 9:32 GMT+08:00 Wanpeng Li <kernellwp@xxxxxxxxx>:
> 2017-11-25 21:09 GMT+08:00 Jan H. Schönherr <jschoenh@xxxxxxxxx>:
>> If host CPUs are dedicated to a VM, we can avoid VM exits on HLT,
>> reducing the wake-up latency on posted interrupts.
>>
>> This reintroduces a feature that has been there at some point --
>> see Linux 3.4 commit 10166744b80a ("KVM: VMX: remove yield_on_hlt")
>> for the removal -- but with the additional ability to enable it only
>> for selected VMs (and supporting SVM as well).
>>
>> Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx>
>
> This patch results in guest hang during boot. Just the same when I
> tried to avoid VM exits on HLT several months ago.
> http://www.spinics.net/lists/kvm/msg152397.html

https://lkml.org/lkml/2017/11/6/863

With PV_DEDICATED, the guest keeps using qspinlocks instead of switching
to pvspinlocks, and it targets the same scenario as this patch ("host
CPUs are dedicated to a VM"). I think this can work around the issue I
mentioned. (See the two sketches at the end of this mail: one for
enabling the new capability from userspace, one for the PV_DEDICATED
idea.)

>
> Regards,
> Wanpeng Li
>
>> ---
>> Note: AMD code paths are only compile tested
>> ---
>>  Documentation/virtual/kvm/api.txt | 12 +++++++++++-
>>  arch/x86/include/asm/kvm_host.h   |  1 +
>>  arch/x86/kvm/svm.c                |  3 ++-
>>  arch/x86/kvm/vmx.c                | 33 +++++++++++++++++++++++++++------
>>  arch/x86/kvm/x86.c                |  5 +++++
>>  arch/x86/kvm/x86.h                |  5 +++++
>>  include/uapi/linux/kvm.h          |  1 +
>>  7 files changed, 52 insertions(+), 8 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 0ee812c..c06bb41 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -4172,7 +4172,17 @@ Returns: 0 on success
>>  This capability indicates that a guest using memory monitoring instructions
>>  (MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time
>>  spent while a virtual CPU is halted in this way will then be accounted for as
>> -guest running time on the host (as opposed to e.g. HLT).
>> +guest running time on the host.
>> +
>> +7.14 KVM_CAP_X86_GUEST_HLT
>> +
>> +Architectures: x86
>> +Parameters: none
>> +Returns: 0 on success
>> +
>> +This capability indicates that a guest using HLT to stop a virtual CPU will not
>> +cause a VM exit. As such, time spent while a virtual CPU is halted in this way
>> +will then be accounted for as guest running time on the host.
>>
>>  8. Other capabilities.
>>  ----------------------
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index f7bcfaa..3197c2d 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -781,6 +781,7 @@ struct kvm_arch {
>>
>>  	gpa_t wall_clock;
>>
>> +	bool hlt_in_guest;
>>  	bool mwait_in_guest;
>>
>>  	bool ept_identity_pagetable_done;
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index ef1b320..c135b98 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -1236,7 +1236,6 @@ static void init_vmcb(struct vcpu_svm *svm)
>>  	set_intercept(svm, INTERCEPT_RDPMC);
>>  	set_intercept(svm, INTERCEPT_CPUID);
>>  	set_intercept(svm, INTERCEPT_INVD);
>> -	set_intercept(svm, INTERCEPT_HLT);
>>  	set_intercept(svm, INTERCEPT_INVLPG);
>>  	set_intercept(svm, INTERCEPT_INVLPGA);
>>  	set_intercept(svm, INTERCEPT_IOIO_PROT);
>> @@ -1257,6 +1256,8 @@ static void init_vmcb(struct vcpu_svm *svm)
>>  		set_intercept(svm, INTERCEPT_MONITOR);
>>  		set_intercept(svm, INTERCEPT_MWAIT);
>>  	}
>> +	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
>> +		set_intercept(svm, INTERCEPT_HLT);
>>
>>  	control->iopm_base_pa = __sme_set(iopm_base);
>>  	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index a067735..1b67433 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -2446,6 +2446,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
>>  	vmx_set_interrupt_shadow(vcpu, 0);
>>  }
>>
>> +static void vmx_set_intr_info(struct kvm_vcpu *vcpu, u32 intr)
>> +{
>> +	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
>> +
>> +	/*
>> +	 * Ensure that we clear the HLT state in the VMCS. We don't need to
>> +	 * explicitly skip the instruction because if the HLT state is set, then
>> +	 * the instruction is already executing and RIP has already been
>> +	 * advanced.
>> +	 */
>> +	if (!kvm_hlt_in_guest(vcpu->kvm) || !(intr & INTR_INFO_VALID_MASK))
>> +		return;
>> +	if (is_external_interrupt(intr) || is_nmi(intr))
>> +		return;
>> +	if (vmcs_read32(GUEST_ACTIVITY_STATE) != GUEST_ACTIVITY_HLT)
>> +		return;
>> +	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
>> +}
>> +
>>  static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
>>  					       unsigned long exit_qual)
>>  {
>> @@ -2540,7 +2559,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
>>  	} else
>>  		intr_info |= INTR_TYPE_HARD_EXCEPTION;
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
>> +	vmx_set_intr_info(vcpu, intr_info);
>>  }
>>
>>  static bool vmx_rdtscp_supported(void)
>> @@ -5298,6 +5317,8 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
>>  	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
>>  		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
>>  				  CPU_BASED_MONITOR_EXITING);
>> +	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
>> +		exec_control &= ~CPU_BASED_HLT_EXITING;
>>  	return exec_control;
>>  }
>>
>> @@ -5635,7 +5656,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>>
>>  	setup_msrs(vmx);
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
>> +	vmx_set_intr_info(vcpu, 0);  /* 22.2.1 */
>>
>>  	if (cpu_has_vmx_tpr_shadow() && !init_event) {
>>  		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
>> @@ -5729,7 +5750,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
>>  			     vmx->vcpu.arch.event_exit_inst_len);
>>  	} else
>>  		intr |= INTR_TYPE_EXT_INTR;
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
>> +	vmx_set_intr_info(vcpu, intr);
>>  }
>>
>>  static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>> @@ -5758,8 +5779,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>>  		return;
>>  	}
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
>> -		     INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
>> +	vmx_set_intr_info(vcpu, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK |
>> +			  NMI_VECTOR);
>>  }
>>
>>  static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
>> @@ -9301,7 +9322,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>>  			   VM_ENTRY_INSTRUCTION_LEN,
>>  			   VM_ENTRY_EXCEPTION_ERROR_CODE);
>>
>> -	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
>> +	vmx_set_intr_info(vcpu, 0);
>>  }
>>
>>  static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index fe6627a..f17c520 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2755,6 +2755,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  	case KVM_CAP_SET_BOOT_CPU_ID:
>>  	case KVM_CAP_SPLIT_IRQCHIP:
>>  	case KVM_CAP_IMMEDIATE_EXIT:
>> +	case KVM_CAP_X86_GUEST_HLT:
>>  		r = 1;
>>  		break;
>>  	case KVM_CAP_ADJUST_CLOCK:
>> @@ -4068,6 +4069,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
>>  			r = 0;
>>  		}
>>  		break;
>> +	case KVM_CAP_X86_GUEST_HLT:
>> +		kvm->arch.hlt_in_guest = true;
>> +		r = 0;
>> +		break;
>>  	default:
>>  		r = -EINVAL;
>>  		break;
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index ed8e150..b2066aa 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -266,4 +266,9 @@ static inline bool kvm_mwait_in_guest(struct kvm *kvm)
>>  	return kvm->arch.mwait_in_guest;
>>  }
>>
>> +static inline bool kvm_hlt_in_guest(struct kvm *kvm)
>> +{
>> +	return kvm->arch.hlt_in_guest;
>> +}
>> +
>>  #endif
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 282d7613..ff8f266 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt {
>>  #define KVM_CAP_HYPERV_SYNIC2 148
>>  #define KVM_CAP_HYPERV_VP_INDEX 149
>>  #define KVM_CAP_S390_AIS_MIGRATION 150
>> +#define KVM_CAP_X86_GUEST_HLT 151
>>
>>  #ifdef KVM_CAP_IRQ_ROUTING
>>
>> --
>> 2.3.1.dirty
>>
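For reference, a minimal, untested userspace sketch of how a VMM would
turn the new capability on. It only uses ioctls that already exist
(KVM_CREATE_VM, KVM_CHECK_EXTENSION, KVM_ENABLE_CAP) plus the
KVM_CAP_X86_GUEST_HLT number from this patch. Note that it has to run
before KVM_CREATE_VCPU, because init_vmcb()/vmx_exec_control() sample
kvm_hlt_in_guest() when a vCPU is set up:

/*
 * Untested sketch: enable HLT-in-guest for a VM (assumes this patch
 * is applied, so linux/kvm.h defines KVM_CAP_X86_GUEST_HLT).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_enable_cap cap;
	int kvm, vm;

	kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}

	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	/* Only enable the capability if this kernel advertises it. */
	if (ioctl(vm, KVM_CHECK_EXTENSION, KVM_CAP_X86_GUEST_HLT) <= 0) {
		fprintf(stderr, "KVM_CAP_X86_GUEST_HLT not supported\n");
		return 1;
	}

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_X86_GUEST_HLT;
	if (ioctl(vm, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP");
		return 1;
	}

	/*
	 * vCPUs created from here on get their intercepts configured
	 * with hlt_in_guest == true, so HLT no longer causes a VM exit.
	 */
	return 0;
}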
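And a rough guest-side sketch of the PV_DEDICATED idea from the thread
linked above: at boot, the guest checks a KVM feature bit and keeps the
native qspinlock instead of switching to the pvspinlock slowpath.
KVM_FEATURE_PV_DEDICATED is only proposed in that RFC, so treat the name
as hypothetical; kvm_para_available()/kvm_para_has_feature() and
KVM_FEATURE_PV_UNHALT are the existing pieces in arch/x86/kernel/kvm.c:

/*
 * Hypothetical sketch only, following the PV_DEDICATED RFC: with
 * dedicated pCPUs, vCPUs are never preempted, so the native qspinlock
 * performs better than the PV slowpath.
 */
static void __init kvm_spinlock_init(void)
{
	if (!kvm_para_available())
		return;
	/* Dedicated pCPUs: stay on the native qspinlock. */
	if (kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED))
		return;
	/* No host support for PV unhalt: nothing to switch to. */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;
	/* ... otherwise hook up the pvspinlock slowpath as today ... */
}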