[PATCH 2/3] KVM: Add capability to not exit on HLT

Jan H. Schönherr <jschoenh@xxxxxxxxx> · Sat, 25 Nov 2017 14:09:32 +0100

If host CPUs are dedicated to a VM, we can avoid VM exits on HLT,
reducing the wake-up latency on posted interrupts.

This reintroduces a feature that existed before -- see Linux 3.4 commit
10166744b80a ("KVM: VMX: remove yield_on_hlt") for its removal -- but
adds the ability to enable it only for selected VMs, and supports SVM
as well.

Signed-off-by: Jan H. Schönherr <jschoenh@xxxxxxxxx>
---
Note: AMD code paths are only compile tested
---
 Documentation/virtual/kvm/api.txt | 12 +++++++++++-
 arch/x86/include/asm/kvm_host.h   |  1 +
 arch/x86/kvm/svm.c                |  3 ++-
 arch/x86/kvm/vmx.c                | 33 +++++++++++++++++++++++++++------
 arch/x86/kvm/x86.c                |  5 +++++
 arch/x86/kvm/x86.h                |  5 +++++
 include/uapi/linux/kvm.h          |  1 +
 7 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 0ee812c..c06bb41 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4172,7 +4172,17 @@ Returns: 0 on success
 This capability indicates that a guest using memory monitoring instructions
 (MWAIT/MWAITX) to stop a virtual CPU will not cause a VM exit. As such, time
 spent while a virtual CPU is halted in this way will then be accounted for as
-guest running time on the host (as opposed to e.g. HLT).
+guest running time on the host.
+
+7.14 KVM_CAP_X86_GUEST_HLT
+
+Architectures: x86
+Parameters: none
+Returns: 0 on success
+
+When enabled on a VM with KVM_ENABLE_CAP, a guest using HLT to stop a virtual
+CPU will not cause a VM exit. As such, time spent while a virtual CPU is halted
+in this way will then be accounted for as guest running time on the host.
 
 8. Other capabilities.
 ----------------------
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f7bcfaa..3197c2d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,6 +781,7 @@ struct kvm_arch {
 
 	gpa_t wall_clock;
 
+	bool hlt_in_guest;
 	bool mwait_in_guest;
 
 	bool ept_identity_pagetable_done;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef1b320..c135b98 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1236,7 +1236,6 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_RDPMC);
 	set_intercept(svm, INTERCEPT_CPUID);
 	set_intercept(svm, INTERCEPT_INVD);
-	set_intercept(svm, INTERCEPT_HLT);
 	set_intercept(svm, INTERCEPT_INVLPG);
 	set_intercept(svm, INTERCEPT_INVLPGA);
 	set_intercept(svm, INTERCEPT_IOIO_PROT);
@@ -1257,6 +1256,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 		set_intercept(svm, INTERCEPT_MONITOR);
 		set_intercept(svm, INTERCEPT_MWAIT);
 	}
+	if (!kvm_hlt_in_guest(svm->vcpu.kvm))
+		set_intercept(svm, INTERCEPT_HLT);
 
 	control->iopm_base_pa = __sme_set(iopm_base);
 	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a067735..1b67433 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2446,6 +2446,25 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
+static void vmx_set_intr_info(struct kvm_vcpu *vcpu, u32 intr)
+{
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+
+	/*
+	 * The VMCS may hold the HLT activity state when HLT does not exit.
+	 * Switch it back to active for any valid event other than an external
+	 * interrupt or NMI.  Skipping the instruction is not needed: if the
+	 * HLT state is set, HLT is already executing and RIP was advanced.
+	 */
+	if (!kvm_hlt_in_guest(vcpu->kvm) || !(intr & INTR_INFO_VALID_MASK))
+		return;
+	if (is_external_interrupt(intr) || is_nmi(intr))
+		return;
+	if (vmcs_read32(GUEST_ACTIVITY_STATE) != GUEST_ACTIVITY_HLT)
+		return;
+	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
 					       unsigned long exit_qual)
 {
@@ -2540,7 +2559,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	} else
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
 
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+	vmx_set_intr_info(vcpu, intr_info);
 }
 
 static bool vmx_rdtscp_supported(void)
@@ -5298,6 +5317,8 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
 		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
 				  CPU_BASED_MONITOR_EXITING);
+	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
+		exec_control &= ~CPU_BASED_HLT_EXITING;
 	return exec_control;
 }
 
@@ -5635,7 +5656,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	setup_msrs(vmx);
 
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
+	vmx_set_intr_info(vcpu, 0);  /* 22.2.1 */
 
 	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
@@ -5729,7 +5750,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 			     vmx->vcpu.arch.event_exit_inst_len);
 	} else
 		intr |= INTR_TYPE_EXT_INTR;
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+	vmx_set_intr_info(vcpu, intr);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -5758,8 +5779,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 		return;
 	}
 
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+	vmx_set_intr_info(vcpu, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK |
+				NMI_VECTOR);
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -9301,7 +9322,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 				  VM_ENTRY_INSTRUCTION_LEN,
 				  VM_ENTRY_EXCEPTION_ERROR_CODE);
 
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+	vmx_set_intr_info(vcpu, 0);
 }
 
 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe6627a..f17c520 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2755,6 +2755,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_BOOT_CPU_ID:
  	case KVM_CAP_SPLIT_IRQCHIP:
 	case KVM_CAP_IMMEDIATE_EXIT:
+	case KVM_CAP_X86_GUEST_HLT:
 		r = 1;
 		break;
 	case KVM_CAP_ADJUST_CLOCK:
@@ -4068,6 +4069,10 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			r = 0;
 		}
 		break;
+	case KVM_CAP_X86_GUEST_HLT:
+		kvm->arch.hlt_in_guest = true;
+		r = 0;
+		break;
 	default:
 		r = -EINVAL;
 		break;
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index ed8e150..b2066aa 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -266,4 +266,9 @@ static inline bool kvm_mwait_in_guest(struct kvm *kvm)
 	return kvm->arch.mwait_in_guest;
 }
 
+static inline bool kvm_hlt_in_guest(struct kvm *kvm)
+{
+	return kvm->arch.hlt_in_guest; /* set via KVM_CAP_X86_GUEST_HLT */
+}
+
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 282d7613..ff8f266 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -932,6 +932,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
 #define KVM_CAP_S390_AIS_MIGRATION 150
+#define KVM_CAP_X86_GUEST_HLT 151
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.3.1.dirty