[RFC PATCH 4/5] Utilize the vmx preemption timer for tsc deadline timer

From: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>

Utilize the VMX preemption timer for TSC deadline timer
virtualization. The VMX preemption timer is armed while the vCPU is
running, and a VMExit is taken when the virtual TSC deadline timer
expires.
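
For illustration, the kvm_x86_ops->set_hwemul_timer callback invoked
below is not defined in this patch; assuming it only converts the guest
TSC delta using the architectural rate divider in MSR_IA32_VMX_MISC
bits 4:0, a minimal sketch of the vmx side could look like this (the
VMCS field and the divider are architectural, the function itself is an
assumption):

	static void vmx_set_hwemul_timer(struct kvm_vcpu *vcpu, u64 delta_tsc)
	{
		u64 vmx_misc;

		/*
		 * Bits 4:0 of MSR_IA32_VMX_MISC: the preemption timer
		 * counts down at the TSC rate shifted right by this value.
		 */
		rdmsrl(MSR_IA32_VMX_MISC, vmx_misc);

		/*
		 * Sketch only: a real implementation must clamp deltas
		 * that do not fit in the 32-bit VMCS field.
		 */
		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
			     (u32)(delta_tsc >> (vmx_misc & 0x1f)));
		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
			      PIN_BASED_VMX_PREEMPTION_TIMER);
	}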

When the vCPU thread is scheduled out, the TSC deadline timer
virtualization falls back to the current solution, i.e. an hrtimer is
used to emulate it. It is switched back to the VMX preemption timer
when the vCPU thread is scheduled in.
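
In outline, the hw_emulation field added below moves through these
states (all of the transitions are implemented in this patch):

	0 (sw hrtimer)  --start_apic_timer()/sched in-->  HWEMUL_ENABLED
	HWEMUL_ENABLED  --programmed into the VMCS just
	                  before vmentry-->                HWEMUL_INJECTED
	HWEMUL_INJECTED --preemption timer VMexit, expiry
	                  noticed in vcpu_run(), or
	                  sched out-->                     0 (sw hrtimer)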

This solution avoids the complexity of the OS's hrtimer system, and
also the host timer interrupt handling cost, at the price of one
preemption timer VMExit. It fits well for NFV usage scenarios where
the vCPU is bound to a pCPU and the pCPU is isolated, and for similar
setups.

However, it may have a noticeable impact if the vCPU thread is
scheduled in and out very frequently, because it then switches to and
from the hrtimer emulation a lot.

Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
 arch/x86/kvm/lapic.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/lapic.h |  10 +++++
 arch/x86/kvm/vmx.c   |  26 +++++++++++++
 arch/x86/kvm/x86.c   |   6 +++
 4 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 12c416929d9c..c9e32bf1a613 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1320,7 +1320,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 		__delay(tsc_deadline - guest_tsc);
 }
 
-static void start_sw_tscdeadline(struct kvm_lapic *apic)
+static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire)
 {
 	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 	u64 ns = 0;
@@ -1337,7 +1337,8 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 
 	now = apic->lapic_timer.timer.base->get_time();
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-	if (likely(tscdeadline > guest_tsc)) {
+	/* Don't trigger the apic_timer if invoked from sched_out */
+	if (no_expire || likely(tscdeadline > guest_tsc)) {
 		ns = (tscdeadline - guest_tsc) * 1000000ULL;
 		do_div(ns, this_tsc_khz);
 		expire = ktime_add_ns(now, ns);
@@ -1396,9 +1397,110 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		start_sw_tscdeadline(apic);
+		/* lapic timer in tsc deadline mode */
+		if (hw_emul_timer(apic)) {
+			if (unlikely(!apic->lapic_timer.tscdeadline ||
+					!apic->vcpu->arch.virtual_tsc_khz))
+				return;
+
+			/* Expired timer will be checked on vcpu_run() */
+			apic->lapic_timer.hw_emulation = HWEMUL_ENABLED;
+		} else
+			start_sw_tscdeadline(apic, 0);
+	}
+}
+
+void switch_to_hw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic->lapic_timer.hw_emulation)
+		return;
+
+	if (apic_lvtt_tscdeadline(apic) &&
+	    !atomic_read(&apic->lapic_timer.pending)) {
+		hrtimer_cancel(&apic->lapic_timer.timer);
+	/* In case the timer fired in the small window above */
+		if (!atomic_read(&apic->lapic_timer.pending))
+			apic->lapic_timer.hw_emulation = HWEMUL_ENABLED;
+	}
+}
+EXPORT_SYMBOL_GPL(switch_to_hw_lapic_timer);
+
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (!apic->lapic_timer.hw_emulation)
+		return;
+
+	if (apic->lapic_timer.hw_emulation == HWEMUL_INJECTED)
+		kvm_x86_ops->clear_hwemul_timer(vcpu);
+	apic->lapic_timer.hw_emulation = 0;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	/* Don't call apic_timer_expired() directly, to avoid deadlock */
+	start_sw_tscdeadline(apic, 1);
+}
+EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer);
+
+/*
+ * Check the hwemul timer status.
+ * -1: hwemul timer is not enabled
+ * >0: hwemul timer is not expired yet; the return value is the tsc delta
+ *  0: hwemul timer has expired already
+ */
+s64 check_apic_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic->lapic_timer.hw_emulation) {
+		u64 tscdeadline = apic->lapic_timer.tscdeadline;
+		u64 guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+		if (tscdeadline <= guest_tsc)
+			return 0;
+		else
+			return (tscdeadline - guest_tsc);
+	}
+	return -1;
+}
+
+int inject_expired_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+	if (!check_apic_hwemul_timer(vcpu)) {
+		struct kvm_lapic *apic = vcpu->arch.apic;
+
+		if (apic->lapic_timer.hw_emulation == HWEMUL_INJECTED)
+			kvm_x86_ops->clear_hwemul_timer(vcpu);
+		apic->lapic_timer.hw_emulation = 0;
+		atomic_inc(&apic->lapic_timer.pending);
+		kvm_set_pending_timer(vcpu);
+		return 1;
 	}
+
+	return 0;
+}
+
+int inject_pending_hwemul_timer(struct kvm_vcpu *vcpu)
+{
+	s64 hwemultsc;
+
+	hwemultsc = check_apic_hwemul_timer(vcpu);
+	/* Just before vmentry, so arm the timer even if it already expired */
+	if (hwemultsc >= 0) {
+		struct kvm_lapic *apic = vcpu->arch.apic;
+
+		kvm_x86_ops->set_hwemul_timer(vcpu, hwemultsc);
+		apic->lapic_timer.hw_emulation = HWEMUL_INJECTED;
+		return 1;
+	}
+
+	return 0;
 }
+EXPORT_SYMBOL_GPL(inject_pending_hwemul_timer);
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
 {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..5037d7bf609a 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,10 @@
 #define KVM_APIC_SHORT_MASK	0xc0000
 #define KVM_APIC_DEST_MASK	0x800
 
+#define HWEMUL_ENABLED		1
+/* The VMCS has been set for the vmx preemption timer */
+#define HWEMUL_INJECTED		2
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
@@ -20,6 +24,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
+	int hw_emulation;
 };
 
 struct kvm_lapic {
@@ -212,4 +217,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
 			const unsigned long *bitmap, u32 bitmap_size);
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu);
+void switch_to_hw_lapic_timer(struct kvm_vcpu *vcpu);
+s64 check_apic_hwemul_timer(struct kvm_vcpu *vcpu);
+int inject_expired_hwemul_timer(struct kvm_vcpu *vcpu);
+int inject_pending_hwemul_timer(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5475a7699ee5..f3659ab45b30 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7572,6 +7572,23 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic->lapic_timer.hw_emulation != HWEMUL_INJECTED)
+		printk(KERN_WARNING "Preemption timer w/o hwemulation\n");
+
+	if (!atomic_read(&apic->lapic_timer.pending)) {
+		atomic_inc(&apic->lapic_timer.pending);
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	}
+
+	apic->lapic_timer.hw_emulation = 0;
+	vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+			PIN_BASED_VMX_PREEMPTION_TIMER);
+	return 1;
+}
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7623,6 +7640,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8674,6 +8692,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	inject_pending_hwemul_timer(vcpu);
+
 	if (vmx->guest_pkru_valid)
 		__write_pkru(vmx->guest_pkru);
 
@@ -10693,10 +10713,16 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
 	if (ple_gap)
 		shrink_ple_window(vcpu);
+	if (vmx_hwemul_timer(vcpu))
+		switch_to_hw_lapic_timer(vcpu);
 }
 
 static void vmx_sched_out(struct kvm_vcpu *vcpu)
 {
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic->lapic_timer.hw_emulation)
+		switch_to_sw_lapic_timer(vcpu);
 }
 
 static void vmx_slot_enable_log_dirty(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5776473be362..a613bcfda59a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6608,6 +6608,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	local_irq_disable();
 
+	inject_expired_hwemul_timer(vcpu);
+
 	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
 	    || need_resched() || signal_pending(current)) {
 		vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -6773,6 +6775,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			break;
 
 		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+		/* Inject the hwemul timer now if it expired, to avoid one VMExit */
+		inject_expired_hwemul_timer(vcpu);
+
 		if (kvm_cpu_has_pending_timer(vcpu))
 			kvm_inject_pending_timer_irqs(vcpu);
 
-- 
1.8.3.1
