[RFC PATCH V2 4/4] Utilize the VMX preemption timer for TSC deadline timer

From: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>

Utilize the VMX preemption timer for TSC deadline timer
virtualization. The preemption timer is armed while the vCPU is
running, and a VM exit occurs when the virtual TSC deadline timer
expires.
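
[ Note: this patch only consumes the two new kvm_x86_ops hooks; their
  VMX implementation is added earlier in this series and is not shown
  here.  For reading this patch it is enough to assume shapes roughly
  like the following sketch (names and signatures assumed, not quoted
  from that patch):

	/* Arm the preemption timer for a delta expressed in guest TSC
	 * ticks; cancel_hv_timer() disarms it again. */
	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 delta_tsc);
	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
]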

When the vCPU thread is scheduled out, TSC deadline timer
virtualization falls back to the current solution, i.e. emulation
with an hrtimer. It is switched back to the VMX preemption timer
when the vCPU thread is scheduled in again.
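
[ Note: the switching is tracked with a small per-timer state field
  (hv_timer_state, introduced below).  As an overview, the transitions
  implemented by this patch are:

	/*
	 * NOT_USED     -> NEEDS_ARMING : a TSC deadline is programmed,
	 *                                or the vCPU is scheduled in
	 * NEEDS_ARMING -> ARMED        : vcpu_run() programs the VMX
	 *                                preemption timer
	 * ARMED or NEEDS_ARMING
	 *              -> NOT_USED     : preemption timer VM exit, or
	 *                                the vCPU is scheduled out and
	 *                                the hrtimer takes over
	 */
]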

This solution replaces a trip through the host's hrtimer subsystem,
and the cost of handling a host timer interrupt, with a single
preemption timer VM exit. It fits NFV-style scenarios well, e.g.
where a vCPU is bound to a pCPU and the pCPU is isolated, or similar
setups.

However, it may have a cost if the vCPU thread is scheduled in and
out very frequently, because every switch toggles between the
preemption timer and the hrtimer emulation.

Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
 arch/x86/kvm/lapic.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++---
 arch/x86/kvm/lapic.h |  11 ++++++
 arch/x86/kvm/trace.h |  22 ++++++++++++
 arch/x86/kvm/vmx.c   |   9 ++++++
 arch/x86/kvm/x86.c   |   4 +++
 5 files changed, 145 insertions(+), 5 deletions(-)
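
[ Note: the VMX-side implementation of the hooks is in an earlier
  patch of this series.  As a rough sketch of the mechanism, with
  names assumed rather than quoted from that patch: the preemption
  timer counts down at the host TSC rate divided by 2^N, where N is
  bits 4:0 of MSR_IA32_VMX_MISC, so arming boils down to something
  like:

	static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 delta_tsc)
	{
		u64 misc;
		u32 shift;

		/* Bits 4:0 give the TSC-to-timer-tick shift */
		rdmsrl(MSR_IA32_VMX_MISC, misc);
		shift = misc & 0x1f;

		/* The VMCS timer field is only 32 bits wide */
		if (delta_tsc >> shift > U32_MAX)
			return -ERANGE;

		vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
			     (u32)(delta_tsc >> shift));
		vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
			      PIN_BASED_VMX_PREEMPTION_TIMER);
		return 0;
	}
]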

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1cf8a5ede11..93679db7ce0f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1313,7 +1313,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 		__delay(tsc_deadline - guest_tsc);
 }
 
-static void start_sw_tscdeadline(struct kvm_lapic *apic)
+static void start_sw_tscdeadline(struct kvm_lapic *apic, int no_expire)
 {
 	u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
 	u64 ns = 0;
@@ -1330,7 +1330,10 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 
 	now = apic->lapic_timer.timer.base->get_time();
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
-	if (likely(tscdeadline > guest_tsc)) {
-		ns = (tscdeadline - guest_tsc) * 1000000ULL;
-		do_div(ns, this_tsc_khz);
+	if (no_expire || likely(tscdeadline > guest_tsc)) {
+		/* An already-passed deadline (ns == 0) fires at once */
+		if (likely(tscdeadline > guest_tsc)) {
+			ns = (tscdeadline - guest_tsc) * 1000000ULL;
+			do_div(ns, this_tsc_khz);
+		}
 		expire = ktime_add_ns(now, ns);
@@ -1343,6 +1346,85 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
 	local_irq_restore(flags);
 }
 
+void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	u64 tscdeadline, guest_tsc;
+
+	if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
+		return;
+
+	tscdeadline = apic->lapic_timer.tscdeadline;
+	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+
+	if (tscdeadline >= guest_tsc)
+		kvm_x86_ops->set_hv_timer(vcpu, tscdeadline - guest_tsc);
+	else
+		kvm_x86_ops->set_hv_timer(vcpu, 0);
+
+	apic->lapic_timer.hv_timer_state = HV_TIMER_ARMED;
+	trace_kvm_hv_timer_state(vcpu->vcpu_id,
+			apic->lapic_timer.hv_timer_state);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_arm_hv_timer);
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_ARMED);
+	WARN_ON(swait_active(&vcpu->wq));
+	kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
+	apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	WARN_ON(apic->lapic_timer.hv_timer_state != HV_TIMER_NOT_USED);
+
+	if (apic_lvtt_tscdeadline(apic) &&
+	    !atomic_read(&apic->lapic_timer.pending)) {
+		hrtimer_cancel(&apic->lapic_timer.timer);
+		/* In case the timer fired between the check and the cancel */
+		if (!atomic_read(&apic->lapic_timer.pending)) {
+			apic->lapic_timer.hv_timer_state =
+				HV_TIMER_NEEDS_ARMING;
+			trace_kvm_hv_timer_state(vcpu->vcpu_id,
+					apic->lapic_timer.hv_timer_state);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(switch_to_hv_lapic_timer);
+
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	/* The TSC deadline timer may not be in use at all */
+	if (apic->lapic_timer.hv_timer_state == HV_TIMER_NOT_USED)
+		return;
+
+	if (apic->lapic_timer.hv_timer_state == HV_TIMER_ARMED)
+		kvm_x86_ops->cancel_hv_timer(vcpu);
+	apic->lapic_timer.hv_timer_state = HV_TIMER_NOT_USED;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	/*
+	 * Don't call apic_timer_expired() here: the swake_up() it
+	 * performs would try to take the run queue lock, which this
+	 * CPU already holds because we are in the middle of a
+	 * context switch.
+	 */
+	start_sw_tscdeadline(apic, 1);
+}
+EXPORT_SYMBOL_GPL(switch_to_sw_lapic_timer);
+
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now;
@@ -1389,7 +1471,19 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
 	} else if (apic_lvtt_tscdeadline(apic)) {
-		start_sw_tscdeadline(apic);
+		/* lapic timer in tsc deadline mode */
+		if (kvm_x86_ops->set_hv_timer) {
+			if (unlikely(!apic->lapic_timer.tscdeadline ||
+					!apic->vcpu->arch.virtual_tsc_khz))
+				return;
+
+			/* Expired timer will be checked on vcpu_run() */
+			apic->lapic_timer.hv_timer_state =
+				HV_TIMER_NEEDS_ARMING;
+			trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+					apic->lapic_timer.hv_timer_state);
+		} else
+			start_sw_tscdeadline(apic, 0);
 	}
 }
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..dc4fd8eea04d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -12,6 +12,12 @@
 #define KVM_APIC_SHORT_MASK	0xc0000
 #define KVM_APIC_DEST_MASK	0x800
 
+enum {
+	HV_TIMER_NOT_USED,
+	HV_TIMER_NEEDS_ARMING,
+	HV_TIMER_ARMED,
+};
+
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
@@ -20,6 +26,7 @@ struct kvm_timer {
 	u64 tscdeadline;
 	u64 expired_tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
+	int hv_timer_state;
 };
 
 struct kvm_lapic {
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
 int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
 			const unsigned long *bitmap, u32 bitmap_size);
+void switch_to_sw_lapic_timer(struct kvm_vcpu *vcpu);
+void switch_to_hv_lapic_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_arm_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 #endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5c..8e1b5e9c2e78 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -6,6 +6,7 @@
 #include <asm/svm.h>
 #include <asm/clocksource.h>
 #include <asm/pvclock-abi.h>
+#include "lapic.h"
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
@@ -1348,6 +1349,27 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
 		  __entry->vec)
 );
 
+#define kvm_trace_symbol_hv_timer		\
+	{HV_TIMER_NOT_USED, "no"},		\
+	{HV_TIMER_NEEDS_ARMING, "need_arming"},	\
+	{HV_TIMER_ARMED, "armed"}
+
+TRACE_EVENT(kvm_hv_timer_state,
+		TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_state),
+		TP_ARGS(vcpu_id, hv_timer_state),
+		TP_STRUCT__entry(
+			__field(unsigned int, vcpu_id)
+			__field(unsigned int, hv_timer_state)
+			),
+		TP_fast_assign(
+			__entry->vcpu_id = vcpu_id;
+			__entry->hv_timer_state = hv_timer_state;
+			),
+		TP_printk("vcpu_id %x hv_timer %s",
+			__entry->vcpu_id,
+			__print_symbolic(__entry->hv_timer_state,
+				kvm_trace_symbol_hv_timer))
+	   );
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b29afa61715..dc0b8cae02b8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7576,6 +7576,12 @@ static int handle_pcommit(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+	kvm_lapic_expired_hv_timer(vcpu);
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7627,6 +7633,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
+	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -8678,6 +8685,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	kvm_lapic_arm_hv_timer(vcpu);
+
 	if (vmx->guest_pkru_valid)
 		__write_pkru(vmx->guest_pkru);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 269d576520ba..f4b50608568a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7724,11 +7724,15 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 
 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 {
+	if (kvm_x86_ops->set_hv_timer)
+		switch_to_hv_lapic_timer(vcpu);
 	kvm_x86_ops->sched_in(vcpu, cpu);
 }
 
 void kvm_arch_sched_out(struct kvm_vcpu *vcpu)
 {
+	if (kvm_x86_ops->set_hv_timer)
+		switch_to_sw_lapic_timer(vcpu);
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
-- 
1.8.3.1
