[PATCH 3/3] x86/kvm: implement Hyper-V reference TSC page clock

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hyper-V's paravirtualized clock -- reference TSC page -- is similar to
kvm_clock with a few important differences:

 - it uses slightly different arithmetic expression for the time in the
   guest, and therefore needs different values to be stored in the guest
   memory

 - the sequence number doesn't have the seqlock semantics; however, its
   value of zero indicates that the clock is in invalid state and the
   guest should proceed to alternative clock sources.  This is used to
   make the guest skip this clock source when it's being updated by the
   host [suggested by Paolo Bonzini].

 - there's a single reference TSC page per VM, therefore the update of
   its contents is bound to VCPU #0 only

In all other respects the clocks are similar, so implement Hyper-V
reference TSC page by updating its contents at the same point and using
the same data as that for kvm_clock.

Based on the patches and ideas by Andrey Smetanin and Paolo Bonzini.

Signed-off-by: Roman Kagan <rkagan@xxxxxxxxxxxxx>
---
Note: the new clocksource demonstrates a drift of ~15 ppm on my test
machine; it's identical to the drift of kvm_clock, and is a bug in the
calculation of kvm_clock parameters which I'm still chasing, and which
affects both clocks.

 arch/x86/kvm/hyperv.c | 123 ++++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/hyperv.h |   3 ++
 arch/x86/kvm/trace.h  |  25 ++++++++++
 arch/x86/kvm/x86.c    |   7 ++-
 4 files changed, 141 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2641907..dd453a7 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <asm/apicdef.h>
 #include <trace/events/kvm.h>
+#include <asm/pvclock.h>
 
 #include "trace.h"
 
@@ -797,23 +798,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 		break;
-	}
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 		return kvm_hv_msr_set_crash_data(vcpu,
 						 msr - HV_X64_MSR_CRASH_P0,
@@ -1143,3 +1132,107 @@ set_result:
 	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
+
+bool kvm_hv_tscpage_need_update(struct kvm_vcpu *v)
+{
+	return v == kvm_get_vcpu(v->kvm, 0) &&
+		(v->kvm->arch.hyperv.hv_tsc_page &
+		 HV_X64_MSR_TSC_REFERENCE_ENABLE);
+}
+
+static int pvclock_to_tscpage(struct pvclock_vcpu_time_info *hv_clock,
+			       HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+#ifdef CONFIG_X86_64
+	/*
+	 * kvm_clock:
+	 * nsec = ((((rdtsc - tsc_timestamp) << tsc_shift) *
+	 *          tsc_to_system_mul) >> 32) + system_time
+	 *
+	 * hyperv ref tsc page:
+	 * nsec / 100 = ((rdtsc * tsc_scale) >> 64) + tsc_offset
+	 *
+	 * this gives:
+	 *
+	 * tsc_scale = (tsc_to_system_mul << (tsc_shift + 32)) / 100
+	 *
+	 * tsc_offset = (system_time -
+	 *               pvclock_scale_delta(tsc_timestamp, tsc_to_system_mul
+	 *                                   tsc_shift)) / 100
+	 *
+	 * Note that although tsc_to_system_mul is 32 bit, we may need 128 bit
+	 * division to calculate tsc_scale
+	 */
+	u64 tsc_scale_hi, tsc_scale_lo;
+	s64 tsc_offset;
+
+	/* impossible by definition of tsc_shift but we'd better check */
+	if (hv_clock->tsc_shift >= 32 || hv_clock->tsc_shift <= -32)
+		return -ERANGE;
+
+	tsc_scale_lo = hv_clock->tsc_to_system_mul;
+	tsc_scale_hi = tsc_scale_lo >> (32 - hv_clock->tsc_shift);
+	/* check if division will overflow */
+	if (tsc_scale_hi >= HV_NSEC_PER_TICK)
+		return -ERANGE;
+	tsc_scale_lo <<= (32 + hv_clock->tsc_shift);
+
+	__asm__ ("divq %[divisor]"
+		 : "+a"(tsc_scale_lo), "+d"(tsc_scale_hi)
+		 : [divisor]"rm"((u64)HV_NSEC_PER_TICK));
+	tsc_ref->tsc_scale = tsc_scale_lo;
+
+	tsc_offset = hv_clock->system_time;
+	tsc_offset -= pvclock_scale_delta(hv_clock->tsc_timestamp,
+					  hv_clock->tsc_to_system_mul,
+					  hv_clock->tsc_shift);
+	tsc_ref->tsc_offset = div_s64(tsc_offset, HV_NSEC_PER_TICK);
+
+	return 0;
+#else
+	return -ERANGE;
+#endif
+}
+
+void kvm_hv_tscpage_update(struct kvm_vcpu *v, bool use_master_clock)
+{
+	struct kvm *kvm = v->kvm;
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	HV_REFERENCE_TSC_PAGE tsc_ref;
+	u32 tsc_seq;
+	gpa_t gpa;
+	u64 gfn;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	gpa = gfn_to_gpa(gfn);
+
+	if (kvm_read_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref)))
+		return;
+
+	/*
+	 * mark tsc page invalid either for the duration of contents update or
+	 * for good if it can't be used
+	 */
+	tsc_seq = tsc_ref.tsc_sequence;
+	tsc_ref.tsc_sequence = 0;
+
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref.tsc_sequence));
+	smp_wmb();
+
+	if (!use_master_clock)
+		return;
+
+	if (pvclock_to_tscpage(&v->arch.hv_clock, &tsc_ref))
+		return;
+
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref));
+	smp_wmb();
+
+	tsc_ref.tsc_sequence = tsc_seq + 1;
+	if (tsc_ref.tsc_sequence == 0xffffffff || tsc_ref.tsc_sequence == 0)
+		tsc_ref.tsc_sequence = 1;
+	trace_kvm_hv_tscpage_update(&tsc_ref);
+
+	kvm_write_guest(kvm, gpa, &tsc_ref, sizeof(tsc_ref.tsc_sequence));
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4..c2fd494 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+bool kvm_hv_tscpage_need_update(struct kvm_vcpu *v);
+void kvm_hv_tscpage_update(struct kvm_vcpu *v, bool use_master_clock);
+
 #endif
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2f1ea2f..7f76c87 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1292,6 +1292,31 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
 		  __entry->vcpu_id, __entry->timer_index)
 );
 
+/*
+ * Tracepoint for kvm_hv_tscpage_update.
+ */
+TRACE_EVENT(kvm_hv_tscpage_update,
+	TP_PROTO(HV_REFERENCE_TSC_PAGE *tsc_ref),
+	TP_ARGS(tsc_ref),
+
+	TP_STRUCT__entry(
+		__field(u32, tsc_sequence)
+		__field(u64, tsc_scale)
+		__field(s64, tsc_offset)
+	),
+
+	TP_fast_assign(
+		__entry->tsc_sequence = tsc_ref->tsc_sequence;
+		__entry->tsc_scale = tsc_ref->tsc_scale;
+		__entry->tsc_offset = tsc_ref->tsc_offset;
+	),
+
+	TP_printk("tsc_ref_page { sequence %u, scale 0x%llx, offset %lld }",
+		  __entry->tsc_sequence,
+		  __entry->tsc_scale,
+		  __entry->tsc_offset)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a4a32ab..f12130a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1833,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
+	if (!vcpu->pv_time_enabled && !kvm_hv_tscpage_need_update(v))
 		return 0;
 
 	if (kvm_has_tsc_control)
@@ -1851,7 +1851,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	kvm_pvclock_update(v, use_master_clock);
+	if (vcpu->pv_time_enabled)
+		kvm_pvclock_update(v, use_master_clock);
+	if (kvm_hv_tscpage_need_update(v))
+		kvm_hv_tscpage_update(v, use_master_clock);
 	return 0;
 }
 
-- 
2.5.5

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux