Get rid of the complex shadow monotonic timekeeper (pvclock_gtod_data)
in KVM and extend the core timekeeper infrastructure instead. This also
makes kvm-clock stable in L2: the masterclock can now be enabled when
the L1 host clocksource is a stable paravirtualized clock, not only
when it is the TSC.

Signed-off-by: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/x86.c              | 294 +++++++++++-----------------------------
 include/linux/timekeeping.h     |  19 ++-
 kernel/time/timekeeping.c       |  22 ++-
 4 files changed, 117 insertions(+), 220 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605e..66d678c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,7 +781,7 @@ struct kvm_arch {
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
 
-	spinlock_t pvclock_gtod_sync_lock;
+	spinlock_t master_clock_lock;
 	bool use_master_clock;
 	u64 master_kernel_ns;
 	u64 master_cycle_now;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 87d3cb9..49ae57fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -50,6 +50,7 @@
 #include <linux/hash.h>
 #include <linux/pci.h>
 #include <linux/timekeeper_internal.h>
+#include <linux/timekeeping.h>
 #include <linux/pvclock_gtod.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
@@ -1131,50 +1132,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 	return kvm_set_msr(vcpu, &msr);
 }
 
-#ifdef CONFIG_X86_64
-struct pvclock_gtod_data {
-	seqcount_t seq;
-
-	struct { /* extract of a clocksource struct */
-		int vclock_mode;
-		u64 cycle_last;
-		u64 mask;
-		u32 mult;
-		u32 shift;
-	} clock;
-
-	u64 boot_ns;
-	u64 nsec_base;
-	u64 wall_time_sec;
-};
-
-static struct pvclock_gtod_data pvclock_gtod_data;
-
-static void update_pvclock_gtod(struct timekeeper *tk)
-{
-	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
-	u64 boot_ns;
-
-	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy pvclock gtod data */
-	vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
-	vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
-	vdata->clock.mask = tk->tkr_mono.mask;
-	vdata->clock.mult = tk->tkr_mono.mult;
-	vdata->clock.shift = tk->tkr_mono.shift;
-
-	vdata->boot_ns = boot_ns;
-	vdata->nsec_base = tk->tkr_mono.xtime_nsec;
-
-	vdata->wall_time_sec = tk->xtime_sec;
-
-	write_seqcount_end(&vdata->seq);
-}
-#endif
-
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1266,10 +1223,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
-#ifdef CONFIG_X86_64
-static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
-#endif
-
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 static unsigned long max_tsc_khz;
 
@@ -1358,31 +1311,52 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
+static bool pvclock_stable(void)
+{
+	struct pvclock_vcpu_time_info *pvti = &pvclock_pvti_cpu0_va()->pvti;
+	return pvclock_read_flags(pvti) & PVCLOCK_TSC_STABLE_BIT;
+}
+
 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
 	bool vcpus_matched;
 	struct kvm_arch *ka = &vcpu->kvm->arch;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	unsigned seq;
+	const seqcount_t *s = get_tk_seq();
+	int vclock_mode;
+	bool stable;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			 atomic_read(&vcpu->kvm->online_vcpus));
 
-	/*
-	 * Once the masterclock is enabled, always perform request in
-	 * order to update it.
-	 *
-	 * In order to enable masterclock, the host clocksource must be TSC
-	 * and the vcpus need to have matched TSCs. When that happens,
-	 * perform request to enable masterclock.
-	 */
-	if (ka->use_master_clock ||
-	    (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+	do {
+		seq = read_seqcount_begin(s);
+		vclock_mode = get_tk_mono_clock_mode();
+		stable = false;
+		/*
+		 * Once the masterclock is enabled, always perform request in
+		 * order to update it.
+		 *
+		 * In order to enable masterclock, the host clocksource must
+		 * be TSC or a stable paravirtualized clocksource and the
+		 * vcpus need to have matched TSCs.
+		 * When that happens, perform request to enable masterclock.
+		 */
+		if (ka->use_master_clock ||
+		    ((vclock_mode == VCLOCK_TSC ||
+		      (vclock_mode == VCLOCK_PVCLOCK && pvclock_stable())) &&
+		     vcpus_matched))
+			stable = true;
+	} while (unlikely(read_seqcount_retry(s, seq)));
+
+	if (stable)
 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
 			    atomic_read(&vcpu->kvm->online_vcpus),
-			    ka->use_master_clock, gtod->clock.vclock_mode);
+			    ka->use_master_clock, vclock_mode);
 #endif
 }
 
@@ -1535,7 +1509,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock(&kvm->arch.master_clock_lock);
 	if (!matched) {
 		kvm->arch.nr_vcpus_matched_tsc = 0;
 	} else if (!already_matched) {
@@ -1543,7 +1517,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	}
 
 	kvm_track_tsc_matching(vcpu);
-	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_unlock(&kvm->arch.master_clock_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
@@ -1563,99 +1537,45 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 }
 
 #ifdef CONFIG_X86_64
-
-static u64 read_tsc(void)
-{
-	u64 ret = (u64)rdtsc_ordered();
-	u64 last = pvclock_gtod_data.clock.cycle_last;
-
-	if (likely(ret >= last))
-		return ret;
-
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a function of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead.  I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-
-static inline u64 vgettsc(u64 *cycle_now)
-{
-	long v;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-
-	*cycle_now = read_tsc();
-
-	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
-	return v * gtod->clock.mult;
-}
-
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-		ns += gtod->boot_ns;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	*t = ns;
-
-	return mode;
-}
-
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
+/* returns true if the host clocksource is TSC or a stable pv clock */
+static bool kvm_get_host_time_and_cycles(s64 *kernel_ns, u64 *cycle_now,
+					 u64 (*get_time)(u64 *cycle_now))
 {
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ts->tv_sec = gtod->wall_time_sec;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-
-	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
+	unsigned seq;
+	const seqcount_t *s = get_tk_seq();
+	int vclock_mode;
+	bool res;
+
+	do {
+		res = false;
+		seq = read_seqcount_begin(s);
+		vclock_mode = get_tk_mono_clock_mode();
+		if (vclock_mode == VCLOCK_TSC ||
+		    (vclock_mode == VCLOCK_PVCLOCK && pvclock_stable())) {
+			*kernel_ns = get_time(cycle_now);
+			res = true;
+		}
+	} while (unlikely(read_seqcount_retry(s, seq)));
 
-	return mode;
+	return res;
 }
 
-/* returns true if host is using tsc clocksource */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
-
-	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+	return kvm_get_host_time_and_cycles(
+		kernel_ns, cycle_now, ktime_get_boot_ns_with_cycles);
 }
 
-/* returns true if host is using tsc clocksource */
-static bool kvm_get_walltime_and_clockread(struct timespec *ts,
-					   u64 *cycle_now)
+static bool kvm_get_walltime_and_clockread(struct timespec *ts, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
+	bool res;
+	s64 kernel_ns;
 
-	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+	res = kvm_get_host_time_and_cycles(
+		&kernel_ns, cycle_now, ktime_get_real_ns_with_cycles);
+	*ts = ktime_to_timespec(kernel_ns);
+
+	return res;
 }
 #endif
@@ -1700,19 +1620,18 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
  *
  */
 
-static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+static void update_masterclock_data(struct kvm *kvm)
 {
 #ifdef CONFIG_X86_64
 	struct kvm_arch *ka = &kvm->arch;
-	int vclock_mode;
 	bool host_tsc_clocksource, vcpus_matched;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			atomic_read(&kvm->online_vcpus));
 
 	/*
-	 * If the host uses TSC clock, then passthrough TSC as stable
-	 * to the guest.
+	 * If the host uses TSC clock or a stable paravirtualized clock,
+	 * then passthrough the clock as stable to the guest.
 	 */
 	host_tsc_clocksource = kvm_get_time_and_clockread(
 					&ka->master_kernel_ns,
@@ -1721,13 +1640,6 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
 				&& !backwards_tsc_observed
 				&& !ka->boot_vcpu_runs_old_kvmclock;
-
-	if (ka->use_master_clock)
-		atomic_set(&kvm_guest_has_master_clock, 1);
-
-	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
-	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
-					vcpus_matched);
 #endif
 }
 
@@ -1743,10 +1655,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 	struct kvm_arch *ka = &kvm->arch;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->master_clock_lock);
 	kvm_make_mclock_inprogress_request(kvm);
 	/* no guest entries from this point */
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock_data(kvm);
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1755,7 +1667,7 @@
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->master_clock_lock);
 #endif
 }
 
@@ -1765,15 +1677,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	struct pvclock_vcpu_time_info hv_clock;
 	u64 ret;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->master_clock_lock);
 	if (!ka->use_master_clock) {
-		spin_unlock(&ka->pvclock_gtod_sync_lock);
+		spin_unlock(&ka->master_clock_lock);
 		return ktime_get_boot_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
 	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->master_clock_lock);
 
 	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
 	get_cpu();
@@ -1859,13 +1771,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->master_clock_lock);
 	use_master_clock = ka->use_master_clock;
 	if (use_master_clock) {
 		host_tsc = ka->master_cycle_now;
 		kernel_ns = ka->master_kernel_ns;
 	}
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->master_clock_lock);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
@@ -6012,50 +5924,6 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask);
 }
 
-#ifdef CONFIG_X86_64
-static void pvclock_gtod_update_fn(struct work_struct *work)
-{
-	struct kvm *kvm;
-
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	spin_lock(&kvm_lock);
-	list_for_each_entry(kvm, &vm_list, vm_list)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-	atomic_set(&kvm_guest_has_master_clock, 0);
-	spin_unlock(&kvm_lock);
-}
-
-static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
-
-/*
- * Notification about pvclock gtod data update.
- */
-static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
-			       void *priv)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	struct timekeeper *tk = priv;
-
-	update_pvclock_gtod(tk);
-
-	/* disable master clock if host does not trust, or does not
-	 * use, TSC clocksource
-	 */
-	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
-	    atomic_read(&kvm_guest_has_master_clock) != 0)
-		queue_work(system_long_wq, &pvclock_gtod_work);
-
-	return 0;
-}
-
-static struct notifier_block pvclock_gtod_notifier = {
-	.notifier_call = pvclock_gtod_notify,
-};
-#endif
-
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -6104,9 +5972,6 @@ int kvm_arch_init(void *opaque)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
-#ifdef CONFIG_X86_64
-	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
-#endif
 
 	return 0;
 
@@ -6125,9 +5990,6 @@ void kvm_arch_exit(void)
 	cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
 	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
-#ifdef CONFIG_X86_64
-	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
-#endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 	free_percpu(shared_msrs);
@@ -8029,10 +7891,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
 	mutex_init(&kvm->arch.hyperv.hv_lock);
-	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock_init(&kvm->arch.master_clock_lock);
 
 	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock_data(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 31df92c..b0a06b0 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -171,7 +171,10 @@ enum tk_offsets {
 };
 
 extern ktime_t ktime_get(void);
+extern ktime_t ktime_get_with_cycles(u64 *cycles);
 extern ktime_t ktime_get_with_offset(enum tk_offsets offs, u64 *cycles);
+extern const seqcount_t *get_tk_seq(void);
+extern int get_tk_mono_clock_mode(void);
 extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
 extern ktime_t ktime_get_raw(void);
 extern u32 ktime_get_resolution_ns(void);
@@ -184,6 +187,10 @@ static inline ktime_t ktime_get_real(void)
 	return ktime_get_with_offset(TK_OFFS_REAL, NULL);
 }
 
+static inline ktime_t ktime_get_real_with_cycles(u64 *cycles)
+{
+	return ktime_get_with_offset(TK_OFFS_REAL, cycles);
+}
 /**
  * ktime_get_boottime - Returns monotonic time since boot in ktime_t format
  *
@@ -220,17 +227,27 @@ static inline u64 ktime_get_ns(void)
 	return ktime_to_ns(ktime_get());
 }
 
+static inline u64 ktime_get_ns_with_cycles(u64 *cycles)
+{
+	return ktime_to_ns(ktime_get_with_cycles(cycles));
+}
+
 static inline u64 ktime_get_real_ns(void)
 {
 	return ktime_to_ns(ktime_get_real());
 }
 
+static inline u64 ktime_get_real_ns_with_cycles(u64 *cycles)
+{
+	return ktime_to_ns(ktime_get_real_with_cycles(cycles));
+}
+
 static inline u64 ktime_get_boot_ns(void)
 {
 	return ktime_to_ns(ktime_get_boottime());
 }
 
-static inline u64 ktime_get_boot_ns_and_cycles(u64 *cycles)
+static inline u64 ktime_get_boot_ns_with_cycles(u64 *cycles)
 {
 	return ktime_to_ns(ktime_get_boottime_and_cycles(cycles));
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 994f83b..7dbcac4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -717,6 +717,12 @@ EXPORT_SYMBOL(getnstimeofday64);
 
 ktime_t ktime_get(void)
 {
+	return ktime_get_with_cycles(NULL);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+ktime_t ktime_get_with_cycles(u64 *cycles)
+{
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
 	ktime_t base;
@@ -727,13 +733,13 @@ ktime_t ktime_get(void)
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 		base = tk->tkr_mono.base;
-		nsecs = timekeeping_get_ns(&tk->tkr_mono, NULL);
+		nsecs = timekeeping_get_ns(&tk->tkr_mono, cycles);
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
 	return ktime_add_ns(base, nsecs);
 }
-EXPORT_SYMBOL_GPL(ktime_get);
+EXPORT_SYMBOL_GPL(ktime_get_with_cycles);
 
 u32 ktime_get_resolution_ns(void)
 {
@@ -779,6 +785,18 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs, u64 *tsc_stamp)
 }
 EXPORT_SYMBOL_GPL(ktime_get_with_offset);
 
+const seqcount_t *get_tk_seq(void)
+{
+	return &tk_core.seq;
+}
+EXPORT_SYMBOL(get_tk_seq);
+
+int get_tk_mono_clock_mode(void)
+{
+	return tk_core.timekeeper.tkr_mono.clock->archdata.vclock_mode;
+}
+EXPORT_SYMBOL(get_tk_mono_clock_mode);
+
 /**
  * ktime_mono_to_any() - convert mononotic time to any other time
  * @tmono:	time to convert.
-- 
2.7.4
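
P.S. for reviewers, illustration only (not part of the patch): below is a
minimal sketch of how a caller outside the timekeeper is expected to consume
the new interface, i.e. sample the clocksource mode and a boot-time/cycle pair
inside one timekeeper seqcount read section so both values come from the same
timekeeper update. It mirrors kvm_get_host_time_and_cycles() /
kvm_get_time_and_clockread() above; the helper name sample_boot_ns_and_cycles()
is made up for the example and does not exist anywhere in the series.

#include <linux/seqlock.h>
#include <linux/timekeeping.h>
#include <asm/clocksource.h>	/* VCLOCK_TSC on x86 */

/*
 * Illustrative only: return a (boot_ns, cycles) pair that is consistent
 * with a single timekeeper update, and report whether the host
 * clocksource is the TSC, so the pair can back a stable masterclock.
 */
static bool sample_boot_ns_and_cycles(s64 *boot_ns, u64 *cycles)
{
	const seqcount_t *s = get_tk_seq();
	unsigned int seq;
	bool usable;

	do {
		usable = false;
		seq = read_seqcount_begin(s);
		if (get_tk_mono_clock_mode() == VCLOCK_TSC) {
			*boot_ns = ktime_get_boot_ns_with_cycles(cycles);
			usable = true;
		}
	} while (read_seqcount_retry(s, seq));

	return usable;
}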