Thanks to a set of recently added timekeeper functions that provide the
cycle stamp along with the kernel time, we can now get time values
directly from the kernel instead of maintaining a shadow copy of the
timekeeper data structures. This reduces the overhead and complexity of
the KVM code and makes the time operations clearer.

Signed-off-by: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/trace.h            |  27 +++--
 arch/x86/kvm/x86.c              | 259 ++++++++++++----------------------------
 3 files changed, 89 insertions(+), 199 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605e..27a2df9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,7 +781,7 @@ struct kvm_arch {
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
 
-	spinlock_t pvclock_gtod_sync_lock;
+	spinlock_t masterclock_lock;
 	bool use_master_clock;
 	u64 master_kernel_ns;
 	u64 master_cycle_now;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc67..5ed12fe 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -812,40 +812,41 @@ TRACE_EVENT(kvm_write_tsc_offset,
 	{VCLOCK_TSC, "tsc"} \
 
 TRACE_EVENT(kvm_update_master_clock,
-	TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
-	TP_ARGS(use_master_clock, host_clock, offset_matched),
+	TP_PROTO(bool use_master_clock, bool host_clock_stable,
+		 bool offset_matched),
+	TP_ARGS(use_master_clock, host_clock_stable, offset_matched),
 
 	TP_STRUCT__entry(
 		__field( bool, use_master_clock )
-		__field( unsigned int, host_clock )
+		__field( bool, host_clock_stable )
 		__field( bool, offset_matched )
 	),
 
 	TP_fast_assign(
 		__entry->use_master_clock = use_master_clock;
-		__entry->host_clock = host_clock;
+		__entry->host_clock_stable = host_clock_stable;
 		__entry->offset_matched = offset_matched;
 	),
 
-	TP_printk("masterclock %d hostclock %s offsetmatched %u",
+	TP_printk("masterclock %d hostclock stable %u offsetmatched %u",
 		  __entry->use_master_clock,
-		  __print_symbolic(__entry->host_clock, host_clocks),
+		  __entry->host_clock_stable,
 		  __entry->offset_matched)
 );
 
 TRACE_EVENT(kvm_track_tsc,
 	TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
 		 unsigned int online_vcpus, bool use_master_clock,
-		 unsigned int host_clock),
+		 bool host_clock_stable),
 	TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
-		host_clock),
+		host_clock_stable),
 
 	TP_STRUCT__entry(
 		__field( unsigned int, vcpu_id )
 		__field( unsigned int, nr_vcpus_matched_tsc )
 		__field( unsigned int, online_vcpus )
 		__field( bool, use_master_clock )
-		__field( unsigned int, host_clock )
+		__field( bool, host_clock_stable )
 	),
 
 	TP_fast_assign(
@@ -853,14 +854,14 @@ TRACE_EVENT(kvm_track_tsc,
 		__entry->nr_vcpus_matched_tsc = nr_matched;
 		__entry->online_vcpus = online_vcpus;
 		__entry->use_master_clock = use_master_clock;
-		__entry->host_clock = host_clock;
+		__entry->host_clock_stable = host_clock_stable;
 	),
 
-	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
-		  " hostclock %s",
+	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u "
+		  "hostclock stable %u",
 		  __entry->vcpu_id, __entry->use_master_clock,
 		  __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
-		  __print_symbolic(__entry->host_clock, host_clocks))
+		  __entry->host_clock_stable)
 );
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0..ce491bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -50,7 +50,7 @@
 #include <linux/hash.h>
 #include <linux/pci.h>
 #include <linux/timekeeper_internal.h>
-#include <linux/pvclock_gtod.h>
+#include <linux/cs_notifier.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
@@ -1131,50 +1131,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 	return kvm_set_msr(vcpu, &msr);
 }
 
-#ifdef CONFIG_X86_64
-struct pvclock_gtod_data {
-	seqcount_t seq;
-
-	struct { /* extract of a clocksource struct */
-		int vclock_mode;
-		u64 cycle_last;
-		u64 mask;
-		u32 mult;
-		u32 shift;
-	} clock;
-
-	u64 boot_ns;
-	u64 nsec_base;
-	u64 wall_time_sec;
-};
-
-static struct pvclock_gtod_data pvclock_gtod_data;
-
-static void update_pvclock_gtod(struct timekeeper *tk)
-{
-	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
-	u64 boot_ns;
-
-	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy pvclock gtod data */
-	vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
-	vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
-	vdata->clock.mask = tk->tkr_mono.mask;
-	vdata->clock.mult = tk->tkr_mono.mult;
-	vdata->clock.shift = tk->tkr_mono.shift;
-
-	vdata->boot_ns = boot_ns;
-	vdata->nsec_base = tk->tkr_mono.xtime_nsec;
-
-	vdata->wall_time_sec = tk->xtime_sec;
-
-	write_seqcount_end(&vdata->seq);
-}
-#endif
-
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1266,10 +1222,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
-#ifdef CONFIG_X86_64
-static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
-#endif
-
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 static unsigned long max_tsc_khz;
 
@@ -1358,12 +1310,32 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
+#ifdef CONFIG_X86_64
+static bool clocksource_stable(void)
+{
+	return get_tk_mono_clock_mode() == VCLOCK_TSC;
+}
+
+static bool clocksource_stability_check(void)
+{
+	unsigned int seq;
+	const seqcount_t *s = get_tk_seq();
+	bool stable;
+
+	do {
+		seq = read_seqcount_begin(s);
+		stable = clocksource_stable();
+	} while (unlikely(read_seqcount_retry(s, seq)));
+
+	return stable;
+}
+#endif
+
 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	bool vcpus_matched;
+	bool vcpus_matched, clocksource_stable;
 	struct kvm_arch *ka = &vcpu->kvm->arch;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			 atomic_read(&vcpu->kvm->online_vcpus));
 
@@ -1376,13 +1348,14 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 	 * and the vcpus need to have matched TSCs. When that happens,
 	 * perform request to enable masterclock.
 	 */
+	clocksource_stable = clocksource_stability_check();
 	if (ka->use_master_clock ||
-			(gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+			(clocksource_stable && vcpus_matched))
 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
-			    atomic_read(&vcpu->kvm->online_vcpus),
-			    ka->use_master_clock, gtod->clock.vclock_mode);
+			atomic_read(&vcpu->kvm->online_vcpus),
+			ka->use_master_clock, clocksource_stable);
 #endif
 }
 
@@ -1535,7 +1508,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock(&kvm->arch.masterclock_lock);
 	if (!matched) {
 		kvm->arch.nr_vcpus_matched_tsc = 0;
 	} else if (!already_matched) {
@@ -1543,7 +1516,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	}
 
 	kvm_track_tsc_matching(vcpu);
-	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_unlock(&kvm->arch.masterclock_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
@@ -1563,99 +1536,41 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 }
 
 #ifdef CONFIG_X86_64
-
-static u64 read_tsc(void)
+static bool kvm_get_host_time_and_cycles(s64 *kernel_ns, u64 *cycle_now,
+					 u64 (*get_time)(u64 *cycle_now))
 {
-	u64 ret = (u64)rdtsc_ordered();
-	u64 last = pvclock_gtod_data.clock.cycle_last;
+	unsigned int seq;
+	const seqcount_t *s = get_tk_seq();
+	bool stable;
 
-	if (likely(ret >= last))
-		return ret;
+	do {
+		seq = read_seqcount_begin(s);
+		stable = clocksource_stable();
+		if (stable)
+			*kernel_ns = get_time(cycle_now);
+	} while (unlikely(read_seqcount_retry(s, seq)));
 
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a function of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead. I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-
-static inline u64 vgettsc(u64 *cycle_now)
-{
-	long v;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-
-	*cycle_now = read_tsc();
-
-	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
-	return v * gtod->clock.mult;
-}
-
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-		ns += gtod->boot_ns;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	*t = ns;
-
-	return mode;
-}
-
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ts->tv_sec = gtod->wall_time_sec;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-
-	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
-
-	return mode;
+	return stable;
 }
 
 /* returns true if host is using tsc clocksource */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
-
-	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+	return kvm_get_host_time_and_cycles(
+		kernel_ns, cycle_now, ktime_get_boot_ns_with_cycles);
 }
 
 /* returns true if host is using tsc clocksource */
-static bool kvm_get_walltime_and_clockread(struct timespec *ts,
-					   u64 *cycle_now)
+static bool kvm_get_walltime_and_clockread(struct timespec *ts, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
+	bool res;
+	s64 kernel_ns;
 
-	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+	res = kvm_get_host_time_and_cycles(
+		&kernel_ns, cycle_now, ktime_get_real_ns_with_cycles);
+	*ts = ktime_to_timespec(kernel_ns);
+
+	return res;
 }
 #endif
 
@@ -1700,12 +1615,11 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
  *
  */
 
-static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+static void update_masterclock(struct kvm *kvm)
 {
 #ifdef CONFIG_X86_64
 	struct kvm_arch *ka = &kvm->arch;
-	int vclock_mode;
-	bool host_tsc_clocksource, vcpus_matched;
+	bool host_clocksource_stable, vcpus_matched;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			 atomic_read(&kvm->online_vcpus));
 
@@ -1714,20 +1628,16 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	host_tsc_clocksource = kvm_get_time_and_clockread(
+	host_clocksource_stable = kvm_get_time_and_clockread(
 					&ka->master_kernel_ns,
 					&ka->master_cycle_now);
 
-	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+	ka->use_master_clock = host_clocksource_stable && vcpus_matched
 				&& !backwards_tsc_observed
 				&& !ka->boot_vcpu_runs_old_kvmclock;
 
-	if (ka->use_master_clock)
-		atomic_set(&kvm_guest_has_master_clock, 1);
-
-	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
-	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
-					vcpus_matched);
+	trace_kvm_update_master_clock(ka->use_master_clock,
+				      host_clocksource_stable, vcpus_matched);
 #endif
 }
 
@@ -1743,10 +1653,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 	struct kvm_arch *ka = &kvm->arch;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	kvm_make_mclock_inprogress_request(kvm);
 	/* no guest entries from this point */
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock(kvm);
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1755,7 +1665,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 #endif
 }
 
@@ -1765,15 +1675,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	struct pvclock_vcpu_time_info hv_clock;
 	u64 ret;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	if (!ka->use_master_clock) {
-		spin_unlock(&ka->pvclock_gtod_sync_lock);
+		spin_unlock(&ka->masterclock_lock);
 		return ktime_get_boot_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
 	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 
 	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
 	get_cpu();
@@ -1859,13 +1769,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	use_master_clock = ka->use_master_clock;
 	if (use_master_clock) {
 		host_tsc = ka->master_cycle_now;
 		kernel_ns = ka->master_kernel_ns;
 	}
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
@@ -6015,7 +5925,8 @@ static void kvm_set_mmio_spte_mask(void)
 }
 
 #ifdef CONFIG_X86_64
-static void pvclock_gtod_update_fn(struct work_struct *work)
+static int process_clocksource_change(struct notifier_block *nb,
+				unsigned long unused0, void *unused1)
 {
 	struct kvm *kvm;
@@ -6026,35 +5937,13 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-	atomic_set(&kvm_guest_has_master_clock, 0);
 	spin_unlock(&kvm_lock);
-}
-
-static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
-
-/*
- * Notification about pvclock gtod data update.
- */
-static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
-			void *priv)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	struct timekeeper *tk = priv;
-
-	update_pvclock_gtod(tk);
-
-	/* disable master clock if host does not trust, or does not
-	 * use, TSC clocksource
-	 */
-	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
-	    atomic_read(&kvm_guest_has_master_clock) != 0)
-		queue_work(system_long_wq, &pvclock_gtod_work);
-
 	return 0;
 }
-
-static struct notifier_block pvclock_gtod_notifier = {
-	.notifier_call = pvclock_gtod_notify,
+
+static struct notifier_block clocksource_changes_notifier = {
+	.notifier_call = process_clocksource_change,
 };
 #endif
 
@@ -6107,7 +5996,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_lapic_init();
 #ifdef CONFIG_X86_64
-	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+	clocksource_changes_register_notifier(&clocksource_changes_notifier);
 #endif
 
 	return 0;
@@ -6128,7 +6017,7 @@ void kvm_arch_exit(void)
 					    CPUFREQ_TRANSITION_NOTIFIER);
 	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 #ifdef CONFIG_X86_64
-	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+	clocksource_changes_unregister_notifier(&clocksource_changes_notifier);
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
@@ -8031,10 +7920,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
 	mutex_init(&kvm->arch.hyperv.hv_lock);
-	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock_init(&kvm->arch.masterclock_lock);
 
 	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
-- 
2.7.4
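
Note for reviewers (not part of the patch): clocksource_stability_check() and
kvm_get_host_time_and_cycles() above follow the usual sequence-counter read
pattern — take a snapshot of the counter, read the timekeeper-derived data,
and retry if a concurrent update ran in between. Below is a minimal,
self-contained userspace sketch of that pattern; every name in it (tk_seq,
tk_mono_clock_mode, read_begin, read_retry) is an illustrative stand-in and
does not exist in the kernel.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint tk_seq;              /* even: no update in progress */
static int tk_mono_clock_mode = 1;      /* stand-in for VCLOCK_TSC */

static unsigned int read_begin(void)
{
	unsigned int seq;

	/* wait until no writer holds the counter (odd value) */
	while ((seq = atomic_load_explicit(&tk_seq, memory_order_acquire)) & 1)
		;
	return seq;
}

static bool read_retry(unsigned int seq)
{
	/* order the data reads before re-checking the counter */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&tk_seq, memory_order_relaxed) != seq;
}

int main(void)
{
	unsigned int seq;
	bool stable;

	do {
		seq = read_begin();
		stable = (tk_mono_clock_mode == 1);
	} while (read_retry(seq));

	printf("clocksource stable: %d\n", stable);
	return 0;
}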