Re: [PATCH v4 07/10] KVM: x86: remove not used pvclock_gtod_copy

Hi Denis,

I'm all for this as well. The original submission suggested something
similar, but someone said "use a scheme similar to vsyscalls",
hence the internal copy of the fields.

More comments below.


On Wed, Aug 02, 2017 at 05:38:07PM +0300, Denis Plotnikov wrote:
> Since KVM has been switched to getting masterclock-related data
> directly from the timekeeper by the previous patches, we can now
> remove all the parts related to the old scheme of getting
> masterclock data.
> 
> This patch removes those parts.
> 
> Signed-off-by: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   2 +-
>  arch/x86/kvm/trace.h            |  31 ++----
>  arch/x86/kvm/x86.c              | 216 ++++++----------------------------------
>  3 files changed, 42 insertions(+), 207 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 87ac4fb..91465db 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -791,7 +791,7 @@ struct kvm_arch {
>  	u64 cur_tsc_generation;
>  	int nr_vcpus_matched_tsc;
>  
> -	spinlock_t pvclock_gtod_sync_lock;
> +	spinlock_t masterclock_lock;
>  	bool use_master_clock;
>  	u64 master_kernel_ns;
>  	u64 master_cycle_now;
> diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
> index 0a6cc67..923ab31 100644
> --- a/arch/x86/kvm/trace.h
> +++ b/arch/x86/kvm/trace.h
> @@ -807,45 +807,39 @@ TRACE_EVENT(kvm_write_tsc_offset,
>  
>  #ifdef CONFIG_X86_64
>  
> -#define host_clocks					\
> -	{VCLOCK_NONE, "none"},				\
> -	{VCLOCK_TSC,  "tsc"}				\
> -
>  TRACE_EVENT(kvm_update_master_clock,
> -	TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
> -	TP_ARGS(use_master_clock, host_clock, offset_matched),
> +	TP_PROTO(bool use_master_clock, bool host_clock_stable,
> +		bool offset_matched),
> +	TP_ARGS(use_master_clock, host_clock_stable, offset_matched),
>  
>  	TP_STRUCT__entry(
>  		__field(		bool,	use_master_clock	)
> -		__field(	unsigned int,	host_clock		)
> +		__field(		bool,	host_clock_stable	)
>  		__field(		bool,	offset_matched		)
>  	),
>  
>  	TP_fast_assign(
>  		__entry->use_master_clock	= use_master_clock;
> -		__entry->host_clock		= host_clock;
> +		__entry->host_clock_stable	= host_clock_stable;
>  		__entry->offset_matched		= offset_matched;
>  	),
>  
> -	TP_printk("masterclock %d hostclock %s offsetmatched %u",
> +	TP_printk("masterclock %d hostclock stable %u offsetmatched %u",
>  		  __entry->use_master_clock,
> -		  __print_symbolic(__entry->host_clock, host_clocks),
> +		  __entry->host_clock_stable,
>  		  __entry->offset_matched)
>  );
>  
>  TRACE_EVENT(kvm_track_tsc,
>  	TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
> -		 unsigned int online_vcpus, bool use_master_clock,
> -		 unsigned int host_clock),
> -	TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
> -		host_clock),
> +		 unsigned int online_vcpus, bool use_master_clock),
> +	TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock),
>  
>  	TP_STRUCT__entry(
>  		__field(	unsigned int,	vcpu_id			)
>  		__field(	unsigned int,	nr_vcpus_matched_tsc	)
>  		__field(	unsigned int,	online_vcpus		)
>  		__field(	bool,		use_master_clock	)
> -		__field(	unsigned int,	host_clock		)
>  	),
>  
>  	TP_fast_assign(
> @@ -853,14 +847,11 @@ TRACE_EVENT(kvm_track_tsc,
>  		__entry->nr_vcpus_matched_tsc	= nr_matched;
>  		__entry->online_vcpus		= online_vcpus;
>  		__entry->use_master_clock	= use_master_clock;
> -		__entry->host_clock		= host_clock;
>  	),
>  
> -	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
> -		  " hostclock %s",
> +	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u",
>  		  __entry->vcpu_id, __entry->use_master_clock,
> -		  __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
> -		  __print_symbolic(__entry->host_clock, host_clocks))
> +		  __entry->nr_vcpus_matched_tsc, __entry->online_vcpus)
>  );
>  
>  #endif /* CONFIG_X86_64 */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index d8ec2ca..53754fa 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -50,7 +50,7 @@
>  #include <linux/hash.h>
>  #include <linux/pci.h>
>  #include <linux/timekeeper_internal.h>
> -#include <linux/pvclock_gtod.h>
> +#include <linux/cs_notifier.h>
>  #include <linux/kvm_irqfd.h>
>  #include <linux/irqbypass.h>
>  #include <linux/sched/stat.h>
> @@ -1134,50 +1134,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
>  	return kvm_set_msr(vcpu, &msr);
>  }
>  
> -#ifdef CONFIG_X86_64
> -struct pvclock_gtod_data {
> -	seqcount_t	seq;
> -
> -	struct { /* extract of a clocksource struct */
> -		int vclock_mode;
> -		u64	cycle_last;
> -		u64	mask;
> -		u32	mult;
> -		u32	shift;
> -	} clock;
> -
> -	u64		boot_ns;
> -	u64		nsec_base;
> -	u64		wall_time_sec;
> -};
> -
> -static struct pvclock_gtod_data pvclock_gtod_data;
> -
> -static void update_pvclock_gtod(struct timekeeper *tk)
> -{
> -	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
> -	u64 boot_ns;
> -
> -	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
> -
> -	write_seqcount_begin(&vdata->seq);
> -
> -	/* copy pvclock gtod data */
> -	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
> -	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
> -	vdata->clock.mask		= tk->tkr_mono.mask;
> -	vdata->clock.mult		= tk->tkr_mono.mult;
> -	vdata->clock.shift		= tk->tkr_mono.shift;
> -
> -	vdata->boot_ns			= boot_ns;
> -	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;
> -
> -	vdata->wall_time_sec            = tk->xtime_sec;
> -
> -	write_seqcount_end(&vdata->seq);
> -}
> -#endif
> -
>  void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
>  {
>  	/*
> @@ -1269,10 +1225,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
>  		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
>  }
>  
> -#ifdef CONFIG_X86_64
> -static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
> -#endif
> -
>  static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
>  static unsigned long max_tsc_khz;
>  
> @@ -1366,7 +1318,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
>  #ifdef CONFIG_X86_64
>  	bool vcpus_matched;
>  	struct kvm_arch *ka = &vcpu->kvm->arch;
> -	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
>  
>  	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
>  			 atomic_read(&vcpu->kvm->online_vcpus));
> @@ -1379,13 +1330,12 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
>  	 * and the vcpus need to have matched TSCs.  When that happens,
>  	 * perform request to enable masterclock.
>  	 */
> -	if (ka->use_master_clock ||
> -	    (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
> +	if (ka->use_master_clock || vcpus_matched)
>  		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

Don't drop this. The masterclock scheme requires TSC for proper functioning
(or an analysis of why it is supposed to work with a different clocksource,
HPET+TSC for example).
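
If the internal copy really has to go, the check can still be kept by
asking the timekeeper directly. Rough sketch only, reusing the
kvm_get_time_and_clockread() helper this patch keeps (its out-parameters
are thrown away here, which costs an extra clock read on this path):

static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &vcpu->kvm->arch;
	bool vcpus_matched, host_clock_ok;
	s64 kernel_ns;
	u64 cycle_now;

	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			 atomic_read(&vcpu->kvm->online_vcpus));

	/*
	 * Ask the timekeeper whether the current clocksource is usable
	 * for the masterclock, instead of peeking at the removed
	 * pvclock_gtod_data copy.
	 */
	host_clock_ok = kvm_get_time_and_clockread(&kernel_ns, &cycle_now);

	if (ka->use_master_clock || (host_clock_ok && vcpus_matched))
		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
			    atomic_read(&vcpu->kvm->online_vcpus),
			    ka->use_master_clock);
#endif
}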

>  
>  	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
> -			    atomic_read(&vcpu->kvm->online_vcpus),
> -		            ka->use_master_clock, gtod->clock.vclock_mode);
> +				atomic_read(&vcpu->kvm->online_vcpus),
> +				ka->use_master_clock);
>  #endif
>  }
>  
> @@ -1538,7 +1488,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  	kvm_vcpu_write_tsc_offset(vcpu, offset);
>  	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>  
> -	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
> +	spin_lock(&kvm->arch.masterclock_lock);
>  	if (!matched) {
>  		kvm->arch.nr_vcpus_matched_tsc = 0;
>  	} else if (!already_matched) {
> @@ -1546,9 +1496,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
>  	}
>  
>  	kvm_track_tsc_matching(vcpu);
> -	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
> +	spin_unlock(&kvm->arch.masterclock_lock);
>  }
> -
>  EXPORT_SYMBOL_GPL(kvm_write_tsc);
>  
>  static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
> @@ -1567,79 +1516,6 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
>  
>  #ifdef CONFIG_X86_64
>  
> -static u64 read_tsc(void)
> -{
> -	u64 ret = (u64)rdtsc_ordered();
> -	u64 last = pvclock_gtod_data.clock.cycle_last;
> -
> -	if (likely(ret >= last))
> -		return ret;
> -
> -	/*
> -	 * GCC likes to generate cmov here, but this branch is extremely
> -	 * predictable (it's just a function of time and the likely is
> -	 * very likely) and there's a data dependence, so force GCC
> -	 * to generate a branch instead.  I don't barrier() because
> -	 * we don't actually need a barrier, and if this function
> -	 * ever gets inlined it will generate worse code.
> -	 */
> -	asm volatile ("");
> -	return last;
> -}
> -
> -static inline u64 vgettsc(u64 *cycle_now)
> -{
> -	long v;
> -	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> -
> -	*cycle_now = read_tsc();
> -
> -	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
> -	return v * gtod->clock.mult;
> -}
> -
> -static int do_monotonic_boot(s64 *t, u64 *cycle_now)
> -{
> -	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> -	unsigned long seq;
> -	int mode;
> -	u64 ns;
> -
> -	do {
> -		seq = read_seqcount_begin(&gtod->seq);
> -		mode = gtod->clock.vclock_mode;
> -		ns = gtod->nsec_base;
> -		ns += vgettsc(cycle_now);
> -		ns >>= gtod->clock.shift;
> -		ns += gtod->boot_ns;
> -	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
> -	*t = ns;
> -
> -	return mode;
> -}
> -
> -static int do_realtime(struct timespec *ts, u64 *cycle_now)
> -{
> -	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> -	unsigned long seq;
> -	int mode;
> -	u64 ns;
> -
> -	do {
> -		seq = read_seqcount_begin(&gtod->seq);
> -		mode = gtod->clock.vclock_mode;
> -		ts->tv_sec = gtod->wall_time_sec;
> -		ns = gtod->nsec_base;
> -		ns += vgettsc(cycle_now);
> -		ns >>= gtod->clock.shift;
> -	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
> -
> -	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
> -	ts->tv_nsec = ns;
> -
> -	return mode;
> -}
> -
>  /* returns true if host is using tsc clocksource */
>  static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
>  {
> @@ -1713,34 +1589,28 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
>   *
>   */
>  
> -static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
> +static void update_masterclock(struct kvm *kvm)
>  {
>  #ifdef CONFIG_X86_64
>  	struct kvm_arch *ka = &kvm->arch;
> -	int vclock_mode;
> -	bool host_tsc_clocksource, vcpus_matched;
> +	bool host_clocksource_stable, vcpus_matched;
>  
>  	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
>  			atomic_read(&kvm->online_vcpus));
>  
>  	/*
> -	 * If the host uses TSC clock, then passthrough TSC as stable
> -	 * to the guest.
> +	 * kvm_get_time_and_clockread returns true if clocksource is stable
>  	 */
> -	host_tsc_clocksource = kvm_get_time_and_clockread(
> +	host_clocksource_stable = kvm_get_time_and_clockread(
>  					&ka->master_kernel_ns,
>  					&ka->master_cycle_now);
>  
> -	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
> +	ka->use_master_clock = host_clocksource_stable && vcpus_matched
>  				&& !ka->backwards_tsc_observed
>  				&& !ka->boot_vcpu_runs_old_kvmclock;
>  
> -	if (ka->use_master_clock)
> -		atomic_set(&kvm_guest_has_master_clock, 1);
> -
> -	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
> -	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
> -					vcpus_matched);
> +	trace_kvm_update_master_clock(ka->use_master_clock,
> +				host_clocksource_stable, vcpus_matched);
>  #endif
>  }
>  
> @@ -1756,10 +1626,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  	struct kvm_vcpu *vcpu;
>  	struct kvm_arch *ka = &kvm->arch;
>  
> -	spin_lock(&ka->pvclock_gtod_sync_lock);
> +	spin_lock(&ka->masterclock_lock);
>  	kvm_make_mclock_inprogress_request(kvm);
>  	/* no guest entries from this point */
> -	pvclock_update_vm_gtod_copy(kvm);
> +	update_masterclock(kvm);
>  
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> @@ -1768,7 +1638,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  	kvm_for_each_vcpu(i, vcpu, kvm)
>  		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
>  
> -	spin_unlock(&ka->pvclock_gtod_sync_lock);
> +	spin_unlock(&ka->masterclock_lock);
>  #endif
>  }
>  
> @@ -1778,15 +1648,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>  	struct pvclock_vcpu_time_info hv_clock;
>  	u64 ret;
>  
> -	spin_lock(&ka->pvclock_gtod_sync_lock);
> +	spin_lock(&ka->masterclock_lock);
>  	if (!ka->use_master_clock) {
> -		spin_unlock(&ka->pvclock_gtod_sync_lock);
> +		spin_unlock(&ka->masterclock_lock);
>  		return ktime_get_boot_ns() + ka->kvmclock_offset;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
>  	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> -	spin_unlock(&ka->pvclock_gtod_sync_lock);
> +	spin_unlock(&ka->masterclock_lock);
>  
>  	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
>  	get_cpu();
> @@ -1872,13 +1742,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  	 * If the host uses TSC clock, then passthrough TSC as stable
>  	 * to the guest.
>  	 */
> -	spin_lock(&ka->pvclock_gtod_sync_lock);
> +	spin_lock(&ka->masterclock_lock);
>  	use_master_clock = ka->use_master_clock;
>  	if (use_master_clock) {
>  		host_tsc = ka->master_cycle_now;
>  		kernel_ns = ka->master_kernel_ns;
>  	}
> -	spin_unlock(&ka->pvclock_gtod_sync_lock);
> +	spin_unlock(&ka->masterclock_lock);
>  
>  	/* Keep irq disabled to prevent changes to the clock */
>  	local_irq_save(flags);
> @@ -4208,11 +4078,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  			goto out;
>  
>  		r = 0;
> -		/*
> -		 * TODO: userspace has to take care of races with VCPU_RUN, so
> -		 * kvm_gen_update_masterclock() can be cut down to locked
> -		 * pvclock_update_vm_gtod_copy().
> -		 */

I have no idea what race this is... do you know?

> +
>  		kvm_gen_update_masterclock(kvm);
>  		now_ns = get_kvmclock_ns(kvm);
>  		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
> @@ -6041,7 +5907,8 @@ static void kvm_set_mmio_spte_mask(void)
>  }
>  
>  #ifdef CONFIG_X86_64
> -static void pvclock_gtod_update_fn(struct work_struct *work)
> +static int process_clocksource_change(struct notifier_block *nb,
> +					unsigned long unused0, void *unused1)
>  {
>  	struct kvm *kvm;
>  
> @@ -6052,35 +5919,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
>  	list_for_each_entry(kvm, &vm_list, vm_list)
>  		kvm_for_each_vcpu(i, vcpu, kvm)
>  			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> -	atomic_set(&kvm_guest_has_master_clock, 0);
>  	spin_unlock(&kvm_lock);
> -}
> -
> -static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
> -
> -/*
> - * Notification about pvclock gtod data update.
> - */
> -static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
> -			       void *priv)
> -{
> -	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
> -	struct timekeeper *tk = priv;
> -
> -	update_pvclock_gtod(tk);
> -
> -	/* disable master clock if host does not trust, or does not
> -	 * use, TSC clocksource
> -	 */
> -	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
> -	    atomic_read(&kvm_guest_has_master_clock) != 0)
> -		queue_work(system_long_wq, &pvclock_gtod_work);

Don't drop this: TSC is required, and switching to another clocksource
must disable the masterclock scheme.
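
For reference, a rough sketch of how the new notifier could keep that
behaviour. host_clocksource_is_tsc() is a made-up helper here (the
clocksource change notification would need to expose that information
somehow), and kvm_guest_has_master_clock is the atomic this patch deletes:

static int process_clocksource_change(struct notifier_block *nb,
				      unsigned long unused0, void *unused1)
{
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	/*
	 * Only force a masterclock re-evaluation when a guest actually
	 * uses it and the host moved away from a TSC-based clocksource;
	 * host_clocksource_is_tsc() is hypothetical.
	 */
	if (host_clocksource_is_tsc() ||
	    atomic_read(&kvm_guest_has_master_clock) == 0)
		return 0;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
	atomic_set(&kvm_guest_has_master_clock, 0);
	spin_unlock(&kvm_lock);

	return 0;
}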

> -
>  	return 0;
>  }
>  
> -static struct notifier_block pvclock_gtod_notifier = {
> -	.notifier_call = pvclock_gtod_notify,
> +static struct notifier_block clocksource_change_notifier = {
> +	.notifier_call = process_clocksource_change,
>  };
>  #endif
>  
> @@ -6133,7 +5977,7 @@ int kvm_arch_init(void *opaque)
>  
>  	kvm_lapic_init();
>  #ifdef CONFIG_X86_64
> -	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
> +	clocksource_changes_register_notifier(&clocksource_change_notifier);
>  #endif
>  
>  	return 0;
> @@ -6154,7 +5998,7 @@ void kvm_arch_exit(void)
>  					    CPUFREQ_TRANSITION_NOTIFIER);
>  	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
>  #ifdef CONFIG_X86_64
> -	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
> +	clocksource_changes_unregister_notifier(&clocksource_change_notifier);
>  #endif
>  	kvm_x86_ops = NULL;
>  	kvm_mmu_module_exit();
> @@ -8056,10 +7900,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>  	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
>  	mutex_init(&kvm->arch.apic_map_lock);
>  	mutex_init(&kvm->arch.hyperv.hv_lock);
> -	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
> +	spin_lock_init(&kvm->arch.masterclock_lock);
>  
>  	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
> -	pvclock_update_vm_gtod_copy(kvm);
> +	update_masterclock(kvm);
>  
>  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
>  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
> -- 
> 2.7.4
