On 02/09/2016 15:52, Roman Kagan wrote: > On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote: >> Introduce a function that reads the exact nanoseconds value that is >> provided to the guest in kvmclock. This crystallizes the notion of >> kvmclock as a thin veneer over a stable TSC, that the guest will >> (hopefully) convert with NTP. In other words, kvmclock is *not* a >> paravirtualized host-to-guest NTP. >> >> Drop the get_kernel_ns() function, that was used both to get the base >> value of the master clock and to get the current value of kvmclock. >> The former use is replaced by ktime_get_boot_ns(), the latter is >> the purpose of get_kvmclock_ns(). >> >> This also allows KVM to provide a Hyper-V time reference counter that >> is synchronized with the time that is computed from the TSC page. >> >> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> >> --- >> arch/x86/entry/vdso/vclock_gettime.c | 2 +- >> arch/x86/include/asm/pvclock.h | 5 ++-- >> arch/x86/kernel/pvclock.c | 2 +- >> arch/x86/kvm/hyperv.c | 2 +- >> arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++--------- >> arch/x86/kvm/x86.h | 6 +---- >> 6 files changed, 43 insertions(+), 22 deletions(-) >> >> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c >> index 94d54d0defa7..02223cb4bcfd 100644 >> --- a/arch/x86/entry/vdso/vclock_gettime.c >> +++ b/arch/x86/entry/vdso/vclock_gettime.c >> @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) >> return 0; >> } >> >> - ret = __pvclock_read_cycles(pvti); >> + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); >> } while (pvclock_read_retry(pvti, version)); >> >> /* refer to vread_tsc() comment for rationale */ >> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h >> index d019f0cc80ec..3ad741b84072 100644 >> --- a/arch/x86/include/asm/pvclock.h >> +++ b/arch/x86/include/asm/pvclock.h >> @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, 
int shift) >> } >> >> static __always_inline >> -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) >> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, >> + u64 tsc) >> { >> - u64 delta = rdtsc_ordered() - src->tsc_timestamp; >> + u64 delta = tsc - src->tsc_timestamp; >> cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, >> src->tsc_shift); >> return src->system_time + offset; >> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c >> index 3599404e3089..5b2cc889ce34 100644 >> --- a/arch/x86/kernel/pvclock.c >> +++ b/arch/x86/kernel/pvclock.c >> @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) >> >> do { >> version = pvclock_read_begin(src); >> - ret = __pvclock_read_cycles(src); >> + ret = __pvclock_read_cycles(src, rdtsc_ordered()); >> flags = src->flags; >> } while (pvclock_read_retry(src, version)); >> > > Perhaps adding an argument to __pvclock_read_cycles deserves a separate > patch, to get timekeeping folks' ack on it? > >> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c >> index 01bd7b7a6866..ed5b77f39ffb 100644 >> --- a/arch/x86/kvm/hyperv.c >> +++ b/arch/x86/kvm/hyperv.c >> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) >> >> static u64 get_time_ref_counter(struct kvm *kvm) >> { >> - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); >> + return div_u64(get_kvmclock_ns(kvm), 100); > > Since this does slightly different calculation than the real hyperv tsc > ref page clock is supposed to, I wonder if we are safe WRT precision > errors leading to occasional monotonicity violations? 
The Hyper-V scale is tsc_to_system_mul * 2^(32+tsc_shift) / 100 and the only source of error could be from doing here (tsc * tsc_to_system_mul >> (32-tsc_shift)) / 100 vs tsc * ((tsc_to_system_mul >> (32-tsc_shift)) / 100) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ this is scale / 2^64 in the TSC ref page clock. If my reasoning is correct the error will be at most 100 units of the scale value, which is a relative error of around 1 part per 2^49. Likewise for the offset, the improvement from (tsc - base_tsc) * tsc_to_system_mul >> (32-tsc_shift) + base_system_time vs. using a single offset as in the TSC ref page is one nanosecond---and the ref page only has a resolution of 100 ns. Paolo >> } >> >> static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index b5853b86b67d..2edcfa054cbe 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -1425,7 +1425,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) >> >> raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); >> offset = kvm_compute_tsc_offset(vcpu, data); >> - ns = get_kernel_ns(); >> + ns = ktime_get_boot_ns(); >> elapsed = ns - kvm->arch.last_tsc_nsec; >> >> if (vcpu->arch.virtual_tsc_khz) { >> @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) >> #endif >> } >> >> +static u64 __get_kvmclock_ns(struct kvm *kvm) >> +{ >> + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); >> + struct kvm_arch *ka = &kvm->arch; >> + s64 ns; >> + >> + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { >> + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); >> + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); >> + } else { >> + ns = ktime_get_boot_ns() + ka->kvmclock_offset; >> + } >> + >> + return ns; >> +} >> + >> +u64 get_kvmclock_ns(struct kvm *kvm) >> +{ >> + unsigned long flags; >> + s64 ns; >> + >> + local_irq_save(flags); >> + ns = __get_kvmclock_ns(kvm); >> + local_irq_restore(flags); 
>> + >> + return ns; >> +} >> + >> static void kvm_setup_pvclock_page(struct kvm_vcpu *v) >> { >> struct kvm_vcpu_arch *vcpu = &v->arch; >> @@ -1805,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) >> } >> if (!use_master_clock) { >> host_tsc = rdtsc(); >> - kernel_ns = get_kernel_ns(); >> + kernel_ns = ktime_get_boot_ns(); >> } >> >> tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); >> @@ -4048,7 +4076,6 @@ long kvm_arch_vm_ioctl(struct file *filp, >> case KVM_SET_CLOCK: { >> struct kvm_clock_data user_ns; >> u64 now_ns; >> - s64 delta; >> >> r = -EFAULT; >> if (copy_from_user(&user_ns, argp, sizeof(user_ns))) >> @@ -4060,10 +4087,9 @@ long kvm_arch_vm_ioctl(struct file *filp, >> >> r = 0; >> local_irq_disable(); >> - now_ns = get_kernel_ns(); >> - delta = user_ns.clock - now_ns; >> + now_ns = __get_kvmclock_ns(kvm); >> + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; >> local_irq_enable(); >> - kvm->arch.kvmclock_offset = delta; >> kvm_gen_update_masterclock(kvm); >> break; >> } > > I'm curious why explicit irq disable/enable is left here, unlike the > next hunk where it's moved into get_kvmclock_ns? > >> @@ -4071,10 +4097,8 @@ long kvm_arch_vm_ioctl(struct file *filp, >> struct kvm_clock_data user_ns; >> u64 now_ns; >> >> - local_irq_disable(); >> - now_ns = get_kernel_ns(); >> - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; >> - local_irq_enable(); >> + now_ns = get_kvmclock_ns(kvm); >> + user_ns.clock = now_ns; > > Nitpick: now_ns is even less useful now than it was before the patch. > >> user_ns.flags = 0; >> memset(&user_ns.pad, 0, sizeof(user_ns.pad)); >> >> @@ -7539,7 +7563,7 @@ int kvm_arch_hardware_enable(void) >> * before any KVM threads can be running. 
Unfortunately, we can't >> * bring the TSCs fully up to date with real time, as we aren't yet far >> * enough into CPU bringup that we know how much real time has actually >> - * elapsed; our helper function, get_kernel_ns() will be using boot >> + * elapsed; our helper function, ktime_get_boot_ns() will be using boot >> * variables that haven't been updated yet. >> * >> * So we simply find the maximum observed TSC above, then record the >> @@ -7774,7 +7798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) >> mutex_init(&kvm->arch.apic_map_lock); >> spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); >> >> - kvm->arch.kvmclock_offset = -get_kernel_ns(); >> + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); >> pvclock_update_vm_gtod_copy(kvm); >> >> INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); >> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h >> index a82ca466b62e..e8ff3e4ce38a 100644 >> --- a/arch/x86/kvm/x86.h >> +++ b/arch/x86/kvm/x86.h >> @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, >> return kvm_register_write(vcpu, reg, val); >> } >> >> -static inline u64 get_kernel_ns(void) >> -{ >> - return ktime_get_boot_ns(); >> -} >> - >> static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) >> { >> return !(kvm->arch.disabled_quirks & quirk); >> @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); >> int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); >> >> void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); >> +u64 get_kvmclock_ns(struct kvm *kvm); >> >> int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, >> gva_t addr, void *val, unsigned int bytes, >> -- >> 1.8.3.1 >> >> -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html