On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote: > Introduce a function that reads the exact nanoseconds value that is > provided to the guest in kvmclock. This crystallizes the notion of > kvmclock as a thin veneer over a stable TSC, that the guest will > (hopefully) convert with NTP. In other words, kvmclock is *not* a > paravirtualized host-to-guest NTP. > > Drop the get_kernel_ns() function, that was used both to get the base > value of the master clock and to get the current value of kvmclock. > The former use is replaced by ktime_get_boot_ns(), the latter is > the purpose of get_kvmclock_ns(). > > This also allows KVM to provide a Hyper-V time reference counter that > is synchronized with the time that is computed from the TSC page. > > Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> > --- > arch/x86/entry/vdso/vclock_gettime.c | 2 +- > arch/x86/include/asm/pvclock.h | 5 ++-- > arch/x86/kernel/pvclock.c | 2 +- > arch/x86/kvm/hyperv.c | 2 +- > arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++--------- > arch/x86/kvm/x86.h | 6 +---- > 6 files changed, 43 insertions(+), 22 deletions(-) > > diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c > index 94d54d0defa7..02223cb4bcfd 100644 > --- a/arch/x86/entry/vdso/vclock_gettime.c > +++ b/arch/x86/entry/vdso/vclock_gettime.c > @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) > return 0; > } > > - ret = __pvclock_read_cycles(pvti); > + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); > } while (pvclock_read_retry(pvti, version)); > > /* refer to vread_tsc() comment for rationale */ > diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h > index d019f0cc80ec..3ad741b84072 100644 > --- a/arch/x86/include/asm/pvclock.h > +++ b/arch/x86/include/asm/pvclock.h > @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) > } > > static __always_inline > -cycle_t __pvclock_read_cycles(const 
struct pvclock_vcpu_time_info *src) > +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, > + u64 tsc) > { > - u64 delta = rdtsc_ordered() - src->tsc_timestamp; > + u64 delta = tsc - src->tsc_timestamp; > cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, > src->tsc_shift); > return src->system_time + offset; > diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c > index 3599404e3089..5b2cc889ce34 100644 > --- a/arch/x86/kernel/pvclock.c > +++ b/arch/x86/kernel/pvclock.c > @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) > > do { > version = pvclock_read_begin(src); > - ret = __pvclock_read_cycles(src); > + ret = __pvclock_read_cycles(src, rdtsc_ordered()); > flags = src->flags; > } while (pvclock_read_retry(src, version)); > Perhaps adding an argument to __pvclock_read_cycles deserves a separate patch, to get timekeeping folks' ack on it? > diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c > index 01bd7b7a6866..ed5b77f39ffb 100644 > --- a/arch/x86/kvm/hyperv.c > +++ b/arch/x86/kvm/hyperv.c > @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) > > static u64 get_time_ref_counter(struct kvm *kvm) > { > - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); > + return div_u64(get_kvmclock_ns(kvm), 100); Since this does slightly different calculation than the real hyperv tsc ref page clock is supposed to, I wonder if we are safe WRT precision errors leading to occasional monotonicity violations? 
> } > > static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index b5853b86b67d..2edcfa054cbe 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1425,7 +1425,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) > > raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); > offset = kvm_compute_tsc_offset(vcpu, data); > - ns = get_kernel_ns(); > + ns = ktime_get_boot_ns(); > elapsed = ns - kvm->arch.last_tsc_nsec; > > if (vcpu->arch.virtual_tsc_khz) { > @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > #endif > } > > +static u64 __get_kvmclock_ns(struct kvm *kvm) > +{ > + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); > + struct kvm_arch *ka = &kvm->arch; > + s64 ns; > + > + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { > + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); > + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); > + } else { > + ns = ktime_get_boot_ns() + ka->kvmclock_offset; > + } > + > + return ns; > +} > + > +u64 get_kvmclock_ns(struct kvm *kvm) > +{ > + unsigned long flags; > + s64 ns; > + > + local_irq_save(flags); > + ns = __get_kvmclock_ns(kvm); > + local_irq_restore(flags); > + > + return ns; > +} > + > static void kvm_setup_pvclock_page(struct kvm_vcpu *v) > { > struct kvm_vcpu_arch *vcpu = &v->arch; > @@ -1805,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) > } > if (!use_master_clock) { > host_tsc = rdtsc(); > - kernel_ns = get_kernel_ns(); > + kernel_ns = ktime_get_boot_ns(); > } > > tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); > @@ -4048,7 +4076,6 @@ long kvm_arch_vm_ioctl(struct file *filp, > case KVM_SET_CLOCK: { > struct kvm_clock_data user_ns; > u64 now_ns; > - s64 delta; > > r = -EFAULT; > if (copy_from_user(&user_ns, argp, sizeof(user_ns))) > @@ -4060,10 +4087,9 @@ long kvm_arch_vm_ioctl(struct file *filp, > > r = 0; > local_irq_disable(); > - now_ns = get_kernel_ns(); > - 
delta = user_ns.clock - now_ns; > + now_ns = __get_kvmclock_ns(kvm); > + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; > local_irq_enable(); > - kvm->arch.kvmclock_offset = delta; > kvm_gen_update_masterclock(kvm); > break; > } I'm curious why explicit irq disable/enable is left here, unlike the next hunk where it's moved into get_kvmclock_ns? > @@ -4071,10 +4097,8 @@ long kvm_arch_vm_ioctl(struct file *filp, > struct kvm_clock_data user_ns; > u64 now_ns; > > - local_irq_disable(); > - now_ns = get_kernel_ns(); > - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; > - local_irq_enable(); > + now_ns = get_kvmclock_ns(kvm); > + user_ns.clock = now_ns; Nitpick: now_ns is even less useful now than it was before the patch. > user_ns.flags = 0; > memset(&user_ns.pad, 0, sizeof(user_ns.pad)); > > @@ -7539,7 +7563,7 @@ int kvm_arch_hardware_enable(void) > * before any KVM threads can be running. Unfortunately, we can't > * bring the TSCs fully up to date with real time, as we aren't yet far > * enough into CPU bringup that we know how much real time has actually > - * elapsed; our helper function, get_kernel_ns() will be using boot > + * elapsed; our helper function, ktime_get_boot_ns() will be using boot > * variables that haven't been updated yet. 
> * > * So we simply find the maximum observed TSC above, then record the > @@ -7774,7 +7798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) > mutex_init(&kvm->arch.apic_map_lock); > spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); > > - kvm->arch.kvmclock_offset = -get_kernel_ns(); > + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); > pvclock_update_vm_gtod_copy(kvm); > > INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h > index a82ca466b62e..e8ff3e4ce38a 100644 > --- a/arch/x86/kvm/x86.h > +++ b/arch/x86/kvm/x86.h > @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, > return kvm_register_write(vcpu, reg, val); > } > > -static inline u64 get_kernel_ns(void) > -{ > - return ktime_get_boot_ns(); > -} > - > static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) > { > return !(kvm->arch.disabled_quirks & quirk); > @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); > int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); > > void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); > +u64 get_kvmclock_ns(struct kvm *kvm); > > int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, > gva_t addr, void *val, unsigned int bytes, > -- > 1.8.3.1 > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html