On 02/09/2016 15:52, Roman Kagan wrote: > On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote: >> Introduce a function that reads the exact nanoseconds value that is >> provided to the guest in kvmclock. This crystallizes the notion of >> kvmclock as a thin veneer over a stable TSC, that the guest will >> (hopefully) convert with NTP. In other words, kvmclock is *not* a >> paravirtualized host-to-guest NTP. >> >> Drop the get_kernel_ns() function, that was used both to get the base >> value of the master clock and to get the current value of kvmclock. >> The former use is replaced by ktime_get_boot_ns(), the latter is >> the purpose of get_kvmclock_ns(). >> >> This also allows KVM to provide a Hyper-V time reference counter that >> is synchronized with the time that is computed from the TSC page. >> >> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> >> --- >> arch/x86/entry/vdso/vclock_gettime.c | 2 +- >> arch/x86/include/asm/pvclock.h | 5 ++-- >> arch/x86/kernel/pvclock.c | 2 +- >> arch/x86/kvm/hyperv.c | 2 +- >> arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++--------- >> arch/x86/kvm/x86.h | 6 +---- >> 6 files changed, 43 insertions(+), 22 deletions(-) >> >> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c >> index 94d54d0defa7..02223cb4bcfd 100644 >> --- a/arch/x86/entry/vdso/vclock_gettime.c >> +++ b/arch/x86/entry/vdso/vclock_gettime.c >> @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode) >> return 0; >> } >> >> - ret = __pvclock_read_cycles(pvti); >> + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); >> } while (pvclock_read_retry(pvti, version)); >> >> /* refer to vread_tsc() comment for rationale */ >> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h >> index d019f0cc80ec..3ad741b84072 100644 >> --- a/arch/x86/include/asm/pvclock.h >> +++ b/arch/x86/include/asm/pvclock.h >> @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, 
int shift) >> } >> >> static __always_inline >> -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src) >> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, >> + u64 tsc) >> { >> - u64 delta = rdtsc_ordered() - src->tsc_timestamp; >> + u64 delta = tsc - src->tsc_timestamp; >> cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul, >> src->tsc_shift); >> return src->system_time + offset; >> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c >> index 3599404e3089..5b2cc889ce34 100644 >> --- a/arch/x86/kernel/pvclock.c >> +++ b/arch/x86/kernel/pvclock.c >> @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) >> >> do { >> version = pvclock_read_begin(src); >> - ret = __pvclock_read_cycles(src); >> + ret = __pvclock_read_cycles(src, rdtsc_ordered()); >> flags = src->flags; >> } while (pvclock_read_retry(src, version)); >> > > Perhaps adding an argument to __pvclock_read_cycles deserves a separate > patch, to get timekeeping folks' ack on it? > >> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c >> index 01bd7b7a6866..ed5b77f39ffb 100644 >> --- a/arch/x86/kvm/hyperv.c >> +++ b/arch/x86/kvm/hyperv.c >> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic) >> >> static u64 get_time_ref_counter(struct kvm *kvm) >> { >> - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100); >> + return div_u64(get_kvmclock_ns(kvm), 100); > > Since this does slightly different calculation than the real hyperv tsc > ref page clock is supposed to, I wonder if we are safe WRT precision > errors leading to occasional monotonicity violations? 
The Hyper-V scale is tsc_to_system_mul * 2^(32+tsc_shift) / 100 and the only source of error could be from doing here (tsc * tsc_to_system_mul >> (32-tsc_shift)) / 100 vs tsc * ((tsc_to_system_mul >> (32-tsc_shift)) / 100) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ this is scale / 2^64 in the TSC ref page clock. If my reasoning is correct the error will be at most 100 units of the scale value, which is a relative error of around 1 part per 2^49. Likewise for the offset, the improvement from (tsc - base_tsc) * tsc_to_system_mul >> (32-tsc_shift) + base_system_time vs. using a single offset as in the TSC ref page is one nanosecond---and the ref page only has a resolution of 100 ns. Paolo >> } >> >> static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer, >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index b5853b86b67d..2edcfa054cbe 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -1425,7 +1425,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) >> >> raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); >> offset = kvm_compute_tsc_offset(vcpu, data); >> - ns = get_kernel_ns(); >> + ns = ktime_get_boot_ns(); >> elapsed = ns - kvm->arch.last_tsc_nsec; >> >> if (vcpu->arch.virtual_tsc_khz) { >> @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) >> #endif >> } >> >> +static u64 __get_kvmclock_ns(struct kvm *kvm) >> +{ >> + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0); >> + struct kvm_arch *ka = &kvm->arch; >> + s64 ns; >> + >> + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) { >> + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc()); >> + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc); >> + } else { >> + ns = ktime_get_boot_ns() + ka->kvmclock_offset; >> + } >> + >> + return ns; >> +} >> + >> +u64 get_kvmclock_ns(struct kvm *kvm) >> +{ >> + unsigned long flags; >> + s64 ns; >> + >> + local_irq_save(flags); >> + ns = __get_kvmclock_ns(kvm); >> + local_irq_restore(flags); 
>> + >> + return ns; >> +} >> + >> static void kvm_setup_pvclock_page(struct kvm_vcpu *v) >> { >> struct kvm_vcpu_arch *vcpu = &v->arch; >> @@ -1805,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) >> } >> if (!use_master_clock) { >> host_tsc = rdtsc(); >> - kernel_ns = get_kernel_ns(); >> + kernel_ns = ktime_get_boot_ns(); >> } >> >> tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); >> @@ -4048,7 +4076,6 @@ long kvm_arch_vm_ioctl(struct file *filp, >> case KVM_SET_CLOCK: { >> struct kvm_clock_data user_ns; >> u64 now_ns; >> - s64 delta; >> >> r = -EFAULT; >> if (copy_from_user(&user_ns, argp, sizeof(user_ns))) >> @@ -4060,10 +4087,9 @@ long kvm_arch_vm_ioctl(struct file *filp, >> >> r = 0; >> local_irq_disable(); >> - now_ns = get_kernel_ns(); >> - delta = user_ns.clock - now_ns; >> + now_ns = __get_kvmclock_ns(kvm); >> + kvm->arch.kvmclock_offset += user_ns.clock - now_ns; >> local_irq_enable(); >> - kvm->arch.kvmclock_offset = delta; >> kvm_gen_update_masterclock(kvm); >> break; >> } > > I'm curious why explicit irq disable/enable is left here, unlike the > next hunk where it's moved into get_kvmclock_ns? > >> @@ -4071,10 +4097,8 @@ long kvm_arch_vm_ioctl(struct file *filp, >> struct kvm_clock_data user_ns; >> u64 now_ns; >> >> - local_irq_disable(); >> - now_ns = get_kernel_ns(); >> - user_ns.clock = kvm->arch.kvmclock_offset + now_ns; >> - local_irq_enable(); >> + now_ns = get_kvmclock_ns(kvm); >> + user_ns.clock = now_ns; > > Nitpick: now_ns is even less useful now than it was before the patch. > >> user_ns.flags = 0; >> memset(&user_ns.pad, 0, sizeof(user_ns.pad)); >> >> @@ -7539,7 +7563,7 @@ int kvm_arch_hardware_enable(void) >> * before any KVM threads can be running. 
Unfortunately, we can't >> * bring the TSCs fully up to date with real time, as we aren't yet far >> * enough into CPU bringup that we know how much real time has actually >> - * elapsed; our helper function, get_kernel_ns() will be using boot >> + * elapsed; our helper function, ktime_get_boot_ns() will be using boot >> * variables that haven't been updated yet. >> * >> * So we simply find the maximum observed TSC above, then record the >> @@ -7774,7 +7798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) >> mutex_init(&kvm->arch.apic_map_lock); >> spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); >> >> - kvm->arch.kvmclock_offset = -get_kernel_ns(); >> + kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); >> pvclock_update_vm_gtod_copy(kvm); >> >> INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); >> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h >> index a82ca466b62e..e8ff3e4ce38a 100644 >> --- a/arch/x86/kvm/x86.h >> +++ b/arch/x86/kvm/x86.h >> @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu, >> return kvm_register_write(vcpu, reg, val); >> } >> >> -static inline u64 get_kernel_ns(void) >> -{ >> - return ktime_get_boot_ns(); >> -} >> - >> static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) >> { >> return !(kvm->arch.disabled_quirks & quirk); >> @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu); >> int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); >> >> void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); >> +u64 get_kvmclock_ns(struct kvm *kvm); >> >> int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, >> gva_t addr, void *val, unsigned int bytes, >> -- >> 1.8.3.1 >> >> -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html