On Fri, Sep 18, 2015 at 05:54:29PM +0200, Radim Krčmář wrote:
> Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore.
> The purpose of that flag was to start counting system time from 0 when
> the KVM clock has been initialized.
> We can achieve the same by selecting one read as the initial point.
>
> A simple subtraction will work unless the KVM clock count overflows
> earlier (has smaller width) than the scheduler's cycle count.  We should
> be safe till x86_128.
>
> Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors,
> setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might
> regress on older ones.
>
> I presume we don't need to change kvm_clock_read instead of introducing
> kvm_sched_clock_read.  A problem could arise in case sched_clock is
> expected to return the same value as get_cycles, but we should have
> merged those clocks in that case.
>
> Signed-off-by: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> ---
>  arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 35 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
> index 2c7aafa70702..ef5b3d2cecce 100644
> --- a/arch/x86/kernel/kvmclock.c
> +++ b/arch/x86/kernel/kvmclock.c
> @@ -32,6 +32,7 @@
>  static int kvmclock = 1;
>  static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
>  static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
> +static cycle_t kvm_sched_clock_offset;
>
>  static int parse_no_kvmclock(char *arg)
>  {
> @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
>  	return kvm_clock_read();
>  }
>
> +static cycle_t kvm_sched_clock_read(void)
> +{
> +	return kvm_clock_read() - kvm_sched_clock_offset;
> +}
> +
> +static inline void kvm_sched_clock_init(bool stable)
> +{
> +	if (!stable) {
> +		pv_time_ops.sched_clock = kvm_clock_read;
> +		return;
> +	}
> +
> +	kvm_sched_clock_offset = kvm_clock_read();
> +	pv_time_ops.sched_clock = kvm_sched_clock_read;
> +	set_sched_clock_stable();
> +
> +	printk("kvm-clock: using sched offset of %llu cycles\n",
> +	       kvm_sched_clock_offset);
> +
> +	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
> +		sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
> +}
> +
>  /*
>   * If we don't do that, there is the possibility that the guest
>   * will calibrate under heavy load - thus, getting a lower lpj -
> @@ -248,7 +272,17 @@ void __init kvmclock_init(void)
>  		memblock_free(mem, size);
>  		return;
>  	}
> -	pv_time_ops.sched_clock = kvm_clock_read;
> +
> +	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
> +		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
> +
> +	cpu = get_cpu();
> +	vcpu_time = &hv_clock[cpu].pvti;
> +	flags = pvclock_read_flags(vcpu_time);
> +
> +	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
> +	put_cpu();
> +
>  	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
>  	x86_platform.get_wallclock = kvm_get_wallclock;
>  	x86_platform.set_wallclock = kvm_set_wallclock;
> @@ -265,16 +299,6 @@ void __init kvmclock_init(void)
>  	kvm_get_preset_lpj();
>  	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
>  	pv_info.name = "KVM";
> -
> -	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
> -		pvclock_set_flags(~0);
> -
> -	cpu = get_cpu();
> -	vcpu_time = &hv_clock[cpu].pvti;
> -	flags = pvclock_read_flags(vcpu_time);
> -	if (flags & PVCLOCK_COUNTS_FROM_ZERO)
> -		set_sched_clock_stable();
> -	put_cpu();
>  }
>
>  int __init kvm_setup_vsyscall_timeinfo(void)
> --
> 2.5.2

ACK
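
A side note for anyone following the sched clock part: the change boils down
to recording one kvm_clock_read() value at init and subtracting it from every
later read.  Below is a minimal user-space sketch of that pattern, not the
kernel code; sample_clock_read() just wraps CLOCK_MONOTONIC as a stand-in for
kvm_clock_read(), and all names in it are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t sched_clock_offset;

/* Stand-in for kvm_clock_read(): a free-running nanosecond counter. */
static uint64_t sample_clock_read(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Analogue of kvm_sched_clock_read(): count elapsed since init. */
static uint64_t sample_sched_clock_read(void)
{
	return sample_clock_read() - sched_clock_offset;
}

int main(void)
{
	/* Analogue of kvm_sched_clock_init(): pick the zero point once. */
	sched_clock_offset = sample_clock_read();

	/*
	 * The unsigned subtraction still yields the elapsed count after
	 * the counter wraps; it only breaks once the elapsed count itself
	 * no longer fits the counter's width, which is the overflow caveat
	 * in the changelog ("safe till x86_128").
	 */
	printf("sched clock now: %llu ns since init\n",
	       (unsigned long long)sample_sched_clock_read());

	return 0;
}

The kvm_sched_clock_init()/kvm_sched_clock_read() pair in the patch is the
same pattern, with pv_time_ops.sched_clock switched to the offset variant
only when PVCLOCK_TSC_STABLE_BIT is set.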