Hi Denis, I'm all for this as well, the original submission suggested something similar, someone said "use a scheme similar to vsyscalls", therefore the internal copy of the fields. More comments below. On Wed, Aug 02, 2017 at 05:38:07PM +0300, Denis Plotnikov wrote: > Since, KVM has been switched to getting masterclock related data > right from the timekeeper by the previous patches, now we are able > to remove all the parts related to the old scheme of getting > masterclock data. > > This patch removes those parts. > > Signed-off-by: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx> > --- > arch/x86/include/asm/kvm_host.h | 2 +- > arch/x86/kvm/trace.h | 31 ++---- > arch/x86/kvm/x86.c | 216 ++++++---------------------------------- > 3 files changed, 42 insertions(+), 207 deletions(-) > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > index 87ac4fb..91465db 100644 > --- a/arch/x86/include/asm/kvm_host.h > +++ b/arch/x86/include/asm/kvm_host.h > @@ -791,7 +791,7 @@ struct kvm_arch { > u64 cur_tsc_generation; > int nr_vcpus_matched_tsc; > > - spinlock_t pvclock_gtod_sync_lock; > + spinlock_t masterclock_lock; > bool use_master_clock; > u64 master_kernel_ns; > u64 master_cycle_now; > diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h > index 0a6cc67..923ab31 100644 > --- a/arch/x86/kvm/trace.h > +++ b/arch/x86/kvm/trace.h > @@ -807,45 +807,39 @@ TRACE_EVENT(kvm_write_tsc_offset, > > #ifdef CONFIG_X86_64 > > -#define host_clocks \ > - {VCLOCK_NONE, "none"}, \ > - {VCLOCK_TSC, "tsc"} \ > - > TRACE_EVENT(kvm_update_master_clock, > - TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), > - TP_ARGS(use_master_clock, host_clock, offset_matched), > + TP_PROTO(bool use_master_clock, bool host_clock_stable, > + bool offset_matched), > + TP_ARGS(use_master_clock, host_clock_stable, offset_matched), > > TP_STRUCT__entry( > __field( bool, use_master_clock ) > - __field( unsigned int, host_clock ) > + __field( bool, host_clock_stable ) > __field( bool, offset_matched ) > ), > > TP_fast_assign( > __entry->use_master_clock = use_master_clock; > - __entry->host_clock = host_clock; > + __entry->host_clock_stable = host_clock_stable; > __entry->offset_matched = offset_matched; > ), > > - TP_printk("masterclock %d hostclock %s offsetmatched %u", > + TP_printk("masterclock %d hostclock stable %u offsetmatched %u", > __entry->use_master_clock, > - __print_symbolic(__entry->host_clock, host_clocks), > + __entry->host_clock_stable, > __entry->offset_matched) > ); > > TRACE_EVENT(kvm_track_tsc, > TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, > - unsigned int online_vcpus, bool use_master_clock, > - unsigned int host_clock), > - TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, > - host_clock), > + unsigned int online_vcpus, bool use_master_clock), > + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock), > > TP_STRUCT__entry( > __field( unsigned int, vcpu_id ) > __field( unsigned int, nr_vcpus_matched_tsc ) > __field( unsigned int, online_vcpus ) > __field( bool, use_master_clock ) > - __field( unsigned int, host_clock ) > ), > > TP_fast_assign( > @@ -853,14 +847,11 @@ TRACE_EVENT(kvm_track_tsc, > __entry->nr_vcpus_matched_tsc = nr_matched; > __entry->online_vcpus = online_vcpus; > __entry->use_master_clock = use_master_clock; > - __entry->host_clock = host_clock; > ), > > - TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" > - " hostclock %s", > + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u", > __entry->vcpu_id, __entry->use_master_clock, > - __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, > - __print_symbolic(__entry->host_clock, host_clocks)) > + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus) > ); > > #endif /* CONFIG_X86_64 */ > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index d8ec2ca..53754fa 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -50,7 +50,7 @@ > #include <linux/hash.h> > #include <linux/pci.h> > #include <linux/timekeeper_internal.h> > -#include <linux/pvclock_gtod.h> > +#include <linux/cs_notifier.h> > #include <linux/kvm_irqfd.h> > #include <linux/irqbypass.h> > #include <linux/sched/stat.h> > @@ -1134,50 +1134,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) > return kvm_set_msr(vcpu, &msr); > } > > -#ifdef CONFIG_X86_64 > -struct pvclock_gtod_data { > - seqcount_t seq; > - > - struct { /* extract of a clocksource struct */ > - int vclock_mode; > - u64 cycle_last; > - u64 mask; > - u32 mult; > - u32 shift; > - } clock; > - > - u64 boot_ns; > - u64 nsec_base; > - u64 wall_time_sec; > -}; > - > -static struct pvclock_gtod_data pvclock_gtod_data; > - > -static void update_pvclock_gtod(struct timekeeper *tk) > -{ > - struct pvclock_gtod_data *vdata = &pvclock_gtod_data; > - u64 boot_ns; > - > - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); > - > - write_seqcount_begin(&vdata->seq); > - > - /* copy pvclock gtod data */ > - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; > - vdata->clock.cycle_last = tk->tkr_mono.cycle_last; > - vdata->clock.mask = tk->tkr_mono.mask; > - vdata->clock.mult = tk->tkr_mono.mult; > - vdata->clock.shift = tk->tkr_mono.shift; > - > - vdata->boot_ns = boot_ns; > - vdata->nsec_base = tk->tkr_mono.xtime_nsec; > - > - vdata->wall_time_sec = tk->xtime_sec; > - > - write_seqcount_end(&vdata->seq); > -} > -#endif > - > void kvm_set_pending_timer(struct kvm_vcpu *vcpu) > { > /* > @@ -1269,10 +1225,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, > __func__, base_hz, scaled_hz, shift, *pmultiplier); > } > > -#ifdef CONFIG_X86_64 > -static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); > -#endif > - > static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); > static unsigned long max_tsc_khz; > > @@ -1366,7 +1318,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) > #ifdef CONFIG_X86_64 > bool vcpus_matched; > struct kvm_arch *ka = &vcpu->kvm->arch; > - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; > > vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == > atomic_read(&vcpu->kvm->online_vcpus)); > @@ -1379,13 +1330,12 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) > * and the vcpus need to have matched TSCs. When that happens, > * perform request to enable masterclock. > */ > - if (ka->use_master_clock || > - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) > + if (ka->use_master_clock || vcpus_matched) > kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); Don't drop this. The masterclock scheme requires TSC for proper functioning (or an analysis why its supposed with different HPET+TSC, for example). > > trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, > - atomic_read(&vcpu->kvm->online_vcpus), > - ka->use_master_clock, gtod->clock.vclock_mode); > + atomic_read(&vcpu->kvm->online_vcpus), > + ka->use_master_clock); > #endif > } > > @@ -1538,7 +1488,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) > kvm_vcpu_write_tsc_offset(vcpu, offset); > raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); > > - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); > + spin_lock(&kvm->arch.masterclock_lock); > if (!matched) { > kvm->arch.nr_vcpus_matched_tsc = 0; > } else if (!already_matched) { > @@ -1546,9 +1496,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) > } > > kvm_track_tsc_matching(vcpu); > - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); > + spin_unlock(&kvm->arch.masterclock_lock); > } > - > EXPORT_SYMBOL_GPL(kvm_write_tsc); > > static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, > @@ -1567,79 +1516,6 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) > > #ifdef CONFIG_X86_64 > > -static u64 read_tsc(void) > -{ > - u64 ret = (u64)rdtsc_ordered(); > - u64 last = pvclock_gtod_data.clock.cycle_last; > - > - if (likely(ret >= last)) > - return ret; > - > - /* > - * GCC likes to generate cmov here, but this branch is extremely > - * predictable (it's just a function of time and the likely is > - * very likely) and there's a data dependence, so force GCC > - * to generate a branch instead. I don't barrier() because > - * we don't actually need a barrier, and if this function > - * ever gets inlined it will generate worse code. > - */ > - asm volatile (""); > - return last; > -} > - > -static inline u64 vgettsc(u64 *cycle_now) > -{ > - long v; > - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; > - > - *cycle_now = read_tsc(); > - > - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; > - return v * gtod->clock.mult; > -} > - > -static int do_monotonic_boot(s64 *t, u64 *cycle_now) > -{ > - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; > - unsigned long seq; > - int mode; > - u64 ns; > - > - do { > - seq = read_seqcount_begin(>od->seq); > - mode = gtod->clock.vclock_mode; > - ns = gtod->nsec_base; > - ns += vgettsc(cycle_now); > - ns >>= gtod->clock.shift; > - ns += gtod->boot_ns; > - } while (unlikely(read_seqcount_retry(>od->seq, seq))); > - *t = ns; > - > - return mode; > -} > - > -static int do_realtime(struct timespec *ts, u64 *cycle_now) > -{ > - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; > - unsigned long seq; > - int mode; > - u64 ns; > - > - do { > - seq = read_seqcount_begin(>od->seq); > - mode = gtod->clock.vclock_mode; > - ts->tv_sec = gtod->wall_time_sec; > - ns = gtod->nsec_base; > - ns += vgettsc(cycle_now); > - ns >>= gtod->clock.shift; > - } while (unlikely(read_seqcount_retry(>od->seq, seq))); > - > - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); > - ts->tv_nsec = ns; > - > - return mode; > -} > - > /* returns true if host is using tsc clocksource */ > static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) > { > @@ -1713,34 +1589,28 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts, > * > */ > > -static void pvclock_update_vm_gtod_copy(struct kvm *kvm) > +static void update_masterclock(struct kvm *kvm) > { > #ifdef CONFIG_X86_64 > struct kvm_arch *ka = &kvm->arch; > - int vclock_mode; > - bool host_tsc_clocksource, vcpus_matched; > + bool host_clocksource_stable, vcpus_matched; > > vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == > atomic_read(&kvm->online_vcpus)); > > /* > - * If the host uses TSC clock, then passthrough TSC as stable > - * to the guest. > + * kvm_get_time_and_clockread returns true if clocksource is stable > */ > - host_tsc_clocksource = kvm_get_time_and_clockread( > + host_clocksource_stable = kvm_get_time_and_clockread( > &ka->master_kernel_ns, > &ka->master_cycle_now); > > - ka->use_master_clock = host_tsc_clocksource && vcpus_matched > + ka->use_master_clock = host_clocksource_stable && vcpus_matched > && !ka->backwards_tsc_observed > && !ka->boot_vcpu_runs_old_kvmclock; > > - if (ka->use_master_clock) > - atomic_set(&kvm_guest_has_master_clock, 1); > - > - vclock_mode = pvclock_gtod_data.clock.vclock_mode; > - trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, > - vcpus_matched); > + trace_kvm_update_master_clock(ka->use_master_clock, > + host_clocksource_stable, vcpus_matched); > #endif > } > > @@ -1756,10 +1626,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > struct kvm_vcpu *vcpu; > struct kvm_arch *ka = &kvm->arch; > > - spin_lock(&ka->pvclock_gtod_sync_lock); > + spin_lock(&ka->masterclock_lock); > kvm_make_mclock_inprogress_request(kvm); > /* no guest entries from this point */ > - pvclock_update_vm_gtod_copy(kvm); > + update_masterclock(kvm); > > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); > @@ -1768,7 +1638,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); > > - spin_unlock(&ka->pvclock_gtod_sync_lock); > + spin_unlock(&ka->masterclock_lock); > #endif > } > > @@ -1778,15 +1648,15 @@ u64 get_kvmclock_ns(struct kvm *kvm) > struct pvclock_vcpu_time_info hv_clock; > u64 ret; > > - spin_lock(&ka->pvclock_gtod_sync_lock); > + spin_lock(&ka->masterclock_lock); > if (!ka->use_master_clock) { > - spin_unlock(&ka->pvclock_gtod_sync_lock); > + spin_unlock(&ka->masterclock_lock); > return ktime_get_boot_ns() + ka->kvmclock_offset; > } > > hv_clock.tsc_timestamp = ka->master_cycle_now; > hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; > - spin_unlock(&ka->pvclock_gtod_sync_lock); > + spin_unlock(&ka->masterclock_lock); > > /* both __this_cpu_read() and rdtsc() should be on the same cpu */ > get_cpu(); > @@ -1872,13 +1742,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) > * If the host uses TSC clock, then passthrough TSC as stable > * to the guest. > */ > - spin_lock(&ka->pvclock_gtod_sync_lock); > + spin_lock(&ka->masterclock_lock); > use_master_clock = ka->use_master_clock; > if (use_master_clock) { > host_tsc = ka->master_cycle_now; > kernel_ns = ka->master_kernel_ns; > } > - spin_unlock(&ka->pvclock_gtod_sync_lock); > + spin_unlock(&ka->masterclock_lock); > > /* Keep irq disabled to prevent changes to the clock */ > local_irq_save(flags); > @@ -4208,11 +4078,7 @@ long kvm_arch_vm_ioctl(struct file *filp, > goto out; > > r = 0; > - /* > - * TODO: userspace has to take care of races with VCPU_RUN, so > - * kvm_gen_update_masterclock() can be cut down to locked > - * pvclock_update_vm_gtod_copy(). > - */ I have no idea what race is this.. do you know? > + > kvm_gen_update_masterclock(kvm); > now_ns = get_kvmclock_ns(kvm); > kvm->arch.kvmclock_offset += user_ns.clock - now_ns; > @@ -6041,7 +5907,8 @@ static void kvm_set_mmio_spte_mask(void) > } > > #ifdef CONFIG_X86_64 > -static void pvclock_gtod_update_fn(struct work_struct *work) > +static int process_clocksource_change(struct notifier_block *nb, > + unsigned long unused0, void *unused1) > { > struct kvm *kvm; > > @@ -6052,35 +5919,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) > list_for_each_entry(kvm, &vm_list, vm_list) > kvm_for_each_vcpu(i, vcpu, kvm) > kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); > - atomic_set(&kvm_guest_has_master_clock, 0); > spin_unlock(&kvm_lock); > -} > - > -static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); > - > -/* > - * Notification about pvclock gtod data update. > - */ > -static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, > - void *priv) > -{ > - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; > - struct timekeeper *tk = priv; > - > - update_pvclock_gtod(tk); > - > - /* disable master clock if host does not trust, or does not > - * use, TSC clocksource > - */ > - if (gtod->clock.vclock_mode != VCLOCK_TSC && > - atomic_read(&kvm_guest_has_master_clock) != 0) > - queue_work(system_long_wq, &pvclock_gtod_work); Don't drop this: TSC is required, and switching to another clock must disable masterclock scheme. > - > return 0; > } > > -static struct notifier_block pvclock_gtod_notifier = { > - .notifier_call = pvclock_gtod_notify, > +static struct notifier_block clocksource_change_notifier = { > + .notifier_call = process_clocksource_change, > }; > #endif > > @@ -6133,7 +5977,7 @@ int kvm_arch_init(void *opaque) > > kvm_lapic_init(); > #ifdef CONFIG_X86_64 > - pvclock_gtod_register_notifier(&pvclock_gtod_notifier); > + clocksource_changes_register_notifier(&clocksource_change_notifier); > #endif > > return 0; > @@ -6154,7 +5998,7 @@ void kvm_arch_exit(void) > CPUFREQ_TRANSITION_NOTIFIER); > cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); > #ifdef CONFIG_X86_64 > - pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); > + clocksource_changes_unregister_notifier(&clocksource_change_notifier); > #endif > kvm_x86_ops = NULL; > kvm_mmu_module_exit(); > @@ -8056,10 +7900,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) > raw_spin_lock_init(&kvm->arch.tsc_write_lock); > mutex_init(&kvm->arch.apic_map_lock); > mutex_init(&kvm->arch.hyperv.hv_lock); > - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); > + spin_lock_init(&kvm->arch.masterclock_lock); > > kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); > - pvclock_update_vm_gtod_copy(kvm); > + update_masterclock(kvm); > > INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); > INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); > -- > 2.7.4