Thanks to a set of recently added timekeeper functions that provide the
cycle stamp along with the kernel time, we can now get time values
directly from the kernel instead of maintaining a shadow copy of the
timekeeper data structures. This reduces the overhead and complexity of
the KVM code and makes the time operations clearer.

Signed-off-by: Denis Plotnikov <dplotnikov@xxxxxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/trace.h            |  27 +++--
 arch/x86/kvm/x86.c              | 259 ++++++++++++----------------------------
 3 files changed, 89 insertions(+), 199 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 695605e..27a2df9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -781,7 +781,7 @@ struct kvm_arch {
 	u64 cur_tsc_generation;
 	int nr_vcpus_matched_tsc;
 
-	spinlock_t pvclock_gtod_sync_lock;
+	spinlock_t masterclock_lock;
 	bool use_master_clock;
 	u64 master_kernel_ns;
 	u64 master_cycle_now;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0a6cc67..5ed12fe 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -812,40 +812,41 @@ TRACE_EVENT(kvm_write_tsc_offset,
 	{VCLOCK_TSC, "tsc"} \
 
 TRACE_EVENT(kvm_update_master_clock,
-	TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
-	TP_ARGS(use_master_clock, host_clock, offset_matched),
+	TP_PROTO(bool use_master_clock, bool host_clock_stable,
+		 bool offset_matched),
+	TP_ARGS(use_master_clock, host_clock_stable, offset_matched),
 
 	TP_STRUCT__entry(
 		__field( bool, use_master_clock )
-		__field( unsigned int, host_clock )
+		__field( bool, host_clock_stable )
 		__field( bool, offset_matched )
 	),
 
 	TP_fast_assign(
 		__entry->use_master_clock = use_master_clock;
-		__entry->host_clock = host_clock;
+		__entry->host_clock_stable = host_clock_stable;
 		__entry->offset_matched = offset_matched;
 	),
 
-	TP_printk("masterclock %d hostclock %s offsetmatched %u",
+	TP_printk("masterclock %d hostclock stable %u offsetmatched %u",
 		  __entry->use_master_clock,
-		  __print_symbolic(__entry->host_clock, host_clocks),
+		  __entry->host_clock_stable,
 		  __entry->offset_matched)
 );
 
 TRACE_EVENT(kvm_track_tsc,
 	TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
 		 unsigned int online_vcpus, bool use_master_clock,
-		 unsigned int host_clock),
+		 bool host_clock_stable),
 	TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
-		host_clock),
+		host_clock_stable),
 
 	TP_STRUCT__entry(
 		__field( unsigned int, vcpu_id )
 		__field( unsigned int, nr_vcpus_matched_tsc )
 		__field( unsigned int, online_vcpus )
 		__field( bool, use_master_clock )
-		__field( unsigned int, host_clock )
+		__field( bool, host_clock_stable )
 	),
 
 	TP_fast_assign(
@@ -853,14 +854,14 @@ TRACE_EVENT(kvm_track_tsc,
 		__entry->nr_vcpus_matched_tsc = nr_matched;
 		__entry->online_vcpus = online_vcpus;
 		__entry->use_master_clock = use_master_clock;
-		__entry->host_clock = host_clock;
+		__entry->host_clock_stable = host_clock_stable;
 	),
 
-	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
-		  " hostclock %s",
+	TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u "
+		  "hostclock stable %u",
 		  __entry->vcpu_id, __entry->use_master_clock,
 		  __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
-		  __print_symbolic(__entry->host_clock, host_clocks))
+		  __entry->host_clock_stable)
 );
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0e846f0..ce491bb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -50,7 +50,7 @@
 #include <linux/hash.h>
 #include <linux/pci.h>
 #include <linux/timekeeper_internal.h>
-#include <linux/pvclock_gtod.h>
+#include <linux/cs_notifier.h>
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
@@ -1131,50 +1131,6 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 	return kvm_set_msr(vcpu, &msr);
 }
 
-#ifdef CONFIG_X86_64
-struct pvclock_gtod_data {
-	seqcount_t seq;
-
-	struct { /* extract of a clocksource struct */
-		int vclock_mode;
-		u64 cycle_last;
-		u64 mask;
-		u32 mult;
-		u32 shift;
-	} clock;
-
-	u64 boot_ns;
-	u64 nsec_base;
-	u64 wall_time_sec;
-};
-
-static struct pvclock_gtod_data pvclock_gtod_data;
-
-static void update_pvclock_gtod(struct timekeeper *tk)
-{
-	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
-	u64 boot_ns;
-
-	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy pvclock gtod data */
-	vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
-	vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
-	vdata->clock.mask = tk->tkr_mono.mask;
-	vdata->clock.mult = tk->tkr_mono.mult;
-	vdata->clock.shift = tk->tkr_mono.shift;
-
-	vdata->boot_ns = boot_ns;
-	vdata->nsec_base = tk->tkr_mono.xtime_nsec;
-
-	vdata->wall_time_sec = tk->xtime_sec;
-
-	write_seqcount_end(&vdata->seq);
-}
-#endif
-
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1266,10 +1222,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
-#ifdef CONFIG_X86_64
-static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
-#endif
-
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 static unsigned long max_tsc_khz;
 
@@ -1358,12 +1310,32 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
+#ifdef CONFIG_X86_64
+static bool clocksource_stable(void)
+{
+	return get_tk_mono_clock_mode() == VCLOCK_TSC;
+}
+
+static bool clocksource_stability_check(void)
+{
+	unsigned int seq;
+	const seqcount_t *s = get_tk_seq();
+	bool stable;
+
+	do {
+		seq = read_seqcount_begin(s);
+		stable = clocksource_stable();
+	} while (unlikely(read_seqcount_retry(s, seq)));
+
+	return stable;
+}
+#endif
+
 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
-	bool vcpus_matched;
+	bool vcpus_matched, clocksource_stable;
 	struct kvm_arch *ka = &vcpu->kvm->arch;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			 atomic_read(&vcpu->kvm->online_vcpus));
 
@@ -1376,13 +1348,14 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 	 * and the vcpus need to have matched TSCs. When that happens,
 	 * perform request to enable masterclock.
 	 */
+	clocksource_stable = clocksource_stability_check();
 	if (ka->use_master_clock ||
-			(gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
+			(clocksource_stable && vcpus_matched))
 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
 
 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
-			    atomic_read(&vcpu->kvm->online_vcpus),
-			    ka->use_master_clock, gtod->clock.vclock_mode);
+			atomic_read(&vcpu->kvm->online_vcpus),
+			ka->use_master_clock, clocksource_stable);
 #endif
 }
 
@@ -1535,7 +1508,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
-	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock(&kvm->arch.masterclock_lock);
 	if (!matched) {
 		kvm->arch.nr_vcpus_matched_tsc = 0;
 	} else if (!already_matched) {
@@ -1543,7 +1516,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	}
 
 	kvm_track_tsc_matching(vcpu);
-	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_unlock(&kvm->arch.masterclock_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
@@ -1563,99 +1536,41 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 }
 
 #ifdef CONFIG_X86_64
-
-static u64 read_tsc(void)
+static bool kvm_get_host_time_and_cycles(s64 *kernel_ns, u64 *cycle_now,
+					 u64 (*get_time)(u64 *cycle_now))
 {
-	u64 ret = (u64)rdtsc_ordered();
-	u64 last = pvclock_gtod_data.clock.cycle_last;
+	unsigned int seq;
+	const seqcount_t *s = get_tk_seq();
+	bool stable;
 
-	if (likely(ret >= last))
-		return ret;
+	do {
+		seq = read_seqcount_begin(s);
+		stable = clocksource_stable();
+		if (stable)
+			*kernel_ns = get_time(cycle_now);
+	} while (unlikely(read_seqcount_retry(s, seq)));
 
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a function of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead. I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
-
-static inline u64 vgettsc(u64 *cycle_now)
-{
-	long v;
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-
-	*cycle_now = read_tsc();
-
-	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
-	return v * gtod->clock.mult;
-}
-
-static int do_monotonic_boot(s64 *t, u64 *cycle_now)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-		ns += gtod->boot_ns;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	*t = ns;
-
-	return mode;
-}
-
-static int do_realtime(struct timespec *ts, u64 *cycle_now)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	unsigned long seq;
-	int mode;
-	u64 ns;
-
-	do {
-		seq = read_seqcount_begin(&gtod->seq);
-		mode = gtod->clock.vclock_mode;
-		ts->tv_sec = gtod->wall_time_sec;
-		ns = gtod->nsec_base;
-		ns += vgettsc(cycle_now);
-		ns >>= gtod->clock.shift;
-	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-
-	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
-
-	return mode;
+	return stable;
 }
 
 /* returns true if host is using tsc clocksource */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
-
-	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
+	return kvm_get_host_time_and_cycles(
+		kernel_ns, cycle_now, ktime_get_boot_ns_with_cycles);
 }
 
 /* returns true if host is using tsc clocksource */
-static bool kvm_get_walltime_and_clockread(struct timespec *ts,
-					   u64 *cycle_now)
+static bool kvm_get_walltime_and_clockread(struct timespec *ts, u64 *cycle_now)
 {
-	/* checked again under seqlock below */
-	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
-		return false;
+	bool res;
+	s64 kernel_ns;
 
-	return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+	res = kvm_get_host_time_and_cycles(
+		&kernel_ns, cycle_now, ktime_get_real_ns_with_cycles);
+	*ts = ktime_to_timespec(kernel_ns);
+
+	return res;
 }
 #endif
 
@@ -1700,12 +1615,11 @@ static bool kvm_get_walltime_and_clockread(struct timespec *ts,
  *
  */
 
-static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+static void update_masterclock(struct kvm *kvm)
 {
 #ifdef CONFIG_X86_64
 	struct kvm_arch *ka = &kvm->arch;
-	int vclock_mode;
-	bool host_tsc_clocksource, vcpus_matched;
+	bool host_clocksource_stable, vcpus_matched;
 
 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
 			 atomic_read(&kvm->online_vcpus));
 
@@ -1714,20 +1628,16 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	host_tsc_clocksource = kvm_get_time_and_clockread(
+	host_clocksource_stable = kvm_get_time_and_clockread(
 					&ka->master_kernel_ns,
 					&ka->master_cycle_now);
 
-	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
+	ka->use_master_clock = host_clocksource_stable && vcpus_matched
 				&& !backwards_tsc_observed
 				&& !ka->boot_vcpu_runs_old_kvmclock;
 
-	if (ka->use_master_clock)
-		atomic_set(&kvm_guest_has_master_clock, 1);
-
-	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
-	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
-					vcpus_matched);
+	trace_kvm_update_master_clock(ka->use_master_clock,
+				      host_clocksource_stable, vcpus_matched);
 #endif
 }
 
@@ -1743,10 +1653,10 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 	struct kvm_vcpu *vcpu;
 	struct kvm_arch *ka = &kvm->arch;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	kvm_make_mclock_inprogress_request(kvm);
 	/* no guest entries from this point */
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock(kvm);
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -1755,7 +1665,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
 
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 #endif
 }
 
@@ -1765,15 +1675,15 @@ u64 get_kvmclock_ns(struct kvm *kvm)
 	struct pvclock_vcpu_time_info hv_clock;
 	u64 ret;
 
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	if (!ka->use_master_clock) {
-		spin_unlock(&ka->pvclock_gtod_sync_lock);
+		spin_unlock(&ka->masterclock_lock);
 		return ktime_get_boot_ns() + ka->kvmclock_offset;
 	}
 
 	hv_clock.tsc_timestamp = ka->master_cycle_now;
 	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 
 	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
 	get_cpu();
@@ -1859,13 +1769,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
-	spin_lock(&ka->pvclock_gtod_sync_lock);
+	spin_lock(&ka->masterclock_lock);
 	use_master_clock = ka->use_master_clock;
 	if (use_master_clock) {
 		host_tsc = ka->master_cycle_now;
 		kernel_ns = ka->master_kernel_ns;
 	}
-	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	spin_unlock(&ka->masterclock_lock);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
@@ -6015,7 +5925,8 @@ static void kvm_set_mmio_spte_mask(void)
 }
 
 #ifdef CONFIG_X86_64
-static void pvclock_gtod_update_fn(struct work_struct *work)
+static int process_clocksource_change(struct notifier_block *nb,
+				unsigned long unused0, void *unused1)
 {
 	struct kvm *kvm;
@@ -6026,35 +5937,13 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
 	list_for_each_entry(kvm, &vm_list, vm_list)
 		kvm_for_each_vcpu(i, vcpu, kvm)
 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-	atomic_set(&kvm_guest_has_master_clock, 0);
 	spin_unlock(&kvm_lock);
-}
-
-static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
-
-/*
- * Notification about pvclock gtod data update.
- */
-static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
-			void *priv)
-{
-	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
-	struct timekeeper *tk = priv;
-
-	update_pvclock_gtod(tk);
-
-	/* disable master clock if host does not trust, or does not
-	 * use, TSC clocksource
-	 */
-	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
-	    atomic_read(&kvm_guest_has_master_clock) != 0)
-		queue_work(system_long_wq, &pvclock_gtod_work);
-
 	return 0;
 }
-
-static struct notifier_block pvclock_gtod_notifier = {
-	.notifier_call = pvclock_gtod_notify,
+
+static struct notifier_block clocksource_changes_notifier = {
+	.notifier_call = process_clocksource_change,
 };
 #endif
 
@@ -6107,7 +5996,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_lapic_init();
 #ifdef CONFIG_X86_64
-	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+	clocksource_changes_register_notifier(&clocksource_changes_notifier);
 #endif
 
 	return 0;
@@ -6128,7 +6017,7 @@ void kvm_arch_exit(void)
 					    CPUFREQ_TRANSITION_NOTIFIER);
 	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
 #ifdef CONFIG_X86_64
-	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+	clocksource_changes_unregister_notifier(&clocksource_changes_notifier);
 #endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
@@ -8031,10 +7920,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
 	mutex_init(&kvm->arch.hyperv.hv_lock);
-	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+	spin_lock_init(&kvm->arch.masterclock_lock);
 
 	kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
-	pvclock_update_vm_gtod_copy(kvm);
+	update_masterclock(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
-- 
2.7.4
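
Note for reviewers (not part of the patch): clocksource_stability_check() and
kvm_get_host_time_and_cycles() above follow the usual sequence-counter read
pattern — take a snapshot of the counter, read the timekeeper-derived data,
and retry if a concurrent update ran in between. Below is a minimal,
self-contained userspace sketch of that pattern; every name in it (tk_seq,
tk_mono_clock_mode, read_begin, read_retry) is an illustrative stand-in and
does not exist in the kernel.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint tk_seq;              /* even: no update in progress */
static int tk_mono_clock_mode = 1;      /* stand-in for VCLOCK_TSC */

static unsigned int read_begin(void)
{
	unsigned int seq;

	/* wait until no writer holds the counter (odd value) */
	while ((seq = atomic_load_explicit(&tk_seq, memory_order_acquire)) & 1)
		;
	return seq;
}

static bool read_retry(unsigned int seq)
{
	/* order the data reads before re-checking the counter */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&tk_seq, memory_order_relaxed) != seq;
}

int main(void)
{
	unsigned int seq;
	bool stable;

	do {
		seq = read_begin();
		stable = (tk_mono_clock_mode == 1);
	} while (read_retry(seq));

	printf("clocksource stable: %d\n", stable);
	return 0;
}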