Don't snapshot tsc_khz into per-cpu cpu_tsc_khz if the host TSC is constant, in which case the actual TSC frequency will never change and thus capturing TSC during initialization is unnecessary, KVM can simply use tsc_khz. This value is snapshotted from kvm_timer_init->kvmclock_cpu_online->tsc_khz_changed(NULL) On CPUs with constant TSC, but not a hardware-specified TSC frequency, snapshotting cpu_tsc_khz and using that to set a VM's target TSC frequency can lead to VM to think its TSC frequency is not what it actually is if refining the TSC completes after KVM snapshots tsc_khz. The actual frequency never changes, only the kernel's calculation of what that frequency is changes. Ideally, KVM would not be able to race with TSC refinement, or would have a hook into tsc_refine_calibration_work() to get an alert when refinement is complete. Avoiding the race altogether isn't practical as refinement takes a relative eternity; it's deliberately put on a work queue outside of the normal boot sequence to avoid unnecessarily delaying boot. Adding a hook is doable, but somewhat gross due to KVM's ability to be built as a module. And if the TSC is constant, which is likely the case for every VMX/SVM-capable CPU produced in the last decade, the race can be hit if and only if userspace is able to create a VM before TSC refinement completes; refinement is slow, but not that slow. For now, punt on a proper fix, as not taking a snapshot can help some uses cases and not taking a snapshot is arguably correct irrespective of the race with refinement. Signed-off-by: Anton Romanov <romanton@xxxxxxxxxx> --- v4 : * minor feedback changes * skip updating per-cpu tsc in kvm_hyperv_tsc_notifier arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4790f0d7d40b..f0b4d5ae743b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2907,6 +2907,22 @@ static void kvm_update_masterclock(struct kvm *kvm) kvm_end_pvclock_update(kvm); } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { @@ -2917,7 +2933,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) get_cpu(); data->flags = 0; - if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { #ifdef CONFIG_X86_64 struct timespec64 ts; @@ -2931,7 +2948,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) data->flags |= KVM_CLOCK_TSC_STABLE; hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); @@ -3049,7 +3066,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -8646,9 +8663,10 @@ static void tsc_khz_changed(void *data) struct cpufreq_freqs *freq = data; unsigned long khz = 0; + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -8669,8 +8687,10 @@ static void kvm_hyperv_tsc_notifier(void) hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } kvm_max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { @@ -8783,7 +8803,8 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = { static int kvmclock_cpu_online(unsigned int cpu) { - tsc_khz_changed(NULL); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + tsc_khz_changed(NULL); return 0; } -- 2.36.0.550.gb090851708-goog