On Fri, Jul 30, 2021 at 10:48 AM Sean Christopherson <seanjc@xxxxxxxxxx> wrote: > > On Thu, Jul 29, 2021, Oliver Upton wrote: > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 916c976e99ab..e052c7afaac4 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > > #endif > > } > > > > -u64 get_kvmclock_ns(struct kvm *kvm) > > +/* > > + * Returns true if realtime and TSC values were written back to the caller. > > + * Returns false if a clock triplet cannot be obtained, such as if the host's > > + * realtime clock is not based on the TSC. > > + */ > > +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns, > > + u64 *realtime_ns, u64 *tsc) > > { > > struct kvm_arch *ka = &kvm->arch; > > struct pvclock_vcpu_time_info hv_clock; > > unsigned long flags; > > - u64 ret; > > + bool ret = false; > > > > spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); > > if (!ka->use_master_clock) { > > spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); > > - return get_kvmclock_base_ns() + ka->kvmclock_offset; > > + *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset; > > + return false; > > } > > > > hv_clock.tsc_timestamp = ka->master_cycle_now; > > @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm) > > get_cpu(); > > > > if (__this_cpu_read(cpu_tsc_khz)) { > > + struct timespec64 ts; > > + u64 tsc_val; > > + > > kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, > > &hv_clock.tsc_shift, > > &hv_clock.tsc_to_system_mul); > > - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); > > + > > + if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) { > > + *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; > > + *tsc = tsc_val; > > + ret = true; > > + } > > + > > + *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val); > > This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread() > returns false. This also straight up fails to compile on 32-bit kernels because > kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64. Pssh... works on my machine ;-) > > A gross way to resolve both issues would be (see below regarding 'data'): > > if (__this_cpu_read(cpu_tsc_khz)) { > #ifdef CONFIG_X86_64 > struct timespec64 ts; > > if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { > data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; > data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; > } else > #endif > data->host_tsc = rdtsc(); > > kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, > &hv_clock.tsc_shift, > &hv_clock.tsc_to_system_mul); > > data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); > } else { > data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; > } > I'll mull on this if there's any cleaner way to fix it, but thanks for the suggested fix! I missed the x86_64 ifdefs first time around. > > } else > > Not your code, but this needs braces. Ack. > > > - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; > > + *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset; > > > > put_cpu(); > > > > return ret; > > } > > ... > > > @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) > > } > > #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ > > > > +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, > > + void __user *argp) > > +{ > > + struct kvm_clock_data data; > > + > > + memset(&data, 0, sizeof(data)); > > + > > + if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime, > > + &data.host_tsc)) > > + data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; > > + > > + if (kvm->arch.use_master_clock) > > + data.flags |= KVM_CLOCK_TSC_STABLE; > > I know nothing about the actual behavior, but this appears to have a potential > race (though it came from the existing code). get_kvmclock_and_realtime() checks > arch.use_master_clock under pvclock_gtod_sync_lock, but this does not. Even if > that's functionally ok, it's a needless complication. > > Fixing that weirdness would also provide an opportunity to clean up the API, > e.g. the boolean return is not exactly straightforward. The simplest approach > would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime() > instead of multiple params. Then that helper can set the flags accordingly, thus > avoiding the question of whether or not there's a race and eliminating the boolean > return. E.g. Yeah, agreed. I think it may be best to separate fixing the mess from the new API here. > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index e052c7afaac4..872a53a7c467 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) > #endif > } > > -/* > - * Returns true if realtime and TSC values were written back to the caller. > - * Returns false if a clock triplet cannot be obtained, such as if the host's > - * realtime clock is not based on the TSC. > - */ > -static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns, > - u64 *realtime_ns, u64 *tsc) > +static void get_kvmclock_and_realtime(struct kvm *kvm, > + struct kvm_clock_data *data) > { > struct kvm_arch *ka = &kvm->arch; > struct pvclock_vcpu_time_info hv_clock; > unsigned long flags; > - bool ret = false; > > spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); > if (!ka->use_master_clock) { > spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); > - *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset; > - return false; > + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; > + return; > } > + data->flags |= KVM_CLOCK_TSC_STABLE; > > hv_clock.tsc_timestamp = ka->master_cycle_now; > hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; > @@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns, > get_cpu(); > > if (__this_cpu_read(cpu_tsc_khz)) { > +#ifdef CONFIG_X86_64 > struct timespec64 ts; > - u64 tsc_val; > + > + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { > + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; > + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; > + } else > +#endif > + data->host_tsc = rdtsc(); > > kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, > &hv_clock.tsc_shift, > &hv_clock.tsc_to_system_mul); > > - if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) { > - *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; > - *tsc = tsc_val; > - ret = true; > - } > - > - *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val); > - } else > - *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset; > + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); > + } else { > + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; > + } > > put_cpu(); > - > - return ret; > } > > u64 get_kvmclock_ns(struct kvm *kvm) > { > - u64 kvmclock_ns, realtime_ns, tsc; > + struct kvm_clock_data data; > > - get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc); > - return kvmclock_ns; > + /* > + * Zero flags as it's accessed RMW, leave everything else uninitialized > + * as clock is always written and no other fields are consumed. > + */ > + data.flags = 0; > + > + get_kvmclock_and_realtime(kvm, &data); > + return data.clock; > } > > static void kvm_setup_pvclock_page(struct kvm_vcpu *v, > @@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm, > > memset(&data, 0, sizeof(data)); > > - if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime, > - &data.host_tsc)) > - data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; > - > - if (kvm->arch.use_master_clock) > - data.flags |= KVM_CLOCK_TSC_STABLE; > + get_kvmclock_and_realtime(kvm, &data); > > if (copy_to_user(argp, &data, sizeof(data))) > return -EFAULT; > > > + > > + if (copy_to_user(argp, &data, sizeof(data))) > > + return -EFAULT; > > + > > + return 0; > > +} > > + > > +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, > > + void __user *argp) > > +{ > > + struct kvm_arch *ka = &kvm->arch; > > + struct kvm_clock_data data; > > + u64 now_raw_ns; > > + > > + if (copy_from_user(&data, argp, sizeof(data))) > > + return -EFAULT; > > + > > + if (data.flags & ~KVM_CLOCK_REALTIME) > > + return -EINVAL; > > Huh, this an odd ABI, the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK. > The existing code has the same behavior, so apparently it's ok, just odd. Quite the head scratcher to me as well /shrug > > + > > + /* > > + * TODO: userspace has to take care of races with VCPU_RUN, so > > + * kvm_gen_update_masterclock() can be cut down to locked > > + * pvclock_update_vm_gtod_copy(). > > + */ > > + kvm_gen_update_masterclock(kvm); > > + > > + spin_lock_irq(&ka->pvclock_gtod_sync_lock); > > + if (data.flags & KVM_CLOCK_REALTIME) { > > + u64 now_real_ns = ktime_get_real_ns(); > > + > > + /* > > + * Avoid stepping the kvmclock backwards. > > + */ > > + if (now_real_ns > data.realtime) > > + data.clock += now_real_ns - data.realtime; > > + } > > + > > + if (ka->use_master_clock) > > + now_raw_ns = ka->master_kernel_ns; > > + else > > + now_raw_ns = get_kvmclock_base_ns(); > > + ka->kvmclock_offset = data.clock - now_raw_ns; > > + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); > > + > > + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); > > + return 0; > > +} > > + > > long kvm_arch_vm_ioctl(struct file *filp, > > unsigned int ioctl, unsigned long arg) > > { > > @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp, > > } > > #endif > > case KVM_SET_CLOCK: { > > The curly braces can be dropped, both here and in KVM_GET_CLOCK. Ack. > > } > > case KVM_GET_CLOCK: { > > ... > > > } As always, thanks for the careful review Sean! -- Best, Oliver