Hi Paolo,

On Wed, Aug 11, 2021 at 5:23 AM Paolo Bonzini <pbonzini@xxxxxxxxxx> wrote:
>
> On 04/08/21 10:57, Oliver Upton wrote:
> > Sean noticed that KVM_GET_CLOCK was checking kvm_arch.use_master_clock
> > outside of the pvclock sync lock. This is problematic, as the clock
> > value written to the user may or may not actually correspond to a stable
> > TSC.
> >
> > Fix the race by populating the entire kvm_clock_data structure behind
> > the pvclock_gtod_sync_lock.
> >
> > Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> > Signed-off-by: Oliver Upton <oupton@xxxxxxxxxx>
> > ---
> >  arch/x86/kvm/x86.c | 39 ++++++++++++++++++++++++++++-----------
> >  1 file changed, 28 insertions(+), 11 deletions(-)
>
> I had a completely independent patch that fixed the same race. It unifies
> the read sides of tsc_write_lock and pvclock_gtod_sync_lock into a seqcount
> (and replaces pvclock_gtod_sync_lock with kvm->lock on the write side).

Might it make sense to fix this issue under the existing locking scheme,
then shift to what you're proposing? I say that, but the locking change
in 03/21 would most certainly have a short lifetime until this patch
supersedes it.

> I attach it now (based on https://lore.kernel.org/kvm/20210811102356.3406687-1-pbonzini@xxxxxxxxxx/T/#t),
> but the testing was extremely light so I'm not sure I will be able to include
> it in 5.15.
>
> Paolo
>
> -------------- 8< -------------
> From: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Date: Thu, 8 Apr 2021 05:03:44 -0400
> Subject: [PATCH] kvm: x86: protect masterclock with a seqcount
>
> Protect the reference point for kvmclock with a seqcount, so that
> kvmclock updates for all vCPUs can proceed in parallel. Xen runstate
> updates will also run in parallel and not bounce the kvmclock cacheline.
>
> This also makes it possible to use KVM_REQ_CLOCK_UPDATE (which will
> block on the seqcount) to prevent entering in the guests until
> pvclock_update_vm_gtod_copy is complete, and thus to get rid of
> KVM_REQ_MCLOCK_INPROGRESS.
>
> nr_vcpus_matched_tsc is updated outside pvclock_update_vm_gtod_copy
> though, so a spinlock must be kept for that one.
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>
> diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
> index 8138201efb09..ed41da172e02 100644
> --- a/Documentation/virt/kvm/locking.rst
> +++ b/Documentation/virt/kvm/locking.rst
> @@ -29,6 +29,8 @@ The acquisition orders for mutexes are as follows:
>
>  On x86:
>
> +- the seqcount kvm->arch.pvclock_sc is written under kvm->lock.
> +
>  - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock
>
>  - kvm->arch.mmu_lock is an rwlock.  kvm->arch.tdp_mmu_pages_lock is
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 20daaf67a5bf..6889aab92362 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -68,8 +68,7 @@
>  #define KVM_REQ_PMI			KVM_ARCH_REQ(11)
>  #define KVM_REQ_SMI			KVM_ARCH_REQ(12)
>  #define KVM_REQ_MASTERCLOCK_UPDATE	KVM_ARCH_REQ(13)
> -#define KVM_REQ_MCLOCK_INPROGRESS \
> -	KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> +/* 14 unused */
>  #define KVM_REQ_SCAN_IOAPIC \
>  	KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_GLOBAL_CLOCK_UPDATE	KVM_ARCH_REQ(16)
> @@ -1067,6 +1066,11 @@ struct kvm_arch {
>
>  	unsigned long irq_sources_bitmap;
>  	s64 kvmclock_offset;
> +
> +	/*
> +	 * This also protects nr_vcpus_matched_tsc which is read from a
> +	 * preemption-disabled region, so it must be a raw spinlock.
> +	 */
>  	raw_spinlock_t tsc_write_lock;
>  	u64 last_tsc_nsec;
>  	u64 last_tsc_write;
> @@ -1077,7 +1081,7 @@ struct kvm_arch {
>  	u64 cur_tsc_generation;
>  	int nr_vcpus_matched_tsc;
>
> -	spinlock_t pvclock_gtod_sync_lock;
> +	seqcount_mutex_t pvclock_sc;
>  	bool use_master_clock;
>  	u64 master_kernel_ns;
>  	u64 master_cycle_now;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 74145a3fd4f2..7d73c5560260 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2533,9 +2533,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
>  	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
>
>  	kvm_vcpu_write_tsc_offset(vcpu, offset);
> -	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>
> -	spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
>  	if (!matched) {
>  		kvm->arch.nr_vcpus_matched_tsc = 0;
>  	} else if (!already_matched) {
> @@ -2543,7 +2541,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
>  	}
>
>  	kvm_track_tsc_matching(vcpu);
> -	spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
> +	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>  }
>
>  static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
> @@ -2730,9 +2728,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
>  	struct kvm_arch *ka = &kvm->arch;
>  	int vclock_mode;
>  	bool host_tsc_clocksource, vcpus_matched;
> -
> -	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
> -			 atomic_read(&kvm->online_vcpus));
> +	unsigned long flags;
>
>  	/*
>  	 * If the host uses TSC clock, then passthrough TSC as stable
> @@ -2742,9 +2738,14 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
>  					&ka->master_kernel_ns,
>  					&ka->master_cycle_now);
>
> +	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
> +	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
> +			 atomic_read(&kvm->online_vcpus));
> +
>  	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
>  				&& !ka->backwards_tsc_observed
>  				&& !ka->boot_vcpu_runs_old_kvmclock;
> +	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
>
>  	if (ka->use_master_clock)
>  		atomic_set(&kvm_guest_has_master_clock, 1);
> @@ -2758,25 +2759,26 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
>  static void kvm_start_pvclock_update(struct kvm *kvm)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
> -	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
>
> -	/* no guest entries from this point */
> -	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	mutex_lock(&kvm->lock);
> +	/*
> +	 * write_seqcount_begin disables preemption.  This is needed not just
> +	 * to avoid livelock, but also because the preempt notifier is a reader
> +	 * for ka->pvclock_sc.
> +	 */
> +	write_seqcount_begin(&ka->pvclock_sc);
> +	kvm_make_all_cpus_request(kvm,
> +		KVM_REQ_CLOCK_UPDATE | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP);
> +
> +	/* no guest entries from this point until write_seqcount_end */
>  }
>
>  static void kvm_end_pvclock_update(struct kvm *kvm)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
> -	struct kvm_vcpu *vcpu;
> -	int i;
>
> -	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> -	kvm_for_each_vcpu(i, vcpu, kvm)
> -		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
> -
> -	/* guest entries allowed */
> -	kvm_for_each_vcpu(i, vcpu, kvm)
> -		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
> +	write_seqcount_end(&ka->pvclock_sc);
> +	mutex_unlock(&kvm->lock);
>  }
>
>  static void kvm_update_masterclock(struct kvm *kvm)
> @@ -2787,27 +2789,21 @@ static void kvm_update_masterclock(struct kvm *kvm)
>  	kvm_end_pvclock_update(kvm);
>  }
>
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +static u64 __get_kvmclock_ns(struct kvm *kvm)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
> -	unsigned long flags;
>  	u64 ret;
>
> -	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> -	if (!ka->use_master_clock) {
> -		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> +	if (!ka->use_master_clock)
>  		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> -	}
> -
> -	hv_clock.tsc_timestamp = ka->master_cycle_now;
> -	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
> -	spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
>
>  	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
>  	get_cpu();
>
>  	if (__this_cpu_read(cpu_tsc_khz)) {
> +		hv_clock.tsc_timestamp = ka->master_cycle_now;
> +		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
>  		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>  				   &hv_clock.tsc_shift,
>  				   &hv_clock.tsc_to_system_mul);
> @@ -2816,6 +2812,19 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>  		ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>  	put_cpu();
> +	return ret;
> +}
> +
> +u64 get_kvmclock_ns(struct kvm *kvm)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	unsigned seq;
> +	u64 ret;
> +
> +	do {
> +		seq = read_seqcount_begin(&ka->pvclock_sc);
> +		ret = __get_kvmclock_ns(kvm);
> +	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
>
>  	return ret;
>  }
> @@ -2882,6 +2891,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
>  static int kvm_guest_time_update(struct kvm_vcpu *v)
>  {
>  	unsigned long flags, tgt_tsc_khz;
> +	unsigned seq;
>  	struct kvm_vcpu_arch *vcpu = &v->arch;
>  	struct kvm_arch *ka = &v->kvm->arch;
>  	s64 kernel_ns;
> @@ -2896,13 +2906,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>  	 * If the host uses TSC clock, then passthrough TSC as stable
>  	 * to the guest.
>  	 */
> -	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
> -	use_master_clock = ka->use_master_clock;
> -	if (use_master_clock) {
> -		host_tsc = ka->master_cycle_now;
> -		kernel_ns = ka->master_kernel_ns;
> -	}
> -	spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> +	seq = read_seqcount_begin(&ka->pvclock_sc);
> +	do {
> +		use_master_clock = ka->use_master_clock;
> +		if (use_master_clock) {
> +			host_tsc = ka->master_cycle_now;
> +			kernel_ns = ka->master_kernel_ns;
> +		}
> +	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
>
>  	/* Keep irq disabled to prevent changes to the clock */
>  	local_irq_save(flags);
> @@ -6098,11 +6109,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  	case KVM_GET_CLOCK: {
>  		struct kvm_clock_data user_ns;
> -		u64 now_ns;
> +		unsigned seq;
>
> -		now_ns = get_kvmclock_ns(kvm);
> -		user_ns.clock = now_ns;
> -		user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
> +		do {
> +			seq = read_seqcount_begin(&kvm->arch.pvclock_sc);
> +			user_ns.clock = __get_kvmclock_ns(kvm);
> +			user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
> +		} while (read_seqcount_retry(&kvm->arch.pvclock_sc, seq));
>  		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
>
>  		r = -EFAULT;
> @@ -11144,8 +11157,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>
>  	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
>  	mutex_init(&kvm->arch.apic_map_lock);
> -	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
> -
> +	seqcount_mutex_init(&kvm->arch.pvclock_sc, &kvm->lock);
>  	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
>  	pvclock_update_vm_gtod_copy(kvm);
>

This all looks good to me, so:

Reviewed-by: Oliver Upton <oupton@xxxxxxxxxx>

Definitely supplants 03/21 from my series. If you'd rather take your own
for this entire series then I can rework around this patch and resend.

--
Thanks,
Oliver
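
For readers coming to the thread cold: the userspace-visible interface both
fixes protect is the KVM_GET_CLOCK vm ioctl, which fills a struct
kvm_clock_data with a clock value plus flags such as KVM_CLOCK_TSC_STABLE,
and the point of the patches above is that the value and the flag are
captured consistently. The sketch below is purely illustrative and not taken
from either series; it creates a bare VM with no vCPUs and keeps error
handling minimal, just to show the call being discussed.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_clock_data data;
	int kvm, vm;

	kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}

	/* Machine type 0 is the default on x86. */
	vm = ioctl(kvm, KVM_CREATE_VM, 0);
	if (vm < 0) {
		perror("KVM_CREATE_VM");
		return 1;
	}

	memset(&data, 0, sizeof(data));
	if (ioctl(vm, KVM_GET_CLOCK, &data) < 0) {
		perror("KVM_GET_CLOCK");
		return 1;
	}

	/*
	 * What the fixes above guarantee: 'clock' and 'flags' are captured
	 * together (under the pvclock lock or seqcount), so a set
	 * KVM_CLOCK_TSC_STABLE bit really does describe the reported value.
	 */
	printf("kvmclock: %llu ns, TSC stable: %s\n",
	       (unsigned long long)data.clock,
	       (data.flags & KVM_CLOCK_TSC_STABLE) ? "yes" : "no");

	close(vm);
	close(kvm);
	return 0;
}

With no vCPUs running the stable flag will normally be clear; the interesting
case is a live guest where a masterclock update races with the ioctl, which
is exactly what the locking changes above address.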