On Thu, Aug 10, 2023 at 07:21:03AM +0800, Yan Zhao wrote: > On Wed, Aug 09, 2023 at 07:33:45AM -0700, Sean Christopherson wrote: > > On Wed, Aug 09, 2023, Yan Zhao wrote: > > > On Mon, Aug 07, 2023 at 10:19:07AM -0700, Sean Christopherson wrote: > > > > On Mon, Aug 07, 2023, Like Xu wrote: > > > > > On 23/12/2022 8:57 am, Sean Christopherson wrote: > > > > > > +static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, > > > > > > + const u8 *new, int bytes) > > > > > > +{ > > > > > > + __kvm_page_track_write(vcpu, gpa, new, bytes); > > > > > > + > > > > > > + kvm_mmu_track_write(vcpu, gpa, new, bytes); > > > > > > +} > > > > > > > > > > The kvm_mmu_track_write() is only used for x86, where the incoming parameter > > > > > "u8 *new" has not been required since 0e0fee5c539b ("kvm: mmu: Fix race in > > > > > emulated page table writes"), please help confirm if it's still needed? Thanks. > > > > > A minor clean up is proposed. > > > > > > > > Hmm, unless I'm misreading things, KVMGT ultimately doesn't consume @new either. > > > > So I think we can remove @new from kvm_page_track_write() entirely. > > > Sorry for the late reply. > > > Yes, KVMGT does not consume @new and it reads the guest PTE again in the > > > page track write handler. > > > > > > But I have a couple of questions related to the mentioned commit as > > > below: > > > > > > (1) If "re-reading the current value of the guest PTE after the MMU lock has > > > been acquired", then should KVMGT also acquire the MMU lock too? > > > > No. If applicable, KVMGT should read the new/current value after acquiring > > whatever lock protects the generation (or update) of the shadow entries. I > > suspect KVMGT already does this, but I don't have time to confirm that at this > I think the mutex lock and unlock of info->vgpu_lock you added in > kvmgt_page_track_write() is the counterpart :) > > > exact moment. 
> > > > The race that was fixed in KVM was: > > > > vCPU0 vCPU1 > > write X > > write Y > > sync SPTE w/ Y > > sync SPTE w/ X > > > > Reading the value after acquiring mmu_lock ensures that both vCPUs will see whatever > > value "loses" the race, i.e. whatever written value is processed second ('Y' in the > > above sequence). > I suspect that vCPU0 may still generate a wrong SPTE if vCPU1 wrote 4 > bytes while vCPU0 wrote 8 bytes, though the chances are very low. > This could happen in the sequence below: vCPU0 updates a PTE to AABBCCDD; vCPU1 updates a PTE to EEFFGGHH in two writes. (each character stands for a byte) vCPU0 vCPU1 write AABBCCDD write GGHH detect 4 bytes write and hold on sync sync SPTE w/ AABBGGHH write EEFF sync SPTE w/ EEFFGGHH Do you think the serialization work below is worth it? diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a915e23d61fa..51cd0ab73529 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1445,6 +1445,8 @@ struct kvm_arch { */ #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) struct kvm_mmu_memory_cache split_desc_cache; + + struct xarray track_writing_range; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index fd04e618ad2d..4b271701dcf6 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -142,12 +142,14 @@ void kvm_page_track_cleanup(struct kvm *kvm) head = &kvm->arch.track_notifier_head; cleanup_srcu_struct(&head->track_srcu); + xa_destroy(&kvm->arch.track_writing_range); } int kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; + xa_init(&kvm->arch.track_writing_range); head = &kvm->arch.track_notifier_head; INIT_HLIST_HEAD(&head->track_notifier_list); return init_srcu_struct(&head->track_srcu); diff --git a/arch/x86/kvm/mmu/page_track.h b/arch/x86/kvm/mmu/page_track.h index 62f98c6c5af3..1829792b9892 100644 --- 
a/arch/x86/kvm/mmu/page_track.h +++ b/arch/x86/kvm/mmu/page_track.h @@ -47,12 +47,46 @@ static inline bool kvm_page_track_has_external_user(struct kvm *kvm) { return fa #endif /* CONFIG_KVM_EXTERNAL_WRITE_TRACKING */ -static inline void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static inline void kvm_page_track_write_begin(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes) { + struct kvm *kvm = vcpu->kvm; + gfn_t gfn = gpa_to_gfn(gpa); + + WARN_ON(gfn != gpa_to_gfn(gpa + bytes - 1)); + + if (!kvm_page_track_write_tracking_enabled(kvm)) + return; + +retry: + if (xa_insert(&kvm->arch.track_writing_range, gfn, xa_mk_value(gfn), + GFP_KERNEL_ACCOUNT)) { + cpu_relax(); + goto retry; + } + return; +} + +static inline void kvm_page_track_write_abort(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes) +{ + if (!kvm_page_track_write_tracking_enabled(vcpu->kvm)) + return; + + xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa)); +} + +static inline void kvm_page_track_write_end(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + if (!kvm_page_track_write_tracking_enabled(vcpu->kvm)) + return; + __kvm_page_track_write(vcpu->kvm, gpa, new, bytes); kvm_mmu_track_write(vcpu, gpa, new, bytes); + + xa_erase(&vcpu->kvm->arch.track_writing_range, gpa_to_gfn(gpa)); } #endif /* __KVM_X86_PAGE_TRACK_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05a68d7d99fe..9b75829d5d7a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7544,10 +7544,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; + kvm_page_track_write_begin(vcpu, gpa, bytes); ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); - if (ret < 0) + if (ret < 0) { + kvm_page_track_write_abort(vcpu, gpa, bytes); return 0; - kvm_page_track_write(vcpu, gpa, val, bytes); + } + kvm_page_track_write_end(vcpu, gpa, val, bytes); return 1; } @@ -7792,6 +7795,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, hva += 
offset_in_page(gpa); + kvm_page_track_write_begin(vcpu, gpa, bytes); switch (bytes) { case 1: r = emulator_try_cmpxchg_user(u8, hva, old, new); @@ -7809,12 +7813,16 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, BUG(); } - if (r < 0) + if (r < 0) { + kvm_page_track_write_abort(vcpu, gpa, bytes); return X86EMUL_UNHANDLEABLE; - if (r) + } + if (r) { + kvm_page_track_write_abort(vcpu, gpa, bytes); return X86EMUL_CMPXCHG_FAILED; + } - kvm_page_track_write(vcpu, gpa, new, bytes); + kvm_page_track_write_end(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE;