On Tue, 2024-08-06 at 08:57 -0700, Sean Christopherson wrote:
> The last one is key: the pages have already been freed when invalidate_range_end()
> is called, and so unmapping at that time would be too late.

OK, thanks.

> The obvious downside is what you've run into, where the start+end approach forces
> KVM to wait for all in-flight invalidations to go away. But again, in practice
> the rudimentary range tracking suffices for all known use cases.

Makes sense.

> I suspect you're trying to solve a problem that doesn't exist in practice.
> hva_to_pfn_retry() already has a cond_resched(), so getting stuck for a long
> duration isn't fatal, just suboptimal. And similar to the range-based tracking,
> _if_ there's a problem in practice, then it also affects guest page faults. KVM
> simply resumes the vCPU and keeps re-faulting until the in-flight invalidation(s)
> has gone away.

Yeah. When it's looping on actual page faults it's easy to pretend it
isn't a busy-loop :)

Making it wait is fairly simple, though it would be just as easy to
make it cond_resched() and retry instead.

Here's what I have so far (incremental):
https://git.infradead.org/?p=users/dwmw2/linux.git;a=commitdiff;h=pfncache-2

I need to revive the torture test you had at the end of your earlier
series.

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 689e8be873a7..9b5261e11802 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
+	wait_queue_head_t gpc_invalidate_wq;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ffd6ab4c2a16..a243f9f8a9c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -830,21 +848,27 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
 	__kvm_handle_hva_range(kvm, &hva_range);
 
+	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
 	/* Pairs with the increment in range_start(). */
 	spin_lock(&kvm->mn_invalidate_lock);
 	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
 		--kvm->mn_active_invalidate_count;
 	wake = !kvm->mn_active_invalidate_count;
+	if (wake) {
+		kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+		kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
-	gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
-
 	/*
 	 * There can only be one waiter, since the wait happens under
 	 * slots_lock.
 	 */
-	if (wake)
+	if (wake) {
+		wake_up(&kvm->gpc_invalidate_wq);
 		rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+	}
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1154,7 +1178,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
-
+	init_waitqueue_head(&kvm->gpc_invalidate_wq);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 	INIT_LIST_HEAD(&kvm->devices);
 
 	kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 187b58150ef7..dad32ef5ac87 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -133,6 +133,39 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
+{
+	/*
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
+	 */
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+		(gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
+
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+	spin_lock(&gpc->kvm->mn_invalidate_lock);
+	if (gpc_invalidations_pending(gpc)) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		for (;;) {
+			prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+
+			if (!gpc_invalidations_pending(gpc))
+				break;
+
+			spin_unlock(&gpc->kvm->mn_invalidate_lock);
+			schedule();
+			spin_lock(&gpc->kvm->mn_invalidate_lock);
+		}
+		finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+	}
+	spin_unlock(&gpc->kvm->mn_invalidate_lock);
+}
+
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 {
 	/* Note, the new page offset may be different than the old! */
@@ -205,6 +238,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			goto out_error;
 		}
 
+		/*
+		 * Avoid populating a GPC with a user address which is already
+		 * being invalidated, if the invalidate_range_start() notifier
+		 * has already been called. The actual invalidation happens in
+		 * the invalidate_range_end() callback, so wait until there are
+		 * no active invalidations (for the relevant HVA).
+		 */
+		gpc_wait_for_invalidations(gpc);
+
 		write_lock_irq(&gpc->lock);
 
 		/*
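
For comparison, the non-blocking variant mentioned above would look
something like the sketch below. It's not part of the patch, the helper
name gpc_retry_on_invalidation() is purely illustrative, and the
unlocked check is only advisory; correctness would still rely on the
existing retry logic in hva_to_pfn_retry() catching any invalidation
that races in after the check.

static void gpc_retry_on_invalidation(struct gfn_to_pfn_cache *gpc)
{
	/*
	 * Instead of sleeping on gpc_invalidate_wq, just yield and poll
	 * until no in-flight invalidation overlaps gpc->uhva.
	 */
	while (gpc_invalidations_pending(gpc))
		cond_resched();
}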