From: David Woodhouse <dwmw@xxxxxxxxxxxx>

The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy-wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps
retrying forever, not making any progress.

Solve this by being a bit more selective about when to retry. In
addition to the ->needs_invalidation flag added in a previous commit,
check before calling hva_to_pfn() if there is any pending invalidation
(i.e. between invalidate_range_start() and invalidate_range_end()) which
affects the uHVA of the GPC being updated. This catches the case where
hva_to_pfn() would return a soon-to-be-stale mapping of a range for
which the invalidate_range_start() hook had already been called before
the uHVA was even set in the GPC and the ->needs_invalidation flag set.

An invalidation which *starts* after the lock is dropped in the loop is
fine because it will clear the ->needs_invalidation flag and also
trigger a retry.

This is slightly more complex than it needs to be; moving the
invalidation to occur in the invalidate_range_end() hook will make life
simpler, in a subsequent commit.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
 include/linux/kvm_host.h |  2 ++
 virt/kvm/kvm_main.c      | 20 ++++++++++++++
 virt/kvm/pfncache.c      | 60 +++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..1bfe2e8d52cd 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,8 @@ struct kvm {
 	/* For management / invalidation of gfn_to_pfn_caches */
 	spinlock_t gpc_lock;
 	struct list_head gpc_list;
+	u64 mmu_gpc_invalidate_range_start;
+	u64 mmu_gpc_invalidate_range_end;
 
 	/*
 	 * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..84eb1ebb6f47 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 */
 	spin_lock(&kvm->mn_invalidate_lock);
 	kvm->mn_active_invalidate_count++;
+	if (likely(kvm->mn_active_invalidate_count == 1)) {
+		kvm->mmu_gpc_invalidate_range_start = range->start;
+		kvm->mmu_gpc_invalidate_range_end = range->end;
+	} else {
+		/*
+		 * Fully tracking multiple concurrent ranges has diminishing
+		 * returns. Keep things simple and just find the minimal range
+		 * which includes the current and new ranges. As there won't be
+		 * enough information to subtract a range after its invalidate
+		 * completes, any ranges invalidated concurrently will
+		 * accumulate and persist until all outstanding invalidates
+		 * complete.
+		 */
+		kvm->mmu_gpc_invalidate_range_start =
+			min(kvm->mmu_gpc_invalidate_range_start, range->start);
+		kvm->mmu_gpc_invalidate_range_end =
+			max(kvm->mmu_gpc_invalidate_range_end, range->end);
+	}
 	spin_unlock(&kvm->mn_invalidate_lock);
 
 	/*
@@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
 	INIT_LIST_HEAD(&kvm->gpc_list);
 	spin_lock_init(&kvm->gpc_lock);
+	kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+	kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
 
 	INIT_LIST_HEAD(&kvm->devices);
 	kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 7007d32d197a..eeb9bf43c04a 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
 	/*
-	 * mn_active_invalidate_count acts for all intents and purposes
-	 * like mmu_invalidate_in_progress here; but the latter cannot
-	 * be used here because the invalidation of caches in the
-	 * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-	 * is elevated.
-	 *
-	 * Note, it does not matter that mn_active_invalidate_count
-	 * is not protected by gpc->lock.  It is guaranteed to
-	 * be elevated before the mmu_notifier acquires gpc->lock, and
-	 * isn't dropped until after mmu_invalidate_seq is updated.
+	 * No need for locking on GPC here because these fields are protected
+	 * by gpc->refresh_lock.
 	 */
-	if (kvm->mn_active_invalidate_count)
-		return true;
+	guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
 
-	/*
-	 * Ensure mn_active_invalidate_count is read before
-	 * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-	 * mmu_notifier_invalidate_range_end() to guarantee either the
-	 * old (non-zero) value of mn_active_invalidate_count or the
-	 * new (incremented) value of mmu_invalidate_seq is observed.
-	 */
-	smp_rmb();
-	return kvm->mmu_invalidate_seq != mmu_seq;
+	return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+	       (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+	       (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
 	kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
 	void *new_khva = NULL;
-	unsigned long mmu_seq;
 
 	lockdep_assert_held(&gpc->refresh_lock);
 
@@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	gpc->valid = false;
 
 	do {
-		mmu_seq = gpc->kvm->mmu_invalidate_seq;
-		smp_rmb();
-
 		/*
 		 * The translation made by hva_to_pfn() below could be made
 		 * invalid as soon as it's mapped. But the uhva is already
@@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 
 		write_unlock_irq(&gpc->lock);
 
+		/*
+		 * Invalidation occurs from the invalidate_range_start() hook,
+		 * which could already have happened before __kvm_gpc_refresh()
+		 * (or the previous turn around this loop) took gpc->lock.
+		 * If so, and if the corresponding invalidate_range_end() hook
+		 * hasn't happened yet, hva_to_pfn() could return a mapping
+		 * which is about to be stale and which should not be used. So
+		 * check if there are any currently-running invalidations which
+		 * affect the uHVA of this GPC, and retry if there are. Any
Any + * invalidation which starts after gpc->needs_invalidation is + * set is fine, because it will clear that flag and trigger a + * retry. And any invalidation which *completes* by having its + * invalidate_range_end() hook called immediately prior to this + * check is also fine, because the page tables are guaranteed + * to have been changted already, so hva_to_pfn() won't return + * a stale mapping in that case anyway. + */ + while (gpc_invalidations_pending(gpc)) { + cond_resched(); + write_lock_irq(&gpc->lock); + continue; + } + /* * If the previous iteration "failed" due to an mmu_notifier * event, release the pfn and unmap the kernel virtual address @@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) goto out_error; } + write_lock_irq(&gpc->lock); /* @@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) * attempting to refresh. */ WARN_ON_ONCE(gpc->valid); - } while (!gpc->needs_invalidation || - mmu_notifier_retry_cache(gpc->kvm, mmu_seq)); + } while (!gpc->needs_invalidation); gpc->valid = true; gpc->pfn = new_pfn; -- 2.44.0