kvm_zap_gfn_range contains an optimization to use gfn-limited TLB
flushes, if enabled. When using these limited flushes,
kvm_zap_gfn_range passes lock_flush_tlb=false to
slot_handle_level_range, which creates a race window whenever the
function drops the MMU lock in cond_resched_lock. See an example of
this race below:

CPU 0                       CPU 1                       CPU 2
// kvm_zap_gfn_range
mmu_lock()
// *ptep == pte_1
*ptep = 0
if (lock_flush_tlb)
        flush_tlbs()
mmu_unlock()
                            // In invalidate range
                            // MMU notifier
                            mmu_lock()
                            if (pte != 0)
                                    *ptep = 0
                                    flush = true
                            if (flush)
                                    flush_remote_tlbs()
                            mmu_unlock()
                            return
                            // Host MM reallocates
                            // page previously
                            // backing guest memory.
                                                        // Guest accesses
                                                        // invalid page
                                                        // through pte_1
                                                        // in its TLB!!

Avoid the above race by passing lock_flush_tlb=true from
kvm_zap_gfn_range and replacing kvm_flush_remote_tlbs with
kvm_flush_remote_tlbs_with_address in slot_handle_level_range. When
range based flushes are not enabled, kvm_flush_remote_tlbs_with_address
falls back to kvm_flush_remote_tlbs.

This does change the behavior of many other functions that indirectly
use slot_handle_level_range, iff the range based flushes are enabled.

The only potential problem I see with this is that kvm->tlbs_dirty
will be cleared less often; however, the only caller of
slot_handle_level_range that checks tlbs_dirty is
kvm_mmu_notifier_invalidate_range_start, which checks it and does a
kvm_flush_remote_tlbs after calling kvm_unmap_hva_range anyway.

Tested: Ran all kvm-unit-tests on an Intel Haswell machine with and
        without this patch. The patch introduced no new failures.

Signed-off-by: Ben Gardon <bgardon@xxxxxxxxxx>
Reviewed-by: Junaid Shahid <junaids@xxxxxxxxxx>
---
 arch/x86/kvm/mmu.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8d43b7c0f56fd..2e6abfa47a2e4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5501,7 +5501,9 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			if (flush && lock_flush_tlb) {
-				kvm_flush_remote_tlbs(kvm);
+				kvm_flush_remote_tlbs_with_address(kvm,
+						start_gfn,
+						iterator.gfn - start_gfn + 1);
 				flush = false;
 			}
 			cond_resched_lock(&kvm->mmu_lock);
@@ -5509,7 +5511,8 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
 	}
 
 	if (flush && lock_flush_tlb) {
-		kvm_flush_remote_tlbs(kvm);
+		kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
+				end_gfn - start_gfn + 1);
 		flush = false;
 	}
 
@@ -5660,13 +5663,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	bool flush_tlb = true;
 	bool flush = false;
 	int i;
 
-	if (kvm_available_flush_tlb_with_range())
-		flush_tlb = false;
-
 	spin_lock(&kvm->mmu_lock);
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -5681,14 +5680,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 			flush |= slot_handle_level_range(kvm, memslot,
 					kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
 					PT_MAX_HUGEPAGE_LEVEL, start,
-					end - 1, flush_tlb);
+					end - 1, true);
 		}
 	}
 
-	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
-				gfn_end - gfn_start + 1);
-
 	spin_unlock(&kvm->mmu_lock);
 }
 
-- 
2.21.0.360.g471c308f928-goog
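
P.S. The fallback claim above ("when range based flushes are not
enabled, kvm_flush_remote_tlbs_with_address falls back to
kvm_flush_remote_tlbs") boils down to: try the backend's range flush,
and do a full flush if that path is unavailable or fails. Below is a
minimal, self-contained user-space sketch of that dispatch; the struct
layout and the callback name are illustrative stand-ins, not the
kernel's real definitions.

/*
 * User-space model of the range-flush fallback. Build with:
 *   cc -std=c99 -Wall fallback_model.c
 */
#include <stdio.h>
#include <stddef.h>

struct kvm_tlb_range {
	unsigned long start_gfn;
	unsigned long pages;
};

struct kvm {
	/* Non-NULL only when the backend supports range-based remote
	 * TLB flushes (e.g. via Hyper-V enlightenments). */
	int (*tlb_remote_flush_with_range)(struct kvm *kvm,
					   struct kvm_tlb_range *range);
};

static void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	(void)kvm;
	printf("full remote TLB flush\n");
}

/* Try a range-based flush; fall back to a full flush if the backend
 * is absent or reports failure. */
static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
					       unsigned long start_gfn,
					       unsigned long pages)
{
	struct kvm_tlb_range range = { start_gfn, pages };
	int ret = -1;	/* stands in for -ENOTSUPP */

	if (kvm->tlb_remote_flush_with_range)
		ret = kvm->tlb_remote_flush_with_range(kvm, &range);
	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

static int range_flush(struct kvm *kvm, struct kvm_tlb_range *range)
{
	(void)kvm;
	printf("range flush: gfn %#lx, %lu pages\n",
	       range->start_gfn, range->pages);
	return 0;
}

int main(void)
{
	struct kvm no_range = { NULL };
	struct kvm with_range = { range_flush };

	/* No range support: degrades to the old full flush. */
	kvm_flush_remote_tlbs_with_address(&no_range, 0x1000, 512);
	/* Range support: only the requested gfns are flushed. */
	kvm_flush_remote_tlbs_with_address(&with_range, 0x1000, 512);
	return 0;
}

So on hosts without range-flush support this patch preserves today's
behavior exactly, while on hosts with it the flushes simply become
narrower.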