Re: [PATCH v10 29/59] KVM: arm64: nv: Unmap/flush shadow stage 2 page tables

Hi Marc,

On 15-05-2023 11:00 pm, Marc Zyngier wrote:
From: Christoffer Dall <christoffer.dall@xxxxxxxxxx>

Unmap/flush shadow stage 2 page tables for the nested VMs as well as the
stage 2 page table for the guest hypervisor.

Note: A bunch of the code in mmu.c relating to MMU notifiers is
currently dealt with in an extremely abrupt way, for example by clearing
out an entire shadow stage-2 table. This will be handled in a more
efficient way using the reverse mapping feature in a later version of
the patch series.

We are seeing spin-lock contention due to this patch when the guest hypervisor (L1) is booted with a high number of cores and automatic NUMA balancing is enabled on L0. kvm_nested_s2_unmap() is called from the MMU notifier callback during NUMA page migration, and since it runs with the mmu_lock held, it becomes a source of contention when many vCPUs are processing auto-NUMA page faults/migrations at the same time.
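For reference, the path we see contending looks roughly like this (a simplified sketch of our understanding, not code from the series):

/*
 * Sketch of the contended path: automatic NUMA balancing on L0
 * write-protects/unmaps pages for migration, which fires the MMU
 * notifier. The generic KVM notifier code takes kvm->mmu_lock for
 * write before invoking the arch handler, so with this patch every
 * such event serializes all vCPU faults behind a full walk of every
 * valid shadow stage-2 table.
 *
 *   task_numa_work() / NUMA page migration       (on L0)
 *     mmu_notifier_invalidate_range_start()
 *       kvm_unmap_gfn_range()                    (mmu_lock held for write)
 *         kvm_unmap_stage2_range(&kvm->arch.mmu, ...)
 *         kvm_nested_s2_unmap(kvm)               (0..kvm_phys_size() walk
 *                                                 per valid shadow MMU)
 */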

Signed-off-by: Christoffer Dall <christoffer.dall@xxxxxxxxxx>
Signed-off-by: Jintack Lim <jintack.lim@xxxxxxxxxx>
Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx>
---
  arch/arm64/include/asm/kvm_mmu.h    |  3 +++
  arch/arm64/include/asm/kvm_nested.h |  3 +++
  arch/arm64/kvm/mmu.c                | 30 ++++++++++++++++++----
  arch/arm64/kvm/nested.c             | 39 +++++++++++++++++++++++++++++
  4 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 896acdf98e71..d155b3871c4c 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -169,6 +169,8 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
  			   void __iomem **haddr);
  int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
  			     void **haddr);
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu,
+			    phys_addr_t addr, phys_addr_t end);
  void __init free_hyp_pgds(void);
  void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size);
@@ -177,6 +179,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
  void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  			  phys_addr_t pa, unsigned long size, bool writable);
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index f20d272fcd6d..d330b947d48a 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -111,6 +111,9 @@ extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
  extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
  				    struct kvm_s2_trans *trans);
  extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
+extern void kvm_nested_s2_wp(struct kvm *kvm);
+extern void kvm_nested_s2_unmap(struct kvm *kvm);
+extern void kvm_nested_s2_flush(struct kvm *kvm);
  int handle_wfx_nested(struct kvm_vcpu *vcpu, bool is_wfe);
  extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1e19c59b8235..8144bb9b9ec8 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -245,13 +245,19 @@ void kvm_unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
  	__unmap_stage2_range(mmu, start, size, true);
  }
+void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu,
+			    phys_addr_t addr, phys_addr_t end)
+{
+	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush);
+}
+
  static void stage2_flush_memslot(struct kvm *kvm,
  				 struct kvm_memory_slot *memslot)
  {
  	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
  	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
-	stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
+	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
  }
  /**
@@ -274,6 +280,8 @@ static void stage2_flush_vm(struct kvm *kvm)
  	kvm_for_each_memslot(memslot, bkt, slots)
  		stage2_flush_memslot(kvm, memslot);
+	kvm_nested_s2_flush(kvm);
+
  	write_unlock(&kvm->mmu_lock);
  	srcu_read_unlock(&kvm->srcu, idx);
  }
@@ -888,6 +896,8 @@ void stage2_unmap_vm(struct kvm *kvm)
  	kvm_for_each_memslot(memslot, bkt, slots)
  		stage2_unmap_memslot(kvm, memslot);
+	kvm_nested_s2_unmap(kvm);
+
  	write_unlock(&kvm->mmu_lock);
  	mmap_read_unlock(current->mm);
  	srcu_read_unlock(&kvm->srcu, idx);
@@ -987,12 +997,12 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
  }
  /**
- * stage2_wp_range() - write protect stage2 memory region range
+ * kvm_stage2_wp_range() - write protect stage2 memory region range
   * @mmu:        The KVM stage-2 MMU pointer
   * @addr:	Start address of range
   * @end:	End address of range
   */
-static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
+void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
  {
  	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
  }
@@ -1023,7 +1033,8 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
  	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
  	write_lock(&kvm->mmu_lock);
-	stage2_wp_range(&kvm->arch.mmu, start, end);
+	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
+	kvm_nested_s2_wp(kvm);
  	write_unlock(&kvm->mmu_lock);
  	kvm_flush_remote_tlbs(kvm);
  }
@@ -1047,7 +1058,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
  	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
  	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
-	stage2_wp_range(&kvm->arch.mmu, start, end);
+	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
  }
  /*
@@ -1062,6 +1073,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
  		gfn_t gfn_offset, unsigned long mask)
  {
  	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+	kvm_nested_s2_wp(kvm);
  }
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
@@ -1720,6 +1732,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
  			     (range->end - range->start) << PAGE_SHIFT,
  			     range->may_block);
+	kvm_nested_s2_unmap(kvm);

This kvm_nested_s2_unmap()/kvm_unmap_stage2_range() is called for every active L2, and each page-table walk iterates for a long time, since kvm_phys_size(mmu) is quite large (an IPA space of at least 48 bits). What would be the best fix if we want to avoid this unnecessarily long page-table walk?
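One direction we have been toying with is bounding the walk per shadow MMU. Below is a rough sketch only: mmu->mapped_end is a hypothetical new field that would have to be maintained whenever the shadow stage-2 is populated, and the proper fix is presumably the reverse mapping the commit message refers to.

/*
 * Hypothetical sketch, not a patch: clamp the unmap walk to the
 * highest IPA each shadow MMU has actually mapped, instead of
 * always walking the full kvm_phys_size(mmu) range.
 */
void kvm_nested_s2_unmap_capped(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];

		if (!kvm_s2_mmu_valid(mmu))
			continue;

		/* mapped_end is hypothetical: highest IPA ever mapped */
		kvm_unmap_stage2_range(mmu, 0,
				       min(mmu->mapped_end, kvm_phys_size(mmu)));
	}
}

This would still over-invalidate, since without a reverse map we cannot translate an L1 IPA range back to the L2 IPA ranges that shadow it, but it would at least bound the walk for shadow MMUs that only ever mapped a small region.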

  	return false;
  }
@@ -1754,6 +1767,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
  			       PAGE_SIZE, __pfn_to_phys(pfn),
  			       KVM_PGTABLE_PROT_R, NULL, 0);
+	kvm_nested_s2_unmap(kvm);
  	return false;
  }
@@ -1772,6 +1786,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
  					range->start << PAGE_SHIFT);
  	pte = __pte(kpte);
  	return pte_valid(pte) && pte_young(pte);
+
+	/*
+	 * TODO: Handle nested_mmu structures here using the reverse mapping in
+	 * a later version of patch series.
+	 */
  }
  bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -2004,6 +2023,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
  	write_lock(&kvm->mmu_lock);
  	kvm_unmap_stage2_range(&kvm->arch.mmu, gpa, size);
+	kvm_nested_s2_unmap(kvm);
  	write_unlock(&kvm->mmu_lock);
  }
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 73c0be25345a..948ac5b9638c 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -523,6 +523,45 @@ int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
  	return kvm_inject_nested_sync(vcpu, esr_el2);
  }
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_wp(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (kvm_s2_mmu_valid(mmu))
+			kvm_stage2_wp_range(mmu, 0, kvm_phys_size(mmu));
+	}
+}
+
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_unmap(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (kvm_s2_mmu_valid(mmu))
+			kvm_unmap_stage2_range(mmu, 0, kvm_phys_size(mmu));
+	}
+}
+
+/* expects kvm->mmu_lock to be held */
+void kvm_nested_s2_flush(struct kvm *kvm)
+{
+	int i;
+
+	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
+		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
+
+		if (kvm_s2_mmu_valid(mmu))
+			kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
+	}
+}
+
  void kvm_arch_flush_shadow_all(struct kvm *kvm)
  {
  	int i;

Thanks,
Ganapat


