[PATCH RFC] KVM: ARM: Change pgd_mutex to spinlock

Requires pre-allocating page table memory.  We steal the
mmu_memory_cache implementation from x86 and adapt it for our specific
usage.
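
Roughly, the fault path now follows this pattern (a sketch only, using
the helpers introduced in mmu.c below):

	/* May sleep: fill the per-vcpu cache before taking the spinlock. */
	ret = mmu_topup_memory_cache(&vcpu->arch.mmu_page_cache,
				     2, KVM_NR_MEM_OBJS);
	if (ret)
		return ret;

	spin_lock(&vcpu->kvm->arch.pgd_lock);
	/*
	 * No sleeping allowed here: stage2_set_pte() pulls pre-allocated
	 * pages from the cache via mmu_memory_cache_alloc() instead of
	 * calling pmd_alloc_one()/pte_alloc_one_kernel().
	 */
	stage2_set_pte(vcpu->kvm, &vcpu->arch.mmu_page_cache,
		       fault_ipa, &new_pte);
	spin_unlock(&vcpu->kvm->arch.pgd_lock);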

Also removes taking the slots_lock in hva_to_gpa, as this lock only
synchronizes writers and the read is performed under rcu_read_lock.
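
Conceptually the reader side only needs an RCU read-side critical
section around the memslot walk, along these lines (illustrative sketch
only; the exact rcu/srcu annotations come from the generic
kvm_memslots() helper and are not part of this patch):

	rcu_read_lock();
	slots = kvm_memslots(kvm);	/* dereferences kvm->memslots under RCU */
	/* ... walk the slots looking for one that contains hva ... */
	rcu_read_unlock();

Writers still serialize on slots_lock and publish a new slots array, so
a reader never observes a half-updated memslot.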

Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>

I am merging this and pushing it out to avoid debugging errors on the
v10-stage branch, but comments are of course still welcome.

---
 arch/arm/include/asm/kvm_host.h |   16 +++++-
 arch/arm/include/asm/kvm_mmu.h  |    2 +
 arch/arm/kvm/arm.c              |    3 +-
 arch/arm/kvm/mmu.c              |  103 ++++++++++++++++++++++++++-------------
 4 files changed, 87 insertions(+), 37 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index bb924c0..088725e 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -45,7 +45,7 @@ struct kvm_arch {
 	u32    vmid;

 	/* 1-level 2nd stage table and lock */
-	struct mutex pgd_mutex;
+	spinlock_t pgd_lock;
 	pgd_t *pgd;

 	/* VTTBR value associated with above pgd and vmid */
@@ -62,6 +62,17 @@ struct kvm_arch {
 #define EXCEPTION_IRQ       0x02
 #define EXCEPTION_FIQ       0x01

+#define KVM_NR_MEM_OBJS     40
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+	int nobjs;
+	void *objects[KVM_NR_MEM_OBJS];
+};
+
 struct kvm_vcpu_regs {
 	u32 usr_regs[15];	/* R0_usr - R14_usr */
 	u32 svc_regs[3];	/* SP_svc, LR_svc, SPSR_svc */
@@ -143,6 +154,9 @@ struct kvm_vcpu_arch {

 	/* Hyp exception information */
 	u32 hyp_pc;		/* PC when exception was taken from Hyp mode */
+
+	/* Cache some mmu pages needed inside spinlock regions */
+	struct kvm_mmu_memory_cache mmu_page_cache;
 };

 struct kvm_vm_stat {
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index c59ecf9..c3f90b0 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -41,4 +41,6 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
 int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);

+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index b390d7a..390e8b8 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -108,7 +108,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	ret = kvm_alloc_stage2_pgd(kvm);
 	if (ret)
 		goto out_fail_alloc;
-	mutex_init(&kvm->arch.pgd_mutex);
+	spin_lock_init(&kvm->arch.pgd_lock);

 	ret = create_hyp_mappings(kvm, kvm + 1);
 	if (ret)
@@ -239,6 +239,7 @@ out:

 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	kvm_mmu_free_memory_caches(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }

diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1b9027f..30d0cfc 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,6 +32,38 @@

 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+				  int min, int max)
+{
+	void *page;
+
+	BUG_ON(max > KVM_NR_MEM_OBJS);
+	if (cache->nobjs >= min)
+		return 0;
+	while (cache->nobjs < max) {
+		page = (void *)__get_free_page(PGALLOC_GFP);
+		if (!page)
+			return -ENOMEM;
+		cache->objects[cache->nobjs++] = page;
+	}
+	return 0;
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+	while (mc->nobjs)
+		free_page((unsigned long)mc->objects[--mc->nobjs]);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
+{
+	void *p;
+
+	BUG_ON(!mc || !mc->nobjs);
+	p = mc->objects[--mc->nobjs];
+	return p;
+}
+
 static void free_ptes(pmd_t *pmd, unsigned long addr)
 {
 	pte_t *pte;
@@ -384,8 +416,8 @@ static void stage2_clear_pte(struct kvm *kvm, phys_addr_t addr)
 	put_page(page);
 }

-static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr,
-			  const pte_t *new_pte)
+static void stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+			   phys_addr_t addr, const pte_t *new_pte)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -396,11 +428,9 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr,
 	pgd = kvm->arch.pgd + pgd_index(addr);
 	pud = pud_offset(pgd, addr);
 	if (pud_none(*pud)) {
-		pmd = pmd_alloc_one(NULL, addr);
-		if (!pmd) {
-			kvm_err("Cannot allocate 2nd stage pmd\n");
-			return -ENOMEM;
-		}
+		if (!cache)
+			return; /* ignore calls from kvm_set_spte_hva */
+		pmd = mmu_memory_cache_alloc(cache);
 		pud_populate(NULL, pud, pmd);
 		pmd += pmd_index(addr);
 		get_page(virt_to_page(pud));
@@ -409,11 +439,10 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr,

 	/* Create 2nd stage page table mapping - Level 2 */
 	if (pmd_none(*pmd)) {
-		pte = pte_alloc_one_kernel(NULL, addr);
-		if (!pte) {
-			kvm_err("Cannot allocate 2nd stage pte\n");
-			return -ENOMEM;
-		}
+		if (!cache)
+			return; /* ignore calls from kvm_set_spte_hva */
+		pte = mmu_memory_cache_alloc(cache);
+		clean_pte_table(pte);
 		pmd_populate_kernel(NULL, pmd, pte);
 		pte += pte_index(addr);
 		get_page(virt_to_page(pmd));
@@ -424,8 +453,6 @@ static int stage2_set_pte(struct kvm *kvm, phys_addr_t addr,
 	BUG_ON(pte_none(pte));
 	set_pte_ext(pte, *new_pte, 0);
 	get_page(virt_to_page(pte));
-
-	return 0;
 }

 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -436,6 +463,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	pfn_t pfn;
 	int ret;
 	bool write_fault, writable;
+	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;

 	/* TODO: Use instr. decoding for non-ISV to determine r/w fault */
 	if (is_iabt)
@@ -460,14 +488,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}

-	mutex_lock(&vcpu->kvm->arch.pgd_mutex);
+	/* We need minimum second+third level pages */
+	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
+	if (ret)
+		return ret;
 	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
 	if (writable)
 		new_pte |= L_PTE2_WRITE;
-	ret = stage2_set_pte(vcpu->kvm, fault_ipa, &new_pte);
-	if (ret)
-		put_page(pfn_to_page(pfn));
-	mutex_unlock(&vcpu->kvm->arch.pgd_mutex);
+	spin_lock(&vcpu->kvm->arch.pgd_lock);
+	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte);
+	spin_unlock(&vcpu->kvm->arch.pgd_lock);

 	return ret;
 }
@@ -487,24 +517,29 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 	pgprot_t prot;
 	int ret = 0;
 	unsigned long pfn;
+	struct kvm_mmu_memory_cache cache = { 0, };

 	end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
 	prot = __pgprot(get_mem_type_prot_pte(MT_DEVICE) | L_PTE_USER |
 			L_PTE2_READ | L_PTE2_WRITE);
 	pfn = __phys_to_pfn(pa);

-	mutex_lock(&kvm->arch.pgd_mutex);
+
 	for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
 		pte_t pte = pfn_pte(pfn, prot);

-		ret = stage2_set_pte(kvm, addr, &pte);
+		ret = mmu_topup_memory_cache(&cache, 2, 2);
 		if (ret)
-			break;
+			goto out;
+		spin_lock(&kvm->arch.pgd_lock);
+		stage2_set_pte(kvm, &cache, addr, &pte);
+		spin_unlock(&kvm->arch.pgd_lock);

 		pfn++;
 	}
-	mutex_unlock(&kvm->arch.pgd_mutex);

+out:
+	mmu_free_memory_cache(&cache);
 	return ret;
 }

@@ -725,7 +760,6 @@ static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa)
 	struct kvm_memory_slot *memslot;
 	bool found = false;

-	mutex_lock(&kvm->slots_lock);
 	slots = kvm_memslots(kvm);

 	/* we only care about the pages that the guest sees */
@@ -743,7 +777,6 @@ static bool hva_to_gpa(struct kvm *kvm, unsigned long hva, gpa_t *gpa)
 		}
 	}

-	mutex_unlock(&kvm->slots_lock);
 	return found;
 }

@@ -755,12 +788,12 @@ int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 	if (!kvm->arch.pgd)
 		return 0;

-	mutex_lock(&kvm->arch.pgd_mutex);
 	found = hva_to_gpa(kvm, hva, &gpa);
 	if (found) {
+		spin_lock(&kvm->arch.pgd_lock);
 		stage2_clear_pte(kvm, gpa);
+		spin_unlock(&kvm->arch.pgd_lock);
 	}
-	mutex_unlock(&kvm->arch.pgd_mutex);
 	return 0;
 }

@@ -772,16 +805,16 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 	if (!kvm->arch.pgd)
 		return;

-	mutex_lock(&kvm->arch.pgd_mutex);
 	found = hva_to_gpa(kvm, hva, &gpa);
 	if (found) {
-		stage2_set_pte(kvm, gpa, &pte);
-		/*
-		 * Ignore return code from stage2_set_pte, since -ENOMEM would
-		 * indicate this IPA is is not mapped and there is no harm
-		 * that the PTE changed.
-		 */
+		spin_lock(&kvm->arch.pgd_lock);
+		stage2_set_pte(kvm, NULL, gpa, &pte);
+		spin_unlock(&kvm->arch.pgd_lock);
 		__kvm_tlb_flush_vmid(kvm);
 	}
-	mutex_unlock(&kvm->arch.pgd_mutex);
+}
+
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
 }
-- 
1.7.9.5

