[PATCH] kvm: x86/mmu: Simplify pte_list_{add|remove}

Lai Jiangshan <jiangshanlai@xxxxxxxxx> · Fri, 13 Jan 2023 20:29:10 +0800

From: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>

Simplify pte_list_{add|remove} by ensuring all the non-head pte_list_desc
to be full and addition/removal actions being performed on the head.

To make pte_list_add() return a count as before, @tail_count is also
added to the struct pte_list_desc.

No visible performace is changed in tests.  But pte_list_add() is no longer
shown in the perf result for the COWed pages even the guest forks millions
of tasks.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@xxxxxxxxxxxx>
---
 arch/x86/kvm/mmu/mmu.c | 77 +++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 55c9fcd6ed4f..fc64f5f0822f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -131,10 +131,16 @@ module_param(dbg, bool, 0644);
 struct pte_list_desc {
 	struct pte_list_desc *more;
 	/*
-	 * Stores number of entries stored in the pte_list_desc.  No need to be
-	 * u64 but just for easier alignment.  When PTE_LIST_EXT, means full.
+	 * @spte_count: Stores number of entries stored in the pte_list_desc.
+	 * When PTE_LIST_EXT, means full.  All the non-head pte_list_desc must
+	 * be full.
+	 *
+	 * @tail_count: Stores number of entries stored in its tail descriptions.
+	 *
+	 * No need to be u32 but just for easier alignment.
 	 */
-	u64 spte_count;
+	u32 spte_count;
+	u32 tail_count;
 	u64 *sptes[PTE_LIST_EXT];
 };
 
@@ -917,22 +923,25 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[0] = (u64 *)rmap_head->val;
 		desc->sptes[1] = spte;
 		desc->spte_count = 2;
+		desc->tail_count = 0;
 		rmap_head->val = (unsigned long)desc | 1;
 		++count;
 	} else {
 		rmap_printk("%p %llx many->many\n", spte, *spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		while (desc->spte_count == PTE_LIST_EXT) {
-			count += PTE_LIST_EXT;
-			if (!desc->more) {
-				desc->more = kvm_mmu_memory_cache_alloc(cache);
-				desc = desc->more;
-				desc->spte_count = 0;
-				break;
-			}
-			desc = desc->more;
+		count = desc->tail_count + desc->spte_count;
+		/*
+		 * When the head pte_list_desc is full, the whole list must
+		 * be full since all the non-head pte_list_desc are full.
+		 * So just allocate a new head.
+		 */
+		if (desc->spte_count == PTE_LIST_EXT) {
+			desc = kvm_mmu_memory_cache_alloc(cache);
+			desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+			desc->spte_count = 0;
+			desc->tail_count = count;
+			rmap_head->val = (unsigned long)desc | 1;
 		}
-		count += desc->spte_count;
 		desc->sptes[desc->spte_count++] = spte;
 	}
 	return count;
@@ -940,30 +949,30 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 
 static void
 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
-			   struct pte_list_desc *desc, int i,
-			   struct pte_list_desc *prev_desc)
+			   struct pte_list_desc *desc, int i)
 {
-	int j = desc->spte_count - 1;
+	struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+	int j = head_desc->spte_count - 1;
 
-	desc->sptes[i] = desc->sptes[j];
-	desc->sptes[j] = NULL;
-	desc->spte_count--;
-	if (desc->spte_count)
+	/*
+	 * Grab an entry from the head pte_list_desc to ensure that
+	 * the non-head pte_list_desc are full.
+	 */
+	desc->sptes[i] = head_desc->sptes[j];
+	head_desc->sptes[j] = NULL;
+	head_desc->spte_count--;
+	if (head_desc->spte_count)
 		return;
-	if (!prev_desc && !desc->more)
+	if (!head_desc->more)
 		rmap_head->val = 0;
 	else
-		if (prev_desc)
-			prev_desc->more = desc->more;
-		else
-			rmap_head->val = (unsigned long)desc->more | 1;
-	mmu_free_pte_list_desc(desc);
+		rmap_head->val = (unsigned long)head_desc->more | 1;
+	mmu_free_pte_list_desc(head_desc);
 }
 
 static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	struct pte_list_desc *prev_desc;
 	int i;
 
 	if (!rmap_head->val) {
@@ -979,16 +988,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
 	} else {
 		rmap_printk("%p many->many\n", spte);
 		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-		prev_desc = NULL;
 		while (desc) {
 			for (i = 0; i < desc->spte_count; ++i) {
 				if (desc->sptes[i] == spte) {
-					pte_list_desc_remove_entry(rmap_head,
-							desc, i, prev_desc);
+					pte_list_desc_remove_entry(rmap_head, desc, i);
 					return;
 				}
 			}
-			prev_desc = desc;
 			desc = desc->more;
 		}
 		pr_err("%s: %p many->many\n", __func__, spte);
@@ -1035,7 +1041,6 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
 	struct pte_list_desc *desc;
-	unsigned int count = 0;
 
 	if (!rmap_head->val)
 		return 0;
@@ -1043,13 +1048,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 		return 1;
 
 	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
-	while (desc) {
-		count += desc->spte_count;
-		desc = desc->more;
-	}
-
-	return count;
+	return desc->tail_count + desc->spte_count;
 }
 
 static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
-- 
2.19.1.6.gb485710b