[RFC PATCH 009/008] kvm: gmem: Introduce "sharing refcount"

Patrick Roy <roypat@xxxxxxxxxxxx> · Fri, 9 Aug 2024 15:13:08 +0100

The assumption that there would never be two gfn_to_pfn_caches holding
the same gfn was wrong. The guest can put the kvm-clock structures for
different vCPUs into the same gfn. On KVM's side, one gfn_to_pfn_cache
is initialized by vCPU, meaning in multi-vCPU setups, multiple
gfn_to_pfn_caches to the same gfn exist.

For gmem, this means that multiple gfn_to_pfn_caches will want direct
map entries for the same page to be present - the direct map entry needs
to be removed when the first gpc is initialized, and can only be removed
again after the last gpc to this page is invalidated. To handle this,
introduce the concept of a "sharing refcount" to gmem: If something
inside of KVM wants to access gmem it should now do

struct folio *gmem_folio = /* ... */;
int r = kvm_gmem_folio_share(gmem_folio);
if (r)
    goto err;
/* do stuff */
kvm_gmem_folio_unshare(gmem_folio);

The first call to kvm_gmem_folio_share will increment this new "sharing
refcount" by 1 (and insert the folio back into the direct map if it
acquires the first refcount), while kvm_gmem_folio_unshare will
decrement the refcount by 1 (and remove the folio from the direct map
again if it held the last refcount).

One quirk is that we use "sharing_refcount == 1" to mean "folio is not
in the direct map" (aka not shared), as letting the refcount temporarily
drop to 0 would cause refcount_t functions to WARN.

Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx>
---
 virt/kvm/guest_memfd.c | 139 ++++++++++++++++++++++++++++++++++++++---
 virt/kvm/kvm_main.c    |  32 ++++------
 virt/kvm/kvm_mm.h      |   2 +
 virt/kvm/pfncache.c    |  54 ++--------------
 4 files changed, 148 insertions(+), 79 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 29abbc883c73a..05fd6149c11c2 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -55,6 +55,96 @@ static bool kvm_gmem_not_present(struct inode *inode)
 	return ((unsigned long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) != 0;
 }

+static int kvm_gmem_folio_private(struct folio* folio)
+{
+	unsigned long nr_pages = folio_nr_pages(folio);
+	unsigned long i;
+	int r;
+
+	/*
+	 * We must only remove direct map entries after the last "sharing
+	 * reference" has gone away.
+	 */
+	if(WARN_ON_ONCE(refcount_read(folio_get_private(folio)) != 1))
+		return -EPERM;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page = folio_page(folio, i);
+
+		r = set_direct_map_invalid_noflush(page);
+		if (r < 0)
+			goto out_remap;
+	}
+
+	// We use the private flag to track whether the folio has been removed
+	// from the direct map. This is because inside of ->free_folio,
+	// we do not have access to the address_space anymore, meaning we
+	// cannot check folio_inode(folio)->i_private to determine whether
+	// KVM_GMEM_NO_DIRECT_MAP was set.
+	folio_set_private(folio);
+	return 0;
+out_remap:
+	for (; i > 0; i--) {
+		struct page *page = folio_page(folio, i - 1);
+
+		BUG_ON(set_direct_map_default_noflush(page));
+	}
+	return r;
+}
+
+static int kvm_gmem_folio_clear_private(struct folio *folio)
+{
+	unsigned long start = (unsigned long)folio_address(folio);
+	unsigned long nr = folio_nr_pages(folio);
+	unsigned long i;
+	int r;
+
+	/*
+	 * We must restore direct map entries on acquiring the first "sharing
+	 * reference" (although restoring it before that is fine too - we
+	 * restore direct map entries with sharing_refcount == 1 in
+	 * kvm_gmem_invalidate_folio).
+	 */
+	WARN_ON_ONCE(refcount_read(folio_get_private(folio)) > 2);
+
+	for (i = 0; i < nr; i++) {
+		struct page *page = folio_page(folio, i);
+
+		r = set_direct_map_default_noflush(page);
+		if (r)
+			goto out_remap;
+	}
+	flush_tlb_kernel_range(start, start + folio_size(folio));
+
+	folio_clear_private(folio);
+	return 0;
+out_remap:
+	for (; i > 0; i--) {
+		for (; i > 0; i--) {
+			struct page *page = folio_page(folio, i - 1);
+
+			BUG_ON(set_direct_map_invalid_noflush(page));
+		}
+	}
+	return r;
+}
+
+static int kvm_gmem_init_sharing_count(struct folio *folio)
+{
+	refcount_t *sharing_count = kmalloc(sizeof(*sharing_count), GFP_KERNEL);
+	if (!sharing_count)
+		return -ENOMEM;
+
+	/*
+	 * we need to use sharing_count == 1 to mean "no sharing", because dropping
+	 * a refcount_t to 0 and later inc-ing it again would result in a WARN
+	 */
+	refcount_set(sharing_count, 1);
+	folio_change_private(folio, (void *)sharing_count);
+
+	return 0;
+}
+
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
 {
 	struct folio *folio;
@@ -96,16 +186,12 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool
 	}

 	if (zap_direct_map) {
-		r = set_direct_map_invalid_noflush(&folio->page);
+		r = kvm_gmem_init_sharing_count(folio);
+		if (r < 0)
+			goto out_err;
+		r = kvm_gmem_folio_private(folio);
 		if (r < 0)
 			goto out_err;
-
-		// We use the private flag to track whether the folio has been removed
-		// from the direct map. This is because inside of ->free_folio,
-		// we do not have access to the address_space anymore, meaning we
-		// cannot check folio_inode(folio)->i_private to determine whether
-		// KVM_GMEM_NO_DIRECT_MAP was set.
-		folio_set_private(folio);
 	}

 	/*
@@ -413,11 +499,21 @@ static void kvm_gmem_free_folio(struct folio *folio)
 static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end)
 {
 	if (start == 0 && end == PAGE_SIZE) {
+		refcount_t *sharing_count = folio_get_private(folio);
+		/*
+		 * sharing_count != 1 means that something else forgot
+		 * to call kvm_gmem_folio_unshare after it was done with the
+		 * folio (meaning the folio has been in the direct map
+		 * this entire time, which means we haven't been getting the
+		 * spculation protection we wanted).
+		 */
+		WARN_ON_ONCE(refcount_read(sharing_count) != 1);
+
 		// We only get here if PG_private is set, which only happens if kvm_gmem_not_present
 		// returned true in kvm_gmem_get_folio. Thus no need to do that check again.
-		BUG_ON(set_direct_map_default_noflush(&folio->page));
+		kvm_gmem_folio_clear_private(folio);
+		kfree(sharing_count);

-		folio_clear_private(folio);
 	}
 }

@@ -610,6 +706,29 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 	fput(file);
 }

+int kvm_gmem_folio_unshare(struct folio *folio)
+{
+	if (kvm_gmem_not_present(folio_inode(folio))) {
+		refcount_t *sharing_count = folio_get_private(folio);
+
+		refcount_dec(sharing_count);
+		if (refcount_read(sharing_count) == 1)
+			return kvm_gmem_folio_private(folio);
+	}
+	return 0;
+}
+
+int kvm_gmem_folio_share(struct folio *folio)
+{
+	if (kvm_gmem_not_present(folio_inode(folio))) {
+		refcount_inc(folio_get_private(folio));
+
+		if (folio_test_private(folio))
+			return kvm_gmem_folio_clear_private(folio);
+	}
+	return 0;
+}
+
 static int __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
 		       gfn_t gfn, kvm_pfn_t *pfn, int *max_order, bool prepare)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 762decd9f2da0..d0680564ad52f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3301,17 +3301,13 @@ static int __kvm_read_guest_private_page(struct kvm *kvm,
 	folio = pfn_folio(pfn);
 	folio_lock(folio);
 	kaddr = folio_address(folio);
-	if (folio_test_private(folio)) {
-		r = set_direct_map_default_noflush(&folio->page);
-		if (r)
-			goto out_unlock;
-	}
+	r = kvm_gmem_folio_share(folio);
+	if (r)
+		goto out_unlock;
 	memcpy(data, kaddr + offset, len);
-	if (folio_test_private(folio)) {
-		r = set_direct_map_invalid_noflush(&folio->page);
-		if (r)
-			goto out_unlock;
-	}
+	r = kvm_gmem_folio_unshare(folio);
+	if (r)
+		goto out_unlock;
 out_unlock:
 	folio_unlock(folio);
 	folio_put(folio);
@@ -3458,17 +3454,13 @@ static int __kvm_write_guest_private_page(struct kvm *kvm,
 	folio = pfn_folio(pfn);
 	folio_lock(folio);
 	kaddr = folio_address(folio);
-	if (folio_test_private(folio)) {
-		r = set_direct_map_default_noflush(&folio->page);
-		if (r)
-			goto out_unlock;
-	}
+	r = kvm_gmem_folio_share(folio);
+	if (r)
+		goto out_unlock;
 	memcpy(kaddr + offset, data, len);
-	if (folio_test_private(folio)) {
-		r = set_direct_map_invalid_noflush(&folio->page);
-		if (r)
-			goto out_unlock;
-	}
+	r = kvm_gmem_folio_unshare(folio);
+	if (r)
+		goto out_unlock;

 out_unlock:
 	folio_unlock(folio);
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01f..f3fb31a39a66f 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -41,6 +41,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
 		  unsigned int fd, loff_t offset);
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+int kvm_gmem_folio_share(struct folio *folio);
+int kvm_gmem_folio_unshare(struct folio *folio);
 #else
 static inline void kvm_gmem_init(struct module *module)
 {
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 55f39fd60f8af..9f955e07efb90 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -111,45 +111,8 @@ static int gpc_map_gmem(kvm_pfn_t pfn)
 	if (((unsigned long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) == 0)
 		goto out;

-	/* We need to avoid race conditions where set_memory_np is called for
-	 * pages that other parts of KVM still try to access.  We use the
-	 * PG_private bit for this. If it is set, then the page is removed from
-	 * the direct map. If it is cleared, the page is present in the direct
-	 * map. All changes to this bit, and all modifications of the direct
-	 * map entries for the page happen under the page lock. The _only_
-	 * place where a page will be in the direct map while the page lock is
-	 * _not_ held is if it is inside a gpc. All other parts of KVM that
-	 * temporarily re-insert gmem pages into the direct map (currently only
-	 * guest_{read,write}_page) take the page lock before the direct map
-	 * entry is restored, and hold it until it is zapped again. This means
-	 * - If we reach gpc_map while, say, guest_read_page is operating on
-	 *   this page, we block on acquiring the page lock until
-	 *   guest_read_page is done.
-	 * - If we reach gpc_map while another gpc is already caching this
-	 *   page, the page is present in the direct map and the PG_private
-	 *   flag is cleared. Int his case, we return -EINVAL below to avoid
-	 *   two gpcs caching the same page (since we do not ref-count
-	 *   insertions back into the direct map, when the first cache gets
-	 *   invalidated it would "break" the second cache that assumes the
-	 *   page is present in the direct map until the second cache itself
-	 *   gets invalidated).
-	 * - Lastly, if guest_read_page is called for a page inside of a gpc,
-	 *   it will see that the PG_private flag is cleared, and thus assume
-	 *   it is present in the direct map (and leave the direct map entry
-	 *   untouched). Since it will be holding the page lock, it cannot race
-	 *   with gpc_unmap.
-	 */
 	folio_lock(folio);
-	if (folio_test_private(folio)) {
-		r = set_direct_map_default_noflush(&folio->page);
-		if (r)
-			goto out_unlock;
-
-		folio_clear_private(folio);
-	} else {
-		r = -EINVAL;
-	}
-out_unlock:
+	r = kvm_gmem_folio_share(folio);
 	folio_unlock(folio);
 out:
 	return r;
@@ -181,17 +144,10 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva, bool private)
 	if (pfn_valid(pfn)) {
 		if (private) {
 			struct folio *folio = pfn_folio(pfn);
-			struct inode *inode = folio_inode(folio);
-
-			if ((unsigned long)inode->i_private &
-			    KVM_GMEM_NO_DIRECT_MAP) {
-				folio_lock(folio);
-				BUG_ON(folio_test_private(folio));
-				BUG_ON(set_direct_map_invalid_noflush(
-					&folio->page));
-				folio_set_private(folio);
-				folio_unlock(folio);
-			}
+
+			folio_lock(folio);
+			kvm_gmem_folio_unshare(folio);
+			folio_unlock(folio);
 		}
 		kunmap(pfn_to_page(pfn));
 		return;
--
2.46.0