[RFC PATCH 6/8] kvm: gmem: Temporarily restore direct map entries when needed

Patrick Roy <roypat@xxxxxxxxxxxx> · Tue, 9 Jul 2024 14:20:34 +0100

If KVM_GMEM_NO_DIRECT_MAP is set, and KVM tries to internally access
guest-private memory inside kvm_{read,write}_guest, or via a
gfn_to_pfn_cache, temporarily restore the direct map entry.

To avoid race conditions between two threads restoring or zapping direct
map entries for the same page and potentially interfering with each
other (e.g. unfortune interweavings of map->read->unmap in the form of
map(A)->map(B)->read(A)->unmap(A)->read(B) [BOOM]), the following
invariant is upheld in this patch:

- Only a single gfn_to_pfn_cache can exist for any given pfn, and
- All non-gfn_to_pfn_cache code paths that temporarily restore direct
  map entries complete the entire map->access->unmap critical section
while holding the folio lock.

To remember whether a given folio currently has a direct map entry, use
the PG_private flag. If this flag is set, then the folio is removed from
the direct map, otherwise it is present in the direct map.
Modifications of this flag, together with the corresponding direct map
manipulations, must happen while holding the folio's lock.

A gfn_to_pfn_cache cannot hold the folio lock for its entire lifetime,
so it operates as follows: In gpc_map, under folio lock, restore the
direct map entry and set PG_private to 0. In gpc_unmap, zap the direct
map entry again and set PG_private back to 1.

If inside gpc_map the cache finds a folio that has PG_private set to 0,
it knows that another gfn_to_pfn_cache is currently active for the given
pfn (as this is the only scenario in which PG_private can be 0 without
the folio lock being held), and so it returns -EINVAL.

The only other interesting scenario is then if kvm_{read,write}_guest is
called for a gfn whose translation is currently cached inside a
gfn_to_pfn_cache. In this case, kvm_{read,write}_guest notices that
PG_private is 0 and skips all direct map manipulations. Since it is
holding the folio lock, it can be sure that gpc_unmap cannot
concurrently zap the direct map entries while kvm_{read,write}_guest
still needs them.

Note that this implementation is slightly too restrictive, as sometimes
multiple gfn_to_pfn_caches need to be active for the same gfn (for
example, each vCPU has its own kvm-clock structure, which they all try
to put into the same gfn).

Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx>
---
 virt/kvm/kvm_main.c | 59 +++++++++++++++++++++---------
 virt/kvm/pfncache.c | 89 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 123 insertions(+), 25 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4357f7cdf040..f968f1f3d7f7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -52,6 +52,7 @@
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
+#include <asm/set_memory.h>
 #include <linux/uaccess.h>
 
 #include "coalesced_mmio.h"
@@ -3291,8 +3292,8 @@ static int __kvm_read_guest_private_page(struct kvm *kvm,
 					 void *data, int offset, int len)
 {
 	kvm_pfn_t pfn;
-	int r;
-	struct page *page;
+	int r = 0;
+	struct folio *folio;
 	void *kaddr;
 
 	if (!kvm_can_access_gmem(kvm))
@@ -3303,13 +3304,24 @@ static int __kvm_read_guest_private_page(struct kvm *kvm,
 	if (r < 0)
 		return -EFAULT;
 
-	page = pfn_to_page(pfn);
-	lock_page(page);
-	kaddr = page_address(page) + offset;
-	memcpy(data, kaddr, len);
-	unlock_page(page);
-	put_page(page);
-	return 0;
+	folio = pfn_folio(pfn);
+	folio_lock(folio);
+	kaddr = folio_address(folio);
+	if (folio_test_private(folio)) {
+		r = set_direct_map_default_noflush(&folio->page);
+		if (r)
+			goto out_unlock;
+	}
+	memcpy(data, kaddr + offset, len);
+	if (folio_test_private(folio)) {
+		r = set_direct_map_invalid_noflush(&folio->page);
+		if (r)
+			goto out_unlock;
+	}
+out_unlock:
+	folio_unlock(folio);
+	folio_put(folio);
+	return r;
 }
 
 static int __kvm_vcpu_read_guest_private_page(struct kvm_vcpu *vcpu,
@@ -3437,8 +3449,8 @@ static int __kvm_write_guest_private_page(struct kvm *kvm,
 					  const void *data, int offset, int len)
 {
 	kvm_pfn_t pfn;
-	int r;
-	struct page *page;
+	int r = 0;
+	struct folio *folio;
 	void *kaddr;
 
 	if (!kvm_can_access_gmem(kvm))
@@ -3449,14 +3461,25 @@ static int __kvm_write_guest_private_page(struct kvm *kvm,
 	if (r < 0)
 		return -EFAULT;
 
-	page = pfn_to_page(pfn);
-	lock_page(page);
-	kaddr = page_address(page) + offset;
-	memcpy(kaddr, data, len);
-	unlock_page(page);
-	put_page(page);
+	folio = pfn_folio(pfn);
+	folio_lock(folio);
+	kaddr = folio_address(folio);
+	if (folio_test_private(folio)) {
+		r = set_direct_map_default_noflush(&folio->page);
+		if (r)
+			goto out_unlock;
+	}
+	memcpy(kaddr + offset, data, len);
+	if (folio_test_private(folio)) {
+		r = set_direct_map_invalid_noflush(&folio->page);
+		if (r)
+			goto out_unlock;
+	}
 
-	return 0;
+out_unlock:
+	folio_unlock(folio);
+	folio_put(folio);
+	return r;
 }
 
 static int __kvm_vcpu_write_guest_private_page(struct kvm_vcpu *vcpu,
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index 6430e0a49558..95d2d5cdaa12 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -16,6 +16,9 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/errno.h>
+#include <linux/pagemap.h>
+
+#include <asm/set_memory.h>
 
 #include "kvm_mm.h"
 
@@ -99,10 +102,68 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
 	return true;
 }
 
-static void *gpc_map(kvm_pfn_t pfn)
+static int gpc_map_gmem(kvm_pfn_t pfn)
 {
-	if (pfn_valid(pfn))
+	int r = 0;
+	struct folio *folio = pfn_folio(pfn);
+	struct inode *inode = folio_inode(folio);
+
+	if (((unsigned long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) == 0)
+		goto out;
+
+	/* We need to avoid race conditions where set_memory_np is called for
+	 * pages that other parts of KVM still try to access.  We use the
+	 * PG_private bit for this. If it is set, then the page is removed from
+	 * the direct map. If it is cleared, the page is present in the direct
+	 * map. All changes to this bit, and all modifications of the direct
+	 * map entries for the page happen under the page lock. The _only_
+	 * place where a page will be in the direct map while the page lock is
+	 * _not_ held is if it is inside a gpc. All other parts of KVM that
+	 * temporarily re-insert gmem pages into the direct map (currently only
+	 * guest_{read,write}_page) take the page lock before the direct map
+	 * entry is restored, and hold it until it is zapped again. This means
+	 * - If we reach gpc_map while, say, guest_read_page is operating on
+	 *   this page, we block on acquiring the page lock until
+	 *   guest_read_page is done.
+	 * - If we reach gpc_map while another gpc is already caching this
+	 *   page, the page is present in the direct map and the PG_private
+	 *   flag is cleared. Int his case, we return -EINVAL below to avoid
+	 *   two gpcs caching the same page (since we do not ref-count
+	 *   insertions back into the direct map, when the first cache gets
+	 *   invalidated it would "break" the second cache that assumes the
+	 *   page is present in the direct map until the second cache itself
+	 *   gets invalidated).
+	 * - Lastly, if guest_read_page is called for a page inside of a gpc,
+	 *   it will see that the PG_private flag is cleared, and thus assume
+	 *   it is present in the direct map (and leave the direct map entry
+	 *   untouched). Since it will be holding the page lock, it cannot race
+	 *   with gpc_unmap.
+	 */
+	folio_lock(folio);
+	if (folio_test_private(folio)) {
+		r = set_direct_map_default_noflush(&folio->page);
+		if (r)
+			goto out_unlock;
+
+		folio_clear_private(folio);
+	} else {
+		r = -EINVAL;
+	}
+out_unlock:
+	folio_unlock(folio);
+out:
+	return r;
+}
+
+static void *gpc_map(kvm_pfn_t pfn, bool private)
+{
+	if (pfn_valid(pfn)) {
+		if (private) {
+			if (gpc_map_gmem(pfn))
+				return NULL;
+		}
 		return kmap(pfn_to_page(pfn));
+	}
 
 #ifdef CONFIG_HAS_IOMEM
 	return memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
@@ -111,13 +172,27 @@ static void *gpc_map(kvm_pfn_t pfn)
 #endif
 }
 
-static void gpc_unmap(kvm_pfn_t pfn, void *khva)
+static void gpc_unmap(kvm_pfn_t pfn, void *khva, bool private)
 {
 	/* Unmap the old pfn/page if it was mapped before. */
 	if (is_error_noslot_pfn(pfn) || !khva)
 		return;
 
 	if (pfn_valid(pfn)) {
+		if (private) {
+			struct folio *folio = pfn_folio(pfn);
+			struct inode *inode = folio_inode(folio);
+
+			if ((unsigned long)inode->i_private &
+			    KVM_GMEM_NO_DIRECT_MAP) {
+				folio_lock(folio);
+				BUG_ON(folio_test_private(folio));
+				BUG_ON(set_direct_map_invalid_noflush(
+					&folio->page));
+				folio_set_private(folio);
+				folio_unlock(folio);
+			}
+		}
 		kunmap(pfn_to_page(pfn));
 		return;
 	}
@@ -195,7 +270,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 			 * the existing mapping and didn't create a new one.
 			 */
 			if (new_khva != old_khva)
-				gpc_unmap(new_pfn, new_khva);
+				gpc_unmap(new_pfn, new_khva, gpc->is_private);
 
 			kvm_release_pfn_clean(new_pfn);
 
@@ -224,7 +299,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 		if (new_pfn == gpc->pfn)
 			new_khva = old_khva;
 		else
-			new_khva = gpc_map(new_pfn);
+			new_khva = gpc_map(new_pfn, gpc->is_private);
 
 		if (!new_khva) {
 			kvm_release_pfn_clean(new_pfn);
@@ -379,7 +454,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
 	write_unlock_irq(&gpc->lock);
 
 	if (unmap_old)
-		gpc_unmap(old_pfn, old_khva);
+		gpc_unmap(old_pfn, old_khva, old_private);
 
 	return ret;
 }
@@ -500,6 +575,6 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
 		list_del(&gpc->list);
 		spin_unlock(&kvm->gpc_lock);
 
-		gpc_unmap(old_pfn, old_khva);
+		gpc_unmap(old_pfn, old_khva, gpc->is_private);
 	}
 }
-- 
2.45.2