[RFC PATCH 5/8] kvm: gmem: add option to remove guest private memory from direct map

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



While guest_memfd is not available to be mapped by userspace, it is
still accessible through the kernel's direct map. This means that in
scenarios where guest-private memory is not hardware protected, it can
be speculatively read and its contents potentially leaked through
hardware side-channels. Removing guest-private memory from the direct
map, thus mitigates a large class of speculative execution issues
[1, Table 1].

This patch adds a flag to the `KVM_CREATE_GUEST_MEMFD` which, if set, removes the
struct pages backing guest-private memory from the direct map. Should
`CONFIG_HAVE_KVM_GMEM_{INVALIDATE, PREPARE}` be set, pages are removed
after preparation and before invalidation, so that the
prepare/invalidate routines do not have to worry about potentially
absent direct map entries.

Direct map removal do not reuse the `KVM_GMEM_PREPARE` machinery, since `prepare` can be
called multiple time, and it is the responsibility of the preparation
routine to not "prepare" the same folio twice [2]. Thus, instead
explicitly check if `filemap_grab_folio` allocated a new folio, and
remove the returned folio from the direct map only if this was the case.

The patch uses release_folio instead of free_folio to reinsert pages
back into the direct map as by the time free_folio is called,
folio->mapping can already be NULL. This means that a call to
folio_inode inside free_folio might deference a NULL pointer, leaving no
way to access the inode which stores the flags that allow determining
whether the page was removed from the direct map in the first place.

Lastly, the patch uses set_direct_map_{invalid,default}_noflush instead
of `set_memory_[n]p` to avoid expensive flushes of TLBs and the L*-cache
hierarchy. This is especially important once KVM restores direct map
entries on-demand in later patches, where simple FIO benchmarks of a
virtio-blk device have shown that TLB flushes on a Intel(R) Xeon(R)
Platinum 8375C CPU @ 2.90GHz resulted in 80% degradation in throughput
compared to a non-flushing solution.

Not flushing the TLB means that until TLB entries for temporarily
restored direct map entries get naturally evicted, they can be used
during speculative execution, and effectively "unhide" the memory for
longer than intended. We consider this acceptable, as the only pages
that are temporarily reinserted into the direct map like this will
either hold PV data structures (kvm-clock, asyncpf, etc), or pages
containing privileged instructions inside the guest kernel image (in the
MMIO emulation case).

[1]: https://download.vusec.net/papers/quarantine_raid23.pdf

Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx>
---
 include/uapi/linux/kvm.h |  2 ++
 virt/kvm/guest_memfd.c   | 52 ++++++++++++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index e065d9fe7ab2..409116aa23c9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1563,4 +1563,6 @@ struct kvm_create_guest_memfd {
 	__u64 reserved[6];
 };
 
+#define KVM_GMEM_NO_DIRECT_MAP                 (1ULL << 0)
+
 #endif /* __LINUX_KVM_H */
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 9148b9679bb1..dc9b0c2d0b0e 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/kvm_host.h>
 #include <linux/pagemap.h>
 #include <linux/anon_inodes.h>
+#include <linux/set_memory.h>
 
 #include "kvm_mm.h"
 
@@ -49,9 +50,16 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol
 	return 0;
 }
 
+static bool kvm_gmem_not_present(struct inode *inode)
+{
+	return ((unsigned long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) != 0;
+}
+
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare)
 {
 	struct folio *folio;
+	bool zap_direct_map = false;
+	int r;
 
 	/* TODO: Support huge pages. */
 	folio = filemap_grab_folio(inode->i_mapping, index);
@@ -74,16 +82,30 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool
 		for (i = 0; i < nr_pages; i++)
 			clear_highpage(folio_page(folio, i));
 
+		// We need to clear the folio before calling kvm_gmem_prepare_folio,
+		// but can only remove it from the direct map _after_ preparation is done.
+		zap_direct_map = kvm_gmem_not_present(inode);
+
 		folio_mark_uptodate(folio);
 	}
 
 	if (prepare) {
-		int r =	kvm_gmem_prepare_folio(inode, index, folio);
-		if (r < 0) {
-			folio_unlock(folio);
-			folio_put(folio);
-			return ERR_PTR(r);
-		}
+		r = kvm_gmem_prepare_folio(inode, index, folio);
+		if (r < 0)
+			goto out_err;
+	}
+
+	if (zap_direct_map) {
+		r = set_direct_map_invalid_noflush(&folio->page);
+		if (r < 0)
+			goto out_err;
+
+		// We use the private flag to track whether the folio has been removed
+		// from the direct map. This is because inside of ->free_folio,
+		// we do not have access to the address_space anymore, meaning we
+		// cannot check folio_inode(folio)->i_private to determine whether
+		// KVM_GMEM_NO_DIRECT_MAP was set.
+		folio_set_private(folio);
 	}
 
 	/*
@@ -91,6 +113,10 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool
 	 * unevictable and there is no storage to write back to.
 	 */
 	return folio;
+out_err:
+	folio_unlock(folio);
+	folio_put(folio);
+	return ERR_PTR(r);
 }
 
 static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -354,10 +380,22 @@ static void kvm_gmem_free_folio(struct folio *folio)
 }
 #endif
 
+static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end)
+{
+	if (start == 0 && end == PAGE_SIZE) {
+		// We only get here if PG_private is set, which only happens if kvm_gmem_not_present
+		// returned true in kvm_gmem_get_folio. Thus no need to do that check again.
+		BUG_ON(set_direct_map_default_noflush(&folio->page));
+
+		folio_clear_private(folio);
+	}
+}
+
 static const struct address_space_operations kvm_gmem_aops = {
 	.dirty_folio = noop_dirty_folio,
 	.migrate_folio	= kvm_gmem_migrate_folio,
 	.error_remove_folio = kvm_gmem_error_folio,
+	.invalidate_folio = kvm_gmem_invalidate_folio,
 #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE
 	.free_folio = kvm_gmem_free_folio,
 #endif
@@ -443,7 +481,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
 	loff_t size = args->size;
 	u64 flags = args->flags;
-	u64 valid_flags = 0;
+	u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP;
 
 	if (flags & ~valid_flags)
 		return -EINVAL;
-- 
2.45.2





[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux