While guest_memfd is not available to be mapped by userspace, it is still accessible through the kernel's direct map. This means that in scenarios where guest-private memory is not hardware protected, it can be speculatively read and its contents potentially leaked through hardware side-channels. Removing guest-private memory from the direct map, thus mitigates a large class of speculative execution issues [1, Table 1]. This patch adds a flag to the `KVM_CREATE_GUEST_MEMFD` which, if set, removes the struct pages backing guest-private memory from the direct map. Should `CONFIG_HAVE_KVM_GMEM_{INVALIDATE, PREPARE}` be set, pages are removed after preparation and before invalidation, so that the prepare/invalidate routines do not have to worry about potentially absent direct map entries. Direct map removal do not reuse the `KVM_GMEM_PREPARE` machinery, since `prepare` can be called multiple time, and it is the responsibility of the preparation routine to not "prepare" the same folio twice [2]. Thus, instead explicitly check if `filemap_grab_folio` allocated a new folio, and remove the returned folio from the direct map only if this was the case. The patch uses release_folio instead of free_folio to reinsert pages back into the direct map as by the time free_folio is called, folio->mapping can already be NULL. This means that a call to folio_inode inside free_folio might deference a NULL pointer, leaving no way to access the inode which stores the flags that allow determining whether the page was removed from the direct map in the first place. Lastly, the patch uses set_direct_map_{invalid,default}_noflush instead of `set_memory_[n]p` to avoid expensive flushes of TLBs and the L*-cache hierarchy. This is especially important once KVM restores direct map entries on-demand in later patches, where simple FIO benchmarks of a virtio-blk device have shown that TLB flushes on a Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz resulted in 80% degradation in throughput compared to a non-flushing solution. Not flushing the TLB means that until TLB entries for temporarily restored direct map entries get naturally evicted, they can be used during speculative execution, and effectively "unhide" the memory for longer than intended. We consider this acceptable, as the only pages that are temporarily reinserted into the direct map like this will either hold PV data structures (kvm-clock, asyncpf, etc), or pages containing privileged instructions inside the guest kernel image (in the MMIO emulation case). [1]: https://download.vusec.net/papers/quarantine_raid23.pdf Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx> --- include/uapi/linux/kvm.h | 2 ++ virt/kvm/guest_memfd.c | 52 ++++++++++++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e065d9fe7ab2..409116aa23c9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1563,4 +1563,6 @@ struct kvm_create_guest_memfd { __u64 reserved[6]; }; +#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0) + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 9148b9679bb1..dc9b0c2d0b0e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include <linux/kvm_host.h> #include <linux/pagemap.h> #include <linux/anon_inodes.h> +#include <linux/set_memory.h> #include "kvm_mm.h" @@ -49,9 +50,16 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol return 0; } +static bool kvm_gmem_not_present(struct inode *inode) +{ + return ((unsigned long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) != 0; +} + static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) { struct folio *folio; + bool zap_direct_map = false; + int r; /* TODO: Support huge pages. */ folio = filemap_grab_folio(inode->i_mapping, index); @@ -74,16 +82,30 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool for (i = 0; i < nr_pages; i++) clear_highpage(folio_page(folio, i)); + // We need to clear the folio before calling kvm_gmem_prepare_folio, + // but can only remove it from the direct map _after_ preparation is done. + zap_direct_map = kvm_gmem_not_present(inode); + folio_mark_uptodate(folio); } if (prepare) { - int r = kvm_gmem_prepare_folio(inode, index, folio); - if (r < 0) { - folio_unlock(folio); - folio_put(folio); - return ERR_PTR(r); - } + r = kvm_gmem_prepare_folio(inode, index, folio); + if (r < 0) + goto out_err; + } + + if (zap_direct_map) { + r = set_direct_map_invalid_noflush(&folio->page); + if (r < 0) + goto out_err; + + // We use the private flag to track whether the folio has been removed + // from the direct map. This is because inside of ->free_folio, + // we do not have access to the address_space anymore, meaning we + // cannot check folio_inode(folio)->i_private to determine whether + // KVM_GMEM_NO_DIRECT_MAP was set. + folio_set_private(folio); } /* @@ -91,6 +113,10 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool * unevictable and there is no storage to write back to. */ return folio; +out_err: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(r); } static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, @@ -354,10 +380,22 @@ static void kvm_gmem_free_folio(struct folio *folio) } #endif +static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end) +{ + if (start == 0 && end == PAGE_SIZE) { + // We only get here if PG_private is set, which only happens if kvm_gmem_not_present + // returned true in kvm_gmem_get_folio. Thus no need to do that check again. + BUG_ON(set_direct_map_default_noflush(&folio->page)); + + folio_clear_private(folio); + } +} + static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, + .invalidate_folio = kvm_gmem_invalidate_folio, #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE .free_folio = kvm_gmem_free_folio, #endif @@ -443,7 +481,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) { loff_t size = args->size; u64 flags = args->flags; - u64 valid_flags = 0; + u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP; if (flags & ~valid_flags) return -EINVAL; -- 2.45.2