On Tue, Jul 09, 2024 at 02:20:33PM +0100, Patrick Roy wrote: > While guest_memfd is not available to be mapped by userspace, it is > still accessible through the kernel's direct map. This means that in > scenarios where guest-private memory is not hardware protected, it can > be speculatively read and its contents potentially leaked through > hardware side-channels. Removing guest-private memory from the direct > map, thus mitigates a large class of speculative execution issues > [1, Table 1]. > > This patch adds a flag to the `KVM_CREATE_GUEST_MEMFD` which, if set, removes the > struct pages backing guest-private memory from the direct map. Should > `CONFIG_HAVE_KVM_GMEM_{INVALIDATE, PREPARE}` be set, pages are removed > after preparation and before invalidation, so that the > prepare/invalidate routines do not have to worry about potentially > absent direct map entries. > > Direct map removal do not reuse the `KVM_GMEM_PREPARE` machinery, since `prepare` can be > called multiple time, and it is the responsibility of the preparation > routine to not "prepare" the same folio twice [2]. Thus, instead > explicitly check if `filemap_grab_folio` allocated a new folio, and > remove the returned folio from the direct map only if this was the case. > > The patch uses release_folio instead of free_folio to reinsert pages > back into the direct map as by the time free_folio is called, > folio->mapping can already be NULL. This means that a call to > folio_inode inside free_folio might deference a NULL pointer, leaving no > way to access the inode which stores the flags that allow determining > whether the page was removed from the direct map in the first place. > > Lastly, the patch uses set_direct_map_{invalid,default}_noflush instead > of `set_memory_[n]p` to avoid expensive flushes of TLBs and the L*-cache > hierarchy. 
This is especially important once KVM restores direct map > entries on-demand in later patches, where simple FIO benchmarks of a > virtio-blk device have shown that TLB flushes on a Intel(R) Xeon(R) > Platinum 8375C CPU @ 2.90GHz resulted in 80% degradation in throughput > compared to a non-flushing solution. > > Not flushing the TLB means that until TLB entries for temporarily > restored direct map entries get naturally evicted, they can be used > during speculative execution, and effectively "unhide" the memory for > longer than intended. We consider this acceptable, as the only pages > that are temporarily reinserted into the direct map like this will > either hold PV data structures (kvm-clock, asyncpf, etc), or pages > containing privileged instructions inside the guest kernel image (in the > MMIO emulation case). > > [1]: https://download.vusec.net/papers/quarantine_raid23.pdf > > Signed-off-by: Patrick Roy <roypat@xxxxxxxxxxxx> > --- > include/uapi/linux/kvm.h | 2 ++ > virt/kvm/guest_memfd.c | 52 ++++++++++++++++++++++++++++++++++------ > 2 files changed, 47 insertions(+), 7 deletions(-) > > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h > index e065d9fe7ab2..409116aa23c9 100644 > --- a/include/uapi/linux/kvm.h > +++ b/include/uapi/linux/kvm.h > @@ -1563,4 +1563,6 @@ struct kvm_create_guest_memfd { > __u64 reserved[6]; > }; > > +#define KVM_GMEM_NO_DIRECT_MAP (1ULL << 0) > + > #endif /* __LINUX_KVM_H */ > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 9148b9679bb1..dc9b0c2d0b0e 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -4,6 +4,7 @@ > #include <linux/kvm_host.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > +#include <linux/set_memory.h> > > #include "kvm_mm.h" > > @@ -49,9 +50,16 @@ static int kvm_gmem_prepare_folio(struct inode *inode, pgoff_t index, struct fol > return 0; > } > > +static bool kvm_gmem_not_present(struct inode *inode) > +{ > + return ((unsigned 
long)inode->i_private & KVM_GMEM_NO_DIRECT_MAP) != 0; > +} > + > static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool prepare) > { > struct folio *folio; > + bool zap_direct_map = false; > + int r; > > /* TODO: Support huge pages. */ > folio = filemap_grab_folio(inode->i_mapping, index); > @@ -74,16 +82,30 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool > for (i = 0; i < nr_pages; i++) > clear_highpage(folio_page(folio, i)); > > + // We need to clear the folio before calling kvm_gmem_prepare_folio, > + // but can only remove it from the direct map _after_ preparation is done. No C++ comments, please. > + zap_direct_map = kvm_gmem_not_present(inode); > + > folio_mark_uptodate(folio); > } > > if (prepare) { > - int r = kvm_gmem_prepare_folio(inode, index, folio); > - if (r < 0) { > - folio_unlock(folio); > - folio_put(folio); > - return ERR_PTR(r); > - } > + r = kvm_gmem_prepare_folio(inode, index, folio); > + if (r < 0) > + goto out_err; > + } > + > + if (zap_direct_map) { > + r = set_direct_map_invalid_noflush(&folio->page); It's not future-proof to presume that folio is a single page here. You should loop over folio pages and add a TLB flush after the loop. > + if (r < 0) > + goto out_err; > + > + // We use the private flag to track whether the folio has been removed > + // from the direct map. This is because inside of ->free_folio, > + // we do not have access to the address_space anymore, meaning we > + // cannot check folio_inode(folio)->i_private to determine whether > + // KVM_GMEM_NO_DIRECT_MAP was set. > + folio_set_private(folio); > } > > /* > @@ -91,6 +113,10 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index, bool > * unevictable and there is no storage to write back to. 
> */ > return folio; > +out_err: > + folio_unlock(folio); > + folio_put(folio); > + return ERR_PTR(r); > } > > static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, > @@ -354,10 +380,22 @@ static void kvm_gmem_free_folio(struct folio *folio) > } > #endif > > +static void kvm_gmem_invalidate_folio(struct folio *folio, size_t start, size_t end) > +{ > + if (start == 0 && end == PAGE_SIZE) { > + // We only get here if PG_private is set, which only happens if kvm_gmem_not_present > + // returned true in kvm_gmem_get_folio. Thus no need to do that check again. > + BUG_ON(set_direct_map_default_noflush(&folio->page)); Ditto. > + > + folio_clear_private(folio); > + } > +} > + > static const struct address_space_operations kvm_gmem_aops = { > .dirty_folio = noop_dirty_folio, > .migrate_folio = kvm_gmem_migrate_folio, > .error_remove_folio = kvm_gmem_error_folio, > + .invalidate_folio = kvm_gmem_invalidate_folio, > #ifdef CONFIG_HAVE_KVM_GMEM_INVALIDATE > .free_folio = kvm_gmem_free_folio, > #endif > @@ -443,7 +481,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) > { > loff_t size = args->size; > u64 flags = args->flags; > - u64 valid_flags = 0; > + u64 valid_flags = KVM_GMEM_NO_DIRECT_MAP; > > if (flags & ~valid_flags) > return -EINVAL; > -- > 2.45.2 > -- Sincerely yours, Mike.