Add support for mmap() and fault() for guest_memfd in the host. The ability to fault in a guest page is contingent on that page being shared with the host. The guest_memfd PRIVATE memory attribute is not used for two reasons. First because it reflects the userspace expectation for that memory location, and therefore can be toggled by userspace. The second is, although each guest_memfd file has a 1:1 binding with a KVM instance, the plan is to allow multiple files per inode, e.g. to allow intra-host migration to a new KVM instance, without destroying guest_memfd. The mapping is restricted to only memory explicitly shared with the host. KVM checks that the host doesn't have any mappings for private memory via the folio's refcount. To avoid races between paths that check mappability and paths that check whether the host has any mappings (via the refcount), the folio lock is held in while either check is being performed. This new feature is gated with a new configuration option, CONFIG_KVM_GMEM_MAPPABLE. Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx> Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx> Co-developed-by: Elliot Berman <quic_eberman@xxxxxxxxxxx> Signed-off-by: Elliot Berman <quic_eberman@xxxxxxxxxxx> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx> --- Note that the functions kvm_gmem_is_mapped(), kvm_gmem_set_mappable(), and int kvm_gmem_clear_mappable() are not used in this patch series. They are intended to be used in future patches [*], which check and toggle mapability when the guest shares/unshares pages with the host. [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.12-v3-pkvm --- include/linux/kvm_host.h | 52 +++++++++++ virt/kvm/Kconfig | 4 + virt/kvm/guest_memfd.c | 185 +++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 138 +++++++++++++++++++++++++++++ 4 files changed, 379 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index acf85995b582..bda7fda9945e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2527,4 +2527,56 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GMEM_MAPPABLE +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end); +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end); +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end); +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end); +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, + gfn_t end); +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, + gfn_t end); +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn); +#else +static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end) +{ + WARN_ON_ONCE(1); + return false; +} +static inline bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end) +{ + WARN_ON_ONCE(1); + return false; +} +static inline int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, + gfn_t start, gfn_t end) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, + gfn_t start, gfn_t end) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +static inline bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, + gfn_t gfn) +{ + WARN_ON_ONCE(1); + return false; +} +#endif /* CONFIG_KVM_GMEM_MAPPABLE */ + #endif diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index fd6a3010afa8..2cfcb0848e37 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -120,3 +120,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_PRIVATE_MEM + +config KVM_GMEM_MAPPABLE + select KVM_PRIVATE_MEM + bool diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index f414646c475b..df3a6f05a16e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -370,7 +370,184 @@ static void kvm_gmem_init_mount(void) kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; } +#ifdef CONFIG_KVM_GMEM_MAPPABLE +static struct folio * +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, + int *max_order); + +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; + void *xval = xa_mk_value(true); + pgoff_t i; + bool r; + + filemap_invalidate_lock(inode->i_mapping); + for (i = start; i < end; i++) { + r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL)); + if (r) + break; + } + filemap_invalidate_unlock(inode->i_mapping); + + return r; +} + +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; + pgoff_t i; + int r = 0; + + filemap_invalidate_lock(inode->i_mapping); + for (i = start; i < end; i++) { + struct folio *folio; + + /* + * Holds the folio lock until after checking its refcount, + * to avoid races with paths that fault in the folio. + */ + folio = kvm_gmem_get_folio(inode, i); + if (WARN_ON_ONCE(IS_ERR(folio))) + continue; + + /* + * Check that the host doesn't have any mappings on clearing + * the mappable flag, because clearing the flag implies that the + * memory will be unshared from the host. Therefore, to maintain + * the invariant that the host cannot access private memory, we + * need to check that it doesn't have any mappings to that + * memory before making it private. + * + * Two references are expected because of kvm_gmem_get_folio(). + */ + if (folio_ref_count(folio) > 2) + r = -EPERM; + else + xa_erase(mappable_offsets, i); + + folio_put(folio); + folio_unlock(folio); + + if (r) + break; + } + filemap_invalidate_unlock(inode->i_mapping); + + return r; +} + +static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff) +{ + struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets; + bool r; + + filemap_invalidate_lock_shared(inode->i_mapping); + r = xa_find(mappable_offsets, &pgoff, pgoff, XA_PRESENT); + filemap_invalidate_unlock_shared(inode->i_mapping); + + return r; +} + +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end) +{ + struct inode *inode = file_inode(slot->gmem.file); + pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn; + pgoff_t end_off = start_off + end - start; + + return gmem_set_mappable(inode, start_off, end_off); +} + +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end) +{ + struct inode *inode = file_inode(slot->gmem.file); + pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn; + pgoff_t end_off = start_off + end - start; + + return gmem_clear_mappable(inode, start_off, end_off); +} + +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn) +{ + struct inode *inode = file_inode(slot->gmem.file); + unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn; + + return gmem_is_mappable(inode, pgoff); +} + +static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + /* + * Holds the folio lock until after checking whether it can be faulted + * in, to avoid races with paths that change a folio's mappability. + */ + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (!folio) + return VM_FAULT_SIGBUS; + + if (folio_test_hwpoison(folio)) { + ret = VM_FAULT_HWPOISON; + goto out; + } + + if (!gmem_is_mappable(inode, vmf->pgoff)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + if (!folio_test_uptodate(folio)) { + unsigned long nr_pages = folio_nr_pages(folio); + unsigned long i; + + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + + folio_mark_uptodate(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); +out: + if (ret != VM_FAULT_LOCKED) { + folio_put(folio); + folio_unlock(folio); + } + + return ret; +} + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault, +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + file_accessed(file); + vm_flags_set(vma, VM_DONTDUMP); + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} +#else +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} +#define kvm_gmem_mmap NULL +#endif /* CONFIG_KVM_GMEM_MAPPABLE */ + static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -557,6 +734,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_gmem; } + if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE)) { + err = gmem_set_mappable(file_inode(file), 0, size >> PAGE_SHIFT); + if (err) { + fput(file); + goto err_gmem; + } + } + kvm_get_kvm(kvm); gmem->kvm = kvm; xa_init(&gmem->bindings); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 05cbb2548d99..aed9cf2f1685 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3263,6 +3263,144 @@ static int next_segment(unsigned long len, int offset) return len; } +#ifdef CONFIG_KVM_GMEM_MAPPABLE +static bool __kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_memslot_iter iter; + + lockdep_assert_held(&kvm->slots_lock); + + kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) { + struct kvm_memory_slot *memslot = iter.slot; + gfn_t gfn_start, gfn_end, i; + + gfn_start = max(start, memslot->base_gfn); + gfn_end = min(end, memslot->base_gfn + memslot->npages); + if (WARN_ON_ONCE(gfn_start >= gfn_end)) + continue; + + for (i = gfn_start; i < gfn_end; i++) { + if (!kvm_slot_gmem_is_mappable(memslot, i)) + return false; + } + } + + return true; +} + +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end) +{ + bool r; + + mutex_lock(&kvm->slots_lock); + r = __kvm_gmem_is_mappable(kvm, start, end); + mutex_unlock(&kvm->slots_lock); + + return r; +} + +static bool kvm_gmem_is_pfn_mapped(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn_idx) +{ + struct page *page; + bool is_mapped; + kvm_pfn_t pfn; + + /* + * Holds the folio lock until after checking its refcount, + * to avoid races with paths that fault in the folio. + */ + if (WARN_ON_ONCE(kvm_gmem_get_pfn_locked(kvm, memslot, gfn_idx, &pfn, NULL))) + return false; + + page = pfn_to_page(pfn); + + /* Two references are expected because of kvm_gmem_get_pfn_locked(). */ + is_mapped = page_ref_count(page) > 2; + + put_page(page); + unlock_page(page); + + return is_mapped; +} + +static bool __kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end) +{ + struct kvm_memslot_iter iter; + + lockdep_assert_held(&kvm->slots_lock); + + kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) { + struct kvm_memory_slot *memslot = iter.slot; + gfn_t gfn_start, gfn_end, i; + + gfn_start = max(start, memslot->base_gfn); + gfn_end = min(end, memslot->base_gfn + memslot->npages); + if (WARN_ON_ONCE(gfn_start >= gfn_end)) + continue; + + for (i = gfn_start; i < gfn_end; i++) { + if (kvm_gmem_is_pfn_mapped(kvm, memslot, i)) + return true; + } + } + + return false; +} + +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end) +{ + bool r; + + mutex_lock(&kvm->slots_lock); + r = __kvm_gmem_is_mapped(kvm, start, end); + mutex_unlock(&kvm->slots_lock); + + return r; +} + +static int kvm_gmem_toggle_mappable(struct kvm *kvm, gfn_t start, gfn_t end, + bool is_mappable) +{ + struct kvm_memslot_iter iter; + int r = 0; + + mutex_lock(&kvm->slots_lock); + + kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) { + struct kvm_memory_slot *memslot = iter.slot; + gfn_t gfn_start, gfn_end; + + gfn_start = max(start, memslot->base_gfn); + gfn_end = min(end, memslot->base_gfn + memslot->npages); + if (WARN_ON_ONCE(start >= end)) + continue; + + if (is_mappable) + r = kvm_slot_gmem_set_mappable(memslot, gfn_start, gfn_end); + else + r = kvm_slot_gmem_clear_mappable(memslot, gfn_start, gfn_end); + + if (WARN_ON_ONCE(r)) + break; + } + + mutex_unlock(&kvm->slots_lock); + + return r; +} + +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end) +{ + return kvm_gmem_toggle_mappable(kvm, start, end, true); +} + +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end) +{ + return kvm_gmem_toggle_mappable(kvm, start, end, false); +} + +#endif /* CONFIG_KVM_GMEM_MAPPABLE */ + /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, int len) -- 2.47.0.rc0.187.ge670bccf7e-goog