Fuad Tabba <tabba@xxxxxxxxxx> writes:

> Make kvm_(read|/write)_guest_page() capable of accessing guest
> memory for slots that don't have a userspace address, but only if
> the memory is mappable, which also indicates that it is
> accessible by the host.

Fuad explained to me that this patch removes the need for userspace to
mmap a guest_memfd fd just to provide userspace_addr when only a
limited range of shared pages is required, e.g. for kvm_clock. (A
rough userspace-side sketch of what that setup could look like is at
the end of this mail.)

Questions to anyone who might be more familiar:

1. Should we let userspace save the trouble of providing
   userspace_addr if only KVM (and not userspace) needs to access the
   shared pages?

2. Other than kvm_{read,write}_guest_page, are there any other parts
   of KVM that may require updates so that guest_memfd can be used
   directly from the kernel?

Patrick, does this help to answer the question of "how does KVM
internally access guest_memfd for non-CoCo VMs" that you brought up
in this other thread [*]?

[*] https://lore.kernel.org/all/6bca3ad4-3eca-4a75-a775-5f8b0467d7a3@xxxxxxxxxxxx/

>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  virt/kvm/kvm_main.c | 137 ++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 118 insertions(+), 19 deletions(-)
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index aed9cf2f1685..77e6412034b9 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3399,23 +3399,114 @@ int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
>  	return kvm_gmem_toggle_mappable(kvm, start, end, false);
>  }
>
> +static int __kvm_read_guest_memfd_page(struct kvm *kvm,
> +				       struct kvm_memory_slot *slot,
> +				       gfn_t gfn, void *data, int offset,
> +				       int len)
> +{
> +	struct page *page;
> +	u64 pfn;
> +	int r;
> +
> +	/*
> +	 * Holds the folio lock until after checking whether it can be faulted
> +	 * in, to avoid races with paths that change a folio's mappability.
> +	 */
> +	r = kvm_gmem_get_pfn_locked(kvm, slot, gfn, &pfn, NULL);
> +	if (r)
> +		return r;
> +
> +	page = pfn_to_page(pfn);
> +
> +	if (!kvm_gmem_is_mappable(kvm, gfn, gfn + 1)) {
> +		r = -EPERM;
> +		goto unlock;
> +	}
> +	memcpy(data, page_address(page) + offset, len);
> +unlock:
> +	if (r)
> +		put_page(page);
> +	else
> +		kvm_release_pfn_clean(pfn);
> +	unlock_page(page);
> +
> +	return r;
> +}
> +
> +static int __kvm_write_guest_memfd_page(struct kvm *kvm,
> +					struct kvm_memory_slot *slot,
> +					gfn_t gfn, const void *data,
> +					int offset, int len)
> +{
> +	struct page *page;
> +	u64 pfn;
> +	int r;
> +
> +	/*
> +	 * Holds the folio lock until after checking whether it can be faulted
> +	 * in, to avoid races with paths that change a folio's mappability.
> +	 */
> +	r = kvm_gmem_get_pfn_locked(kvm, slot, gfn, &pfn, NULL);
> +	if (r)
> +		return r;
> +
> +	page = pfn_to_page(pfn);
> +
> +	if (!kvm_gmem_is_mappable(kvm, gfn, gfn + 1)) {
> +		r = -EPERM;
> +		goto unlock;
> +	}
> +	memcpy(page_address(page) + offset, data, len);
> +unlock:
> +	if (r)
> +		put_page(page);
> +	else
> +		kvm_release_pfn_dirty(pfn);
> +	unlock_page(page);
> +
> +	return r;
> +}
> +#else
> +static int __kvm_read_guest_memfd_page(struct kvm *kvm,
> +				       struct kvm_memory_slot *slot,
> +				       gfn_t gfn, void *data, int offset,
> +				       int len)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EIO;
> +}
> +
> +static int __kvm_write_guest_memfd_page(struct kvm *kvm,
> +					struct kvm_memory_slot *slot,
> +					gfn_t gfn, const void *data,
> +					int offset, int len)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EIO;
> +}
>  #endif /* CONFIG_KVM_GMEM_MAPPABLE */
>
>  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
> -static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
> -				 void *data, int offset, int len)
> +
> +static int __kvm_read_guest_page(struct kvm *kvm, struct kvm_memory_slot *slot,
> +				 gfn_t gfn, void *data, int offset, int len)
>  {
> -	int r;
>  	unsigned long addr;
>
>  	if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
>  		return -EFAULT;
>
> +	if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE) &&
> +	    kvm_slot_can_be_private(slot) &&
> +	    !slot->userspace_addr) {
> +		return __kvm_read_guest_memfd_page(kvm, slot, gfn, data,
> +						   offset, len);
> +	}
> +
>  	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
>  	if (kvm_is_error_hva(addr))
>  		return -EFAULT;
> -	r = __copy_from_user(data, (void __user *)addr + offset, len);
> -	if (r)
> +	if (__copy_from_user(data, (void __user *)addr + offset, len))
>  		return -EFAULT;
>  	return 0;
>  }
> @@ -3425,7 +3516,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
>  {
>  	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>
> -	return __kvm_read_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_read_guest_page(kvm, slot, gfn, data, offset, len);
>  }
>  EXPORT_SYMBOL_GPL(kvm_read_guest_page);
>
> @@ -3434,7 +3525,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
>  {
>  	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
>
> -	return __kvm_read_guest_page(slot, gfn, data, offset, len);
> +	return __kvm_read_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
>
> @@ -3511,22 +3602,30 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
>
>  /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */
>  static int __kvm_write_guest_page(struct kvm *kvm,
> -				  struct kvm_memory_slot *memslot, gfn_t gfn,
> -				  const void *data, int offset, int len)
> +				  struct kvm_memory_slot *slot, gfn_t gfn,
> +				  const void *data, int offset, int len)
>  {
> -	int r;
> -	unsigned long addr;
> -
>  	if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
>  		return -EFAULT;
>
> -	addr = gfn_to_hva_memslot(memslot, gfn);
> -	if (kvm_is_error_hva(addr))
> -		return -EFAULT;
> -	r = __copy_to_user((void __user *)addr + offset, data, len);
> -	if (r)
> -		return -EFAULT;
> -	mark_page_dirty_in_slot(kvm, memslot, gfn);
> +	if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE) &&
> +	    kvm_slot_can_be_private(slot) &&
> +	    !slot->userspace_addr) {
> +		int r = __kvm_write_guest_memfd_page(kvm, slot, gfn, data,
> +						     offset, len);
> +
> +		if (r)
> +			return r;
> +	} else {
> +		unsigned long addr = gfn_to_hva_memslot(slot, gfn);
> +
> +		if (kvm_is_error_hva(addr))
> +			return -EFAULT;
> +		if (__copy_to_user((void __user *)addr + offset, data, len))
> +			return -EFAULT;
> +	}
> +
> +	mark_page_dirty_in_slot(kvm, slot, gfn);
>  	return 0;
>  }
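
For anyone who wants the userspace-side picture: below is a rough,
hypothetical sketch (not from this series) of the memslot setup I
understand this patch to enable, i.e. a guest_memfd-backed slot that
is never mmap()ed and whose userspace_addr is left at 0, so that
in-kernel users such as the kvm_clock code reach the pages via the new
__kvm_{read,write}_guest_memfd_page() path. Whether leaving
userspace_addr at 0 like this should be supported is exactly what
question 1 above is asking. The sketch assumes the existing
KVM_CREATE_GUEST_MEMFD / KVM_SET_USER_MEMORY_REGION2 uAPI; vm_fd, the
slot number and the geometry are placeholders.

/*
 * Hypothetical userspace snippet: register a guest_memfd-backed slot
 * without providing a host mapping (userspace_addr == 0).
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int setup_gmem_slot(int vm_fd, __u64 gpa, __u64 size)
{
	struct kvm_create_guest_memfd gmem = {
		.size = size,
	};
	struct kvm_userspace_memory_region2 region = {
		.slot = 0,			/* placeholder slot number */
		.flags = KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr = gpa,
		.memory_size = size,
		.userspace_addr = 0,		/* no host mapping provided */
		.guest_memfd_offset = 0,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);

	if (gmem_fd < 0)
		return -1;

	region.guest_memfd = gmem_fd;

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}

With a slot registered like this, an in-kernel access such as
kvm_write_guest() would, with this patch, take the
__kvm_write_guest_memfd_page() branch because kvm_slot_can_be_private()
is true and slot->userspace_addr is 0, instead of going through a
userspace mapping.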