IOMMUFD calls get_user_pages() for every mapping, which allocates shared
memory instead of using the private memory managed by KVM and the guest
memfd.

Add support for an IOMMUFD fd to the VFIO KVM device's KVM_DEV_VFIO_FILE
API, similar to the already existing VFIO device and VFIO group fds.
This registers the KVM instance in IOMMUFD together with a callback to
get a pfn of guest private memory for mapping it later in the IOMMU.
There is no callback for freeing as, for now, that is a generic
folio_put().

The callback uses the uptr to calculate the offset into the KVM memory
slot and find the private backing pfn; it is largely a copy of
kvm_gmem_get_pfn(). This relies on the private pages being pinned
beforehand.

Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxx>
---
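For completeness, this is how userspace is expected to wire an iommufd
into the VFIO KVM device once this is applied — the same
KVM_DEV_VFIO_FILE_ADD path as for VFIO fds. A minimal sketch; the helper
name is made up and error handling is trimmed:

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Illustrative only: add an iommufd (from /dev/iommu) to a new VFIO KVM device */
	static int vfio_kvm_device_add_fd(int vm_fd, int fd)
	{
		struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
		struct kvm_device_attr attr = {
			.group = KVM_DEV_VFIO_FILE,
			.attr = KVM_DEV_VFIO_FILE_ADD,
			.addr = (__u64)(unsigned long)&fd,
		};

		if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd))
			return -errno;
		return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) ? -errno : 0;
	}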
 drivers/iommu/iommufd/io_pagetable.h    |  3 +
 drivers/iommu/iommufd/iommufd_private.h |  4 +
 include/linux/iommufd.h                 |  6 ++
 include/linux/kvm_host.h                | 66 ++++++++++++++
 drivers/iommu/iommufd/io_pagetable.c    |  2 +
 drivers/iommu/iommufd/main.c            | 21 +++++
 drivers/iommu/iommufd/pages.c           | 94 +++++++++++++++++---
 virt/kvm/guest_memfd.c                  | 40 +++++++++
 virt/kvm/vfio.c                         | 58 ++++++++++--
 9 files changed, 275 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509b7e33..fc9239fc94c0 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -204,6 +204,9 @@ struct iopt_pages {
 	struct rb_root_cached access_itree;
 	/* Of iopt_area::pages_node */
 	struct rb_root_cached domains_itree;
+
+	struct kvm *kvm;
+	gmem_pin_t gmem_pin;
 };
 
 struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 92efe30a8f0d..bd5573ddcd9c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -10,6 +10,7 @@
 #include <linux/uaccess.h>
 #include <linux/iommu.h>
 #include <linux/iova_bitmap.h>
+#include <linux/iommufd.h>
 #include <uapi/linux/iommufd.h>
 
 #include "../iommu-priv.h"
@@ -28,6 +29,9 @@ struct iommufd_ctx {
 	/* Compatibility with VFIO no iommu */
 	u8 no_iommu_mode;
 	struct iommufd_ioas *vfio_ioas;
+
+	struct kvm *kvm;
+	gmem_pin_t gmem_pin;
 };
 
 /*
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ffc3a949f837..a990f604c044 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -9,6 +9,7 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/kvm_types.h>
 
 struct device;
 struct iommufd_device;
@@ -57,6 +58,11 @@ void iommufd_ctx_get(struct iommufd_ctx *ictx);
 #if IS_ENABLED(CONFIG_IOMMUFD)
 struct iommufd_ctx *iommufd_ctx_from_file(struct file *file);
 struct iommufd_ctx *iommufd_ctx_from_fd(int fd);
+bool iommufd_file_is_valid(struct file *file);
+typedef int (*gmem_pin_t)(struct kvm *kvm, void __user *uptr, gfn_t *gfn,
+			  kvm_pfn_t *pfn, int *max_order);
+void iommufd_file_set_kvm(struct file *file, struct kvm *kvm,
+			  gmem_pin_t gmem_pin);
 void iommufd_ctx_put(struct iommufd_ctx *ictx);
 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx,
 			   struct iommu_group *group);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fdb331b3e0d3..a09a346ba3ca 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1297,6 +1297,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
+struct kvm_memory_slot *uptr_to_memslot(struct kvm *kvm, void __user *uptr);
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
@@ -1713,6 +1714,22 @@ try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 	return NULL;
 }
 
+static inline struct kvm_memory_slot *
+try_get_memslot_uptr(struct kvm_memory_slot *slot, void __user *uptr)
+{
+	unsigned long base_upn;
+	unsigned long upn = (unsigned long)uptr >> PAGE_SHIFT;
+
+	if (!slot)
+		return NULL;
+
+	base_upn = slot->userspace_addr >> PAGE_SHIFT;
+	if (upn >= base_upn && upn < base_upn + slot->npages)
+		return slot;
+	else
+		return NULL;
+}
+
 /*
  * Returns a pointer to the memslot that contains gfn. Otherwise returns NULL.
  *
@@ -1741,6 +1758,22 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 	return approx ? slot : NULL;
 }
 
+static inline struct kvm_memory_slot *
+search_memslots_uptr(struct kvm_memslots *slots, void __user *uptr)
+{
+	unsigned long upn = (unsigned long)uptr >> PAGE_SHIFT;
+	struct kvm_memslot_iter iter;
+
+	kvm_for_each_memslot_in_gfn_range(&iter, slots, 0, 512ULL * SZ_1T) {
+		struct kvm_memory_slot *slot = iter.slot;
+		unsigned long base_upn = slot->userspace_addr >> PAGE_SHIFT;
+
+		if (upn >= base_upn && upn < base_upn + slot->npages)
+			return slot;
+	}
+	return NULL;
+}
+
 static inline struct kvm_memory_slot *
 ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 {
@@ -1760,6 +1793,25 @@ ____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
 	return NULL;
 }
 
+static inline struct kvm_memory_slot *
+____uptr_to_memslot(struct kvm_memslots *slots, void __user *uptr)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot);
+	slot = try_get_memslot_uptr(slot, uptr);
+	if (slot)
+		return slot;
+
+	slot = search_memslots_uptr(slots, uptr);
+	if (slot) {
+		atomic_long_set(&slots->last_used_slot, (unsigned long)slot);
+		return slot;
+	}
+
+	return NULL;
+}
+
 /*
  * __gfn_to_memslot() and its descendants are here to allow arch code to inline
  * the lookups in hot paths.  gfn_to_memslot() itself isn't here as an inline
@@ -1771,6 +1823,12 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
 	return ____gfn_to_memslot(slots, gfn, false);
 }
 
+static inline struct kvm_memory_slot *
+__uptr_to_memslot(struct kvm_memslots *slots, void __user *uptr)
+{
+	return ____uptr_to_memslot(slots, uptr);
+}
+
 static inline unsigned long
 __gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 {
@@ -2446,6 +2504,8 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
 #ifdef CONFIG_KVM_PRIVATE_MEM
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+int kvm_gmem_uptr_to_pfn(struct kvm *kvm, void __user *uptr, gfn_t *gfn,
+			 kvm_pfn_t *pfn, int *max_order);
 #else
 static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 				   struct kvm_memory_slot *slot, gfn_t gfn,
@@ -2454,6 +2514,12 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
 	KVM_BUG_ON(1, kvm);
 	return -EIO;
 }
+static inline int kvm_gmem_uptr_to_pfn(struct kvm *kvm, void __user *uptr, gfn_t *gfn,
+				       kvm_pfn_t *pfn, int *max_order)
+{
+	KVM_BUG_ON(1, kvm);
+	return -EIO;
+}
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
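To illustrate the lookup math above with made-up numbers: for a slot with
userspace_addr = 0x7f0000000000, base_gfn = 0x100000 and npages = 0x200,
and assuming 4K pages (PAGE_SHIFT == 12), a uptr of 0x7f0000042000 gives:

	upn      = 0x7f0000042000 >> 12 = 0x7f0000042
	base_upn = 0x7f0000000000 >> 12 = 0x7f0000000
	upn - base_upn = 0x42, which is below npages, so the slot matches
	gfn      = base_gfn + 0x42 = 0x100042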
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 05fd9d3abf1b..aa7584d4a2b8 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -412,6 +412,8 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
 		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
 	elm.start_byte = uptr - elm.pages->uptr;
 	elm.length = length;
+	elm.pages->kvm = ictx->kvm;
+	elm.pages->gmem_pin = ictx->gmem_pin;
 	list_add(&elm.next, &pages_list);
 
 	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 83bbd7c5d160..b6039f7c1cce 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -17,6 +17,7 @@
 #include <linux/bug.h>
 #include <uapi/linux/iommufd.h>
 #include <linux/iommufd.h>
+#include <linux/kvm_host.h>
 
 #include "io_pagetable.h"
 #include "iommufd_private.h"
@@ -488,6 +489,26 @@ struct iommufd_ctx *iommufd_ctx_from_fd(int fd)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD);
 
+bool iommufd_file_is_valid(struct file *file)
+{
+	return file->f_op == &iommufd_fops;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_file_is_valid, IOMMUFD);
+
+void iommufd_file_set_kvm(struct file *file, struct kvm *kvm, gmem_pin_t gmem_pin)
+{
+	struct iommufd_ctx *ictx = iommufd_ctx_from_file(file);
+
+	if (WARN_ON(!ictx))
+		return;
+
+	ictx->kvm = kvm;
+	ictx->gmem_pin = gmem_pin;
+
+	iommufd_ctx_put(ictx);
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_file_set_kvm, IOMMUFD);
+
 /**
  * iommufd_ctx_put - Put back a reference
  * @ictx: Context to put back
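These two exports are meant to be consumed as a pair. The vfio.c hunks
further down resolve them via symbol_get() — so kvm.ko does not grow a
hard link-time dependency on iommufd.ko — and in effect do:

	if (kvm_iommufd_file_is_valid(filp))	/* wrapper around iommufd_file_is_valid() */
		kvm_iommufd_file_set_kvm(kvf->file, dev->kvm, kvm_gmem_uptr_to_pfn);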
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 117f644a0c5b..d85b6969d9ea 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -52,6 +52,8 @@
 #include <linux/highmem.h>
 #include <linux/kthread.h>
 #include <linux/iommufd.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
 
 #include "io_pagetable.h"
 #include "double_span.h"
@@ -622,6 +624,33 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
 			break;
 }
 
+static void memfd_unpin_user_page_range_dirty_lock(struct page *page,
+						   unsigned long npages,
+						   bool make_dirty)
+{
+	unsigned long i, nr;
+
+	for (i = 0; i < npages; i += nr) {
+		struct page *next = nth_page(page, i);
+		struct folio *folio = page_folio(next);
+
+		if (folio_test_large(folio))
+			nr = min_t(unsigned int, npages - i,
+				   folio_nr_pages(folio) -
+				   folio_page_idx(folio, next));
+		else
+			nr = 1;
+
+		if (make_dirty && !folio_test_dirty(folio)) {
+			/* FIXME: do we need this? private memory does not swap */
+			folio_lock(folio);
+			folio_mark_dirty(folio);
+			folio_unlock(folio);
+		}
+		folio_put(folio);
+	}
+}
+
 static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
 			unsigned int first_page_off, size_t npages)
 {
@@ -638,9 +667,14 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
 		size_t to_unpin = min_t(size_t, npages,
 					batch->npfns[cur] - first_page_off);
 
-		unpin_user_page_range_dirty_lock(
-			pfn_to_page(batch->pfns[cur] + first_page_off),
-			to_unpin, pages->writable);
+		if (pages->kvm)
+			memfd_unpin_user_page_range_dirty_lock(
+				pfn_to_page(batch->pfns[cur] + first_page_off),
+				to_unpin, pages->writable);
+		else
+			unpin_user_page_range_dirty_lock(
+				pfn_to_page(batch->pfns[cur] + first_page_off),
+				to_unpin, pages->writable);
 		iopt_pages_sub_npinned(pages, to_unpin);
 		cur++;
 		first_page_off = 0;
@@ -777,17 +811,51 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user,
 		return -EFAULT;
 
 	uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
-	if (!remote_mm)
-		rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
-					 user->upages);
-	else {
-		if (!user->locked) {
-			mmap_read_lock(pages->source_mm);
-			user->locked = 1;
+
+	if (pages->kvm) {
+		if (WARN_ON(!pages->gmem_pin))
+			return -EFAULT;
+
+		rc = 0;
+		for (unsigned long i = 0; i < npages; ++i, uptr += PAGE_SIZE) {
+			gfn_t gfn = 0;
+			kvm_pfn_t pfn = 0;
+			int max_order = 0, rc1;
+
+			rc1 = pages->gmem_pin(pages->kvm, (void *)uptr,
+					      &gfn, &pfn, &max_order);
+			if (rc1 == -EINVAL && i == 0) {
+				pr_err_once("Must be vfio mmio at gfn=%llx pfn=%llx, skipping\n",
+					    gfn, pfn);
+				goto the_usual;
+			}
+
+			if (rc1) {
+				pr_err("%s: %d %ld %lx -> %lx\n", __func__,
+				       rc1, i, (unsigned long)uptr, (unsigned long)pfn);
+				rc = rc1;
+				break;
+			}
+
+			user->upages[i] = pfn_to_page(pfn);
+		}
+
+		if (!rc)
+			rc = npages;
+	} else {
+the_usual:
+		if (!remote_mm) {
+			rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
+						 user->upages);
+		} else {
+			if (!user->locked) {
+				mmap_read_lock(pages->source_mm);
+				user->locked = 1;
+			}
+			rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
+						   user->gup_flags, user->upages,
+						   &user->locked);
 		}
-		rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
-					   user->gup_flags, user->upages,
-					   &user->locked);
 	}
 	if (rc <= 0) {
 		if (WARN_ON(!rc))
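With the hunks above, the pin path for a mapping backed by guest private
memory becomes, roughly:

	IOMMU_IOAS_MAP
	  iopt_map_user_pages()        /* copies ictx->kvm and ictx->gmem_pin */
	    iopt_map_pages()
	      pfn_reader_user_pin()
	        pages->gmem_pin()      /* kvm_gmem_uptr_to_pfn(), added below */
	          __kvm_gmem_get_pfn() /* returns the backing folio */

and unpinning goes through batch_unpin() ->
memfd_unpin_user_page_range_dirty_lock() -> folio_put().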
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e930014b4bdc..07ff561208fd 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -659,6 +659,46 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
 	return folio;
 }
 
+int kvm_gmem_uptr_to_pfn(struct kvm *kvm, void __user *uptr, gfn_t *gfn,
+			 kvm_pfn_t *pfn, int *max_order)
+{
+	struct kvm_memory_slot *slot = __uptr_to_memslot(kvm_memslots(kvm),
+							 uptr);
+	bool is_prepared = false;
+	unsigned long upn_off;
+	struct folio *folio;
+	struct file *file;
+	int r = 0;
+
+	if (!slot)
+		return -EFAULT;
+
+	file = kvm_gmem_get_file(slot);
+	if (!file)
+		return -EFAULT;
+
+	upn_off = ((unsigned long)uptr - slot->userspace_addr) >> PAGE_SHIFT;
+	*gfn = slot->base_gfn + upn_off;
+
+	folio = __kvm_gmem_get_pfn(file, slot, *gfn, pfn, &is_prepared, max_order, true);
+	if (IS_ERR(folio)) {
+		r = PTR_ERR(folio);
+		goto out;
+	}
+
+	if (!is_prepared)
+		r = kvm_gmem_prepare_folio(kvm, slot, *gfn, folio);
+
+	folio_unlock(folio);
+	if (r < 0)
+		folio_put(folio);
+
+out:
+	fput(file);
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_uptr_to_pfn);
+
 int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		     gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
 {
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index a4e9db212adc..7c1d859a58e8 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
 #include <linux/tsm.h>
+#include <linux/iommufd.h>
 #include "vfio.h"
 
 #ifdef CONFIG_SPAPR_TCE_IOMMU
@@ -25,6 +26,7 @@
 struct kvm_vfio_file {
 	struct list_head node;
 	struct file *file;
+	bool is_iommufd;
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	struct iommu_group *iommu_group;
 #endif
@@ -87,6 +89,36 @@ static bool kvm_vfio_file_is_valid(struct file *file)
 	return ret;
 }
 
+static bool kvm_iommufd_file_is_valid(struct file *file)
+{
+	bool (*fn)(struct file *file);
+	bool ret;
+
+	fn = symbol_get(iommufd_file_is_valid);
+	if (!fn)
+		return false;
+
+	ret = fn(file);
+
+	symbol_put(iommufd_file_is_valid);
+
+	return ret;
+}
+
+static void kvm_iommufd_file_set_kvm(struct file *file, struct kvm *kvm,
+				     gmem_pin_t gmem_pin)
+{
+	void (*fn)(struct file *file, struct kvm *kvm, gmem_pin_t gmem_pin);
+
+	fn = symbol_get(iommufd_file_set_kvm);
+	if (!fn)
+		return;
+
+	fn(file, kvm, gmem_pin);
+
+	symbol_put(iommufd_file_set_kvm);
+}
+
 static struct vfio_device *kvm_vfio_file_device(struct file *file)
 {
 	struct vfio_device *(*fn)(struct file *file);
@@ -167,7 +199,7 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 {
 	struct kvm_vfio *kv = dev->private;
 	struct kvm_vfio_file *kvf;
-	struct file *filp;
+	struct file *filp = NULL;
 	int ret = 0;
 
 	filp = fget(fd);
@@ -175,7 +207,7 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 		return -EBADF;
 
 	/* Ensure the FD is a vfio FD. */
-	if (!kvm_vfio_file_is_valid(filp)) {
+	if (!kvm_vfio_file_is_valid(filp) && !kvm_iommufd_file_is_valid(filp)) {
 		ret = -EINVAL;
 		goto out_fput;
 	}
@@ -196,11 +228,18 @@ static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
 	}
 
 	kvf->file = get_file(filp);
+
 	list_add_tail(&kvf->node, &kv->file_list);
 
 	kvm_arch_start_assignment(dev->kvm);
-	kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
-	kvm_vfio_update_coherency(dev);
+	kvf->is_iommufd = kvm_iommufd_file_is_valid(filp);
+
+	if (kvf->is_iommufd) {
+		kvm_iommufd_file_set_kvm(kvf->file, dev->kvm, kvm_gmem_uptr_to_pfn);
+	} else {
+		kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
+		kvm_vfio_update_coherency(dev);
+	}
 
 out_unlock:
 	mutex_unlock(&kv->lock);
@@ -233,7 +272,11 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		if (kvf->is_iommufd)
+			kvm_iommufd_file_set_kvm(kvf->file, NULL, NULL);
+		else
+			kvm_vfio_file_set_kvm(kvf->file, NULL);
+
 		fput(kvf->file);
 		kfree(kvf);
 		ret = 0;
@@ -476,7 +519,10 @@ static void kvm_vfio_release(struct kvm_device *dev)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 		kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
 #endif
-		kvm_vfio_file_set_kvm(kvf->file, NULL);
+		if (kvf->is_iommufd)
+			kvm_iommufd_file_set_kvm(kvf->file, NULL, NULL);
+		else
+			kvm_vfio_file_set_kvm(kvf->file, NULL);
 		fput(kvf->file);
 		list_del(&kvf->node);
 		kfree(kvf);
-- 
2.45.2