Re: [PATCH v3 04/11] KVM: guest_memfd: Allow host to mmap guest_memfd() pages when shared

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Elliot,

On Mon, 14 Oct 2024 at 17:53, Elliot Berman <quic_eberman@xxxxxxxxxxx> wrote:
>
> On Thu, Oct 10, 2024 at 09:59:23AM +0100, Fuad Tabba wrote:
> > Add support for mmap() and fault() for guest_memfd in the host.
> > The ability to fault in a guest page is contingent on that page
> > being shared with the host.
> >
> > The guest_memfd PRIVATE memory attribute is not used for two
> > reasons. First because it reflects the userspace expectation for
> > that memory location, and therefore can be toggled by userspace.
> > The second is, although each guest_memfd file has a 1:1 binding
> > with a KVM instance, the plan is to allow multiple files per
> > inode, e.g. to allow intra-host migration to a new KVM instance,
> > without destroying guest_memfd.
> >
> > The mapping is restricted to only memory explicitly shared with
> > the host. KVM checks that the host doesn't have any mappings for
> > private memory via the folio's refcount. To avoid races between
> > paths that check mappability and paths that check whether the
> > host has any mappings (via the refcount), the folio lock is held
> > in while either check is being performed.
> >
> > This new feature is gated with a new configuration option,
> > CONFIG_KVM_GMEM_MAPPABLE.
> >
> > Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > Co-developed-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> > Signed-off-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> >
> > ---
> >
> > Note that the functions kvm_gmem_is_mapped(),
> > kvm_gmem_set_mappable(), and int kvm_gmem_clear_mappable() are
> > not used in this patch series. They are intended to be used in
> > future patches [*], which check and toggle mapability when the
> > guest shares/unshares pages with the host.
> >
> > [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.12-v3-pkvm
> >
> > ---
> >  include/linux/kvm_host.h |  52 +++++++++++
> >  virt/kvm/Kconfig         |   4 +
> >  virt/kvm/guest_memfd.c   | 185 +++++++++++++++++++++++++++++++++++++++
> >  virt/kvm/kvm_main.c      | 138 +++++++++++++++++++++++++++++
> >  4 files changed, 379 insertions(+)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index acf85995b582..bda7fda9945e 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2527,4 +2527,56 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> >                                   struct kvm_pre_fault_memory *range);
> >  #endif
> >
> > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end);
> > +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end);
> > +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> > +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> > +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start,
> > +                            gfn_t end);
> > +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start,
> > +                              gfn_t end);
> > +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
> > +#else
> > +static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return false;
> > +}
> > +static inline bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return false;
> > +}
> > +static inline int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return -EINVAL;
> > +}
> > +static inline int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start,
> > +                                       gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return -EINVAL;
> > +}
> > +static inline int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot,
> > +                                          gfn_t start, gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return -EINVAL;
> > +}
> > +static inline int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot,
> > +                                            gfn_t start, gfn_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return -EINVAL;
> > +}
> > +static inline bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot,
> > +                                          gfn_t gfn)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return false;
> > +}
> > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > +
> >  #endif
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index fd6a3010afa8..2cfcb0848e37 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -120,3 +120,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
> >  config HAVE_KVM_ARCH_GMEM_INVALIDATE
> >         bool
> >         depends on KVM_PRIVATE_MEM
> > +
> > +config KVM_GMEM_MAPPABLE
> > +       select KVM_PRIVATE_MEM
> > +       bool
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index f414646c475b..df3a6f05a16e 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -370,7 +370,184 @@ static void kvm_gmem_init_mount(void)
> >       kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
> >  }
> >
> > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > +static struct folio *
> > +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> > +                gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared,
> > +                int *max_order);
> > +
> > +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     void *xval = xa_mk_value(true);
> > +     pgoff_t i;
> > +     bool r;
> > +
> > +     filemap_invalidate_lock(inode->i_mapping);
> > +     for (i = start; i < end; i++) {
> > +             r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
>
> I think it might not be strictly necessary,

Sorry, but I don't quite get what isn't strictly necessary. Is it the
checking for an error?

> > +             if (r)
> > +                     break;
> > +     }
> > +     filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +     return r;
> > +}
> > +
> > +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     pgoff_t i;
> > +     int r = 0;
> > +
> > +     filemap_invalidate_lock(inode->i_mapping);
> > +     for (i = start; i < end; i++) {
> > +             struct folio *folio;
> > +
> > +             /*
> > +              * Holds the folio lock until after checking its refcount,
> > +              * to avoid races with paths that fault in the folio.
> > +              */
> > +             folio = kvm_gmem_get_folio(inode, i);
>
> We don't need to allocate the folio here. I think we can use
>
>                 folio = filemap_lock_folio(inode, i);
>                 if (!folio || WARN_ON_ONCE(IS_ERR(folio)))
>                         continue;

Good point (it takes an inode->i_mapping though).

>                 folio = filemap_lock_folio(inode->i_mapping, i);


> > +             if (WARN_ON_ONCE(IS_ERR(folio)))
> > +                     continue;
> > +
> > +             /*
> > +              * Check that the host doesn't have any mappings on clearing
> > +              * the mappable flag, because clearing the flag implies that the
> > +              * memory will be unshared from the host. Therefore, to maintain
> > +              * the invariant that the host cannot access private memory, we
> > +              * need to check that it doesn't have any mappings to that
> > +              * memory before making it private.
> > +              *
> > +              * Two references are expected because of kvm_gmem_get_folio().
> > +              */
> > +             if (folio_ref_count(folio) > 2)
>
> If we'd like to be prepared for large folios, it should be
> folio_nr_pages(folio) + 1.

Will do that.

Thanks!
/fuad



> > +                     r = -EPERM;
> > +             else
> > +                     xa_erase(mappable_offsets, i);
> > +
> > +             folio_put(folio);
> > +             folio_unlock(folio);
> > +
> > +             if (r)
> > +                     break;
> > +     }
> > +     filemap_invalidate_unlock(inode->i_mapping);
> > +
> > +     return r;
> > +}
> > +
> > +static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
> > +{
> > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > +     bool r;
> > +
> > +     filemap_invalidate_lock_shared(inode->i_mapping);
> > +     r = xa_find(mappable_offsets, &pgoff, pgoff, XA_PRESENT);
> > +     filemap_invalidate_unlock_shared(inode->i_mapping);
> > +
> > +     return r;
> > +}
> > +
> > +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > +{
> > +     struct inode *inode = file_inode(slot->gmem.file);
> > +     pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > +     pgoff_t end_off = start_off + end - start;
> > +
> > +     return gmem_set_mappable(inode, start_off, end_off);
> > +}
> > +
> > +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > +{
> > +     struct inode *inode = file_inode(slot->gmem.file);
> > +     pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > +     pgoff_t end_off = start_off + end - start;
> > +
> > +     return gmem_clear_mappable(inode, start_off, end_off);
> > +}
> > +
> > +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn)
> > +{
> > +     struct inode *inode = file_inode(slot->gmem.file);
> > +     unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> > +
> > +     return gmem_is_mappable(inode, pgoff);
> > +}
> > +
> > +static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> > +{
> > +     struct inode *inode = file_inode(vmf->vma->vm_file);
> > +     struct folio *folio;
> > +     vm_fault_t ret = VM_FAULT_LOCKED;
> > +
> > +     /*
> > +      * Holds the folio lock until after checking whether it can be faulted
> > +      * in, to avoid races with paths that change a folio's mappability.
> > +      */
> > +     folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> > +     if (!folio)
> > +             return VM_FAULT_SIGBUS;
> > +
> > +     if (folio_test_hwpoison(folio)) {
> > +             ret = VM_FAULT_HWPOISON;
> > +             goto out;
> > +     }
> > +
> > +     if (!gmem_is_mappable(inode, vmf->pgoff)) {
> > +             ret = VM_FAULT_SIGBUS;
> > +             goto out;
> > +     }
> > +
> > +     if (!folio_test_uptodate(folio)) {
> > +             unsigned long nr_pages = folio_nr_pages(folio);
> > +             unsigned long i;
> > +
> > +             for (i = 0; i < nr_pages; i++)
> > +                     clear_highpage(folio_page(folio, i));
> > +
> > +             folio_mark_uptodate(folio);
> > +     }
> > +
> > +     vmf->page = folio_file_page(folio, vmf->pgoff);
> > +out:
> > +     if (ret != VM_FAULT_LOCKED) {
> > +             folio_put(folio);
> > +             folio_unlock(folio);
> > +     }
> > +
> > +     return ret;
> > +}
> > +
> > +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> > +     .fault = kvm_gmem_fault,
> > +};
> > +
> > +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +     if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> > +         (VM_SHARED | VM_MAYSHARE)) {
> > +             return -EINVAL;
> > +     }
> > +
> > +     file_accessed(file);
> > +     vm_flags_set(vma, VM_DONTDUMP);
> > +     vma->vm_ops = &kvm_gmem_vm_ops;
> > +
> > +     return 0;
> > +}
> > +#else
> > +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > +{
> > +     WARN_ON_ONCE(1);
> > +     return -EINVAL;
> > +}
> > +#define kvm_gmem_mmap NULL
> > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > +
> >  static struct file_operations kvm_gmem_fops = {
> > +     .mmap           = kvm_gmem_mmap,
> >       .open           = generic_file_open,
> >       .release        = kvm_gmem_release,
> >       .fallocate      = kvm_gmem_fallocate,
> > @@ -557,6 +734,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> >               goto err_gmem;
> >       }
> >
> > +     if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE)) {
> > +             err = gmem_set_mappable(file_inode(file), 0, size >> PAGE_SHIFT);
> > +             if (err) {
> > +                     fput(file);
> > +                     goto err_gmem;
> > +             }
> > +     }
> > +
> >       kvm_get_kvm(kvm);
> >       gmem->kvm = kvm;
> >       xa_init(&gmem->bindings);
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 05cbb2548d99..aed9cf2f1685 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3263,6 +3263,144 @@ static int next_segment(unsigned long len, int offset)
> >               return len;
> >  }
> >
> > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > +static bool __kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     struct kvm_memslot_iter iter;
> > +
> > +     lockdep_assert_held(&kvm->slots_lock);
> > +
> > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > +             struct kvm_memory_slot *memslot = iter.slot;
> > +             gfn_t gfn_start, gfn_end, i;
> > +
> > +             gfn_start = max(start, memslot->base_gfn);
> > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > +             if (WARN_ON_ONCE(gfn_start >= gfn_end))
> > +                     continue;
> > +
> > +             for (i = gfn_start; i < gfn_end; i++) {
> > +                     if (!kvm_slot_gmem_is_mappable(memslot, i))
> > +                             return false;
> > +             }
> > +     }
> > +
> > +     return true;
> > +}
> > +
> > +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     bool r;
> > +
> > +     mutex_lock(&kvm->slots_lock);
> > +     r = __kvm_gmem_is_mappable(kvm, start, end);
> > +     mutex_unlock(&kvm->slots_lock);
> > +
> > +     return r;
> > +}
> > +
> > +static bool kvm_gmem_is_pfn_mapped(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn_idx)
> > +{
> > +     struct page *page;
> > +     bool is_mapped;
> > +     kvm_pfn_t pfn;
> > +
> > +     /*
> > +      * Holds the folio lock until after checking its refcount,
> > +      * to avoid races with paths that fault in the folio.
> > +      */
> > +     if (WARN_ON_ONCE(kvm_gmem_get_pfn_locked(kvm, memslot, gfn_idx, &pfn, NULL)))
> > +             return false;
> > +
> > +     page = pfn_to_page(pfn);
> > +
> > +     /* Two references are expected because of kvm_gmem_get_pfn_locked(). */
> > +     is_mapped = page_ref_count(page) > 2;
> > +
> > +     put_page(page);
> > +     unlock_page(page);
> > +
> > +     return is_mapped;
> > +}
> > +
> > +static bool __kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     struct kvm_memslot_iter iter;
> > +
> > +     lockdep_assert_held(&kvm->slots_lock);
> > +
> > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > +             struct kvm_memory_slot *memslot = iter.slot;
> > +             gfn_t gfn_start, gfn_end, i;
> > +
> > +             gfn_start = max(start, memslot->base_gfn);
> > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > +             if (WARN_ON_ONCE(gfn_start >= gfn_end))
> > +                     continue;
> > +
> > +             for (i = gfn_start; i < gfn_end; i++) {
> > +                     if (kvm_gmem_is_pfn_mapped(kvm, memslot, i))
> > +                             return true;
> > +             }
> > +     }
> > +
> > +     return false;
> > +}
> > +
> > +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     bool r;
> > +
> > +     mutex_lock(&kvm->slots_lock);
> > +     r = __kvm_gmem_is_mapped(kvm, start, end);
> > +     mutex_unlock(&kvm->slots_lock);
> > +
> > +     return r;
> > +}
> > +
> > +static int kvm_gmem_toggle_mappable(struct kvm *kvm, gfn_t start, gfn_t end,
> > +                                 bool is_mappable)
> > +{
> > +     struct kvm_memslot_iter iter;
> > +     int r = 0;
> > +
> > +     mutex_lock(&kvm->slots_lock);
> > +
> > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > +             struct kvm_memory_slot *memslot = iter.slot;
> > +             gfn_t gfn_start, gfn_end;
> > +
> > +             gfn_start = max(start, memslot->base_gfn);
> > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > +             if (WARN_ON_ONCE(start >= end))
> > +                     continue;
> > +
> > +             if (is_mappable)
> > +                     r = kvm_slot_gmem_set_mappable(memslot, gfn_start, gfn_end);
> > +             else
> > +                     r = kvm_slot_gmem_clear_mappable(memslot, gfn_start, gfn_end);
> > +
> > +             if (WARN_ON_ONCE(r))
> > +                     break;
> > +     }
> > +
> > +     mutex_unlock(&kvm->slots_lock);
> > +
> > +     return r;
> > +}
> > +
> > +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     return kvm_gmem_toggle_mappable(kvm, start, end, true);
> > +}
> > +
> > +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +     return kvm_gmem_toggle_mappable(kvm, start, end, false);
> > +}
> > +
> > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > +
> >  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
> >  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
> >                                void *data, int offset, int len)
> > --
> > 2.47.0.rc0.187.ge670bccf7e-goog
> >




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [Linux for Sparc]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux