Re: [PATCH v3 04/11] KVM: guest_memfd: Allow host to mmap guest_memfd() pages when shared

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 15, 2024 at 11:27:48AM +0100, Fuad Tabba wrote:
> Hi Elliot,
> 
> On Mon, 14 Oct 2024 at 17:53, Elliot Berman <quic_eberman@xxxxxxxxxxx> wrote:
> >
> > On Thu, Oct 10, 2024 at 09:59:23AM +0100, Fuad Tabba wrote:
> > > Add support for mmap() and fault() for guest_memfd in the host.
> > > The ability to fault in a guest page is contingent on that page
> > > being shared with the host.
> > >
> > > The guest_memfd PRIVATE memory attribute is not used for two
> > > reasons. First because it reflects the userspace expectation for
> > > that memory location, and therefore can be toggled by userspace.
> > > The second is, although each guest_memfd file has a 1:1 binding
> > > with a KVM instance, the plan is to allow multiple files per
> > > inode, e.g. to allow intra-host migration to a new KVM instance,
> > > without destroying guest_memfd.
> > >
> > > The mapping is restricted to only memory explicitly shared with
> > > the host. KVM checks that the host doesn't have any mappings for
> > > private memory via the folio's refcount. To avoid races between
> > > paths that check mappability and paths that check whether the
> > > host has any mappings (via the refcount), the folio lock is held
> > > while either check is being performed.
> > >
> > > This new feature is gated with a new configuration option,
> > > CONFIG_KVM_GMEM_MAPPABLE.
> > >
> > > Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > > Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> > > Co-developed-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> > > Signed-off-by: Elliot Berman <quic_eberman@xxxxxxxxxxx>
> > > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > >
> > > ---
> > >
> > > Note that the functions kvm_gmem_is_mapped(),
> > > kvm_gmem_set_mappable(), and kvm_gmem_clear_mappable() are
> > > not used in this patch series. They are intended to be used in
> > > future patches [*], which check and toggle mappability when the
> > > guest shares/unshares pages with the host.
> > >
> > > [*] https://android-kvm.googlesource.com/linux/+/refs/heads/tabba/guestmem-6.12-v3-pkvm
> > >
> > > ---
> > >  include/linux/kvm_host.h |  52 +++++++++++
> > >  virt/kvm/Kconfig         |   4 +
> > >  virt/kvm/guest_memfd.c   | 185 +++++++++++++++++++++++++++++++++++++++
> > >  virt/kvm/kvm_main.c      | 138 +++++++++++++++++++++++++++++
> > >  4 files changed, 379 insertions(+)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index acf85995b582..bda7fda9945e 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -2527,4 +2527,56 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> > >                                   struct kvm_pre_fault_memory *range);
> > >  #endif
> > >
> > > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > > +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end);
> > > +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end);
> > > +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> > > +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end);
> > > +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start,
> > > +                            gfn_t end);
> > > +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start,
> > > +                              gfn_t end);
> > > +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn);
> > > +#else
> > > +static inline bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t gfn, gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return false;
> > > +}
> > > +static inline bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return false;
> > > +}
> > > +static inline int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return -EINVAL;
> > > +}
> > > +static inline int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start,
> > > +                                       gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return -EINVAL;
> > > +}
> > > +static inline int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot,
> > > +                                          gfn_t start, gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return -EINVAL;
> > > +}
> > > +static inline int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot,
> > > +                                            gfn_t start, gfn_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return -EINVAL;
> > > +}
> > > +static inline bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot,
> > > +                                          gfn_t gfn)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return false;
> > > +}
> > > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > > +
> > >  #endif
> > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > > index fd6a3010afa8..2cfcb0848e37 100644
> > > --- a/virt/kvm/Kconfig
> > > +++ b/virt/kvm/Kconfig
> > > @@ -120,3 +120,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
> > >  config HAVE_KVM_ARCH_GMEM_INVALIDATE
> > >         bool
> > >         depends on KVM_PRIVATE_MEM
> > > +
> > > +config KVM_GMEM_MAPPABLE
> > > +       select KVM_PRIVATE_MEM
> > > +       bool
> > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > > index f414646c475b..df3a6f05a16e 100644
> > > --- a/virt/kvm/guest_memfd.c
> > > +++ b/virt/kvm/guest_memfd.c
> > > @@ -370,7 +370,184 @@ static void kvm_gmem_init_mount(void)
> > >       kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
> > >  }
> > >
> > > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > > +static struct folio *
> > > +__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
> > > +                gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared,
> > > +                int *max_order);
> > > +
> > > +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > > +{
> > > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > > +     void *xval = xa_mk_value(true);
> > > +     pgoff_t i;
> > > +     bool r;
> > > +
> > > +     filemap_invalidate_lock(inode->i_mapping);
> > > +     for (i = start; i < end; i++) {
> > > +             r = xa_err(xa_store(mappable_offsets, i, xval, GFP_KERNEL));
> >
> > I think it might not be strictly necessary,
> 
> Sorry, but I don't quite get what isn't strictly necessary. Is it the
> checking for an error?
> 


Oops, I was thinking we need to check the folio_ref_count when setting
the ref_count. I'd started replying, then realized doing the check isn't
necessary. I missed deleting the start of my comment, sorry about that
:)

> > > +             if (r)
> > > +                     break;
> > > +     }
> > > +     filemap_invalidate_unlock(inode->i_mapping);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +static int gmem_clear_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > > +{
> > > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > > +     pgoff_t i;
> > > +     int r = 0;
> > > +
> > > +     filemap_invalidate_lock(inode->i_mapping);
> > > +     for (i = start; i < end; i++) {
> > > +             struct folio *folio;
> > > +
> > > +             /*
> > > +              * Holds the folio lock until after checking its refcount,
> > > +              * to avoid races with paths that fault in the folio.
> > > +              */
> > > +             folio = kvm_gmem_get_folio(inode, i);
> >
> > We don't need to allocate the folio here. I think we can use
> >
> >                 folio = filemap_lock_folio(inode, i);
> >                 if (!folio || WARN_ON_ONCE(IS_ERR(folio)))
> >                         continue;
> 
> Good point (it takes an inode->i_mapping though).
> 
> >                 folio = filemap_lock_folio(inode->i_mapping, i);
> 
> 
> > > +             if (WARN_ON_ONCE(IS_ERR(folio)))
> > > +                     continue;
> > > +
> > > +             /*
> > > +              * Check that the host doesn't have any mappings on clearing
> > > +              * the mappable flag, because clearing the flag implies that the
> > > +              * memory will be unshared from the host. Therefore, to maintain
> > > +              * the invariant that the host cannot access private memory, we
> > > +              * need to check that it doesn't have any mappings to that
> > > +              * memory before making it private.
> > > +              *
> > > +              * Two references are expected because of kvm_gmem_get_folio().
> > > +              */
> > > +             if (folio_ref_count(folio) > 2)
> >
> > If we'd like to be prepared for large folios, it should be
> > folio_nr_pages(folio) + 1.
> 
> Will do that.
> 
> Thanks!
> /fuad
> 
> 
> 
> > > +                     r = -EPERM;
> > > +             else
> > > +                     xa_erase(mappable_offsets, i);
> > > +
> > > +             folio_put(folio);
> > > +             folio_unlock(folio);
> > > +
> > > +             if (r)
> > > +                     break;
> > > +     }
> > > +     filemap_invalidate_unlock(inode->i_mapping);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +static bool gmem_is_mappable(struct inode *inode, pgoff_t pgoff)
> > > +{
> > > +     struct xarray *mappable_offsets = &kvm_gmem_private(inode)->mappable_offsets;
> > > +     bool r;
> > > +
> > > +     filemap_invalidate_lock_shared(inode->i_mapping);
> > > +     r = xa_find(mappable_offsets, &pgoff, pgoff, XA_PRESENT);
> > > +     filemap_invalidate_unlock_shared(inode->i_mapping);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +int kvm_slot_gmem_set_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > > +{
> > > +     struct inode *inode = file_inode(slot->gmem.file);
> > > +     pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > > +     pgoff_t end_off = start_off + end - start;
> > > +
> > > +     return gmem_set_mappable(inode, start_off, end_off);
> > > +}
> > > +
> > > +int kvm_slot_gmem_clear_mappable(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > > +{
> > > +     struct inode *inode = file_inode(slot->gmem.file);
> > > +     pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > > +     pgoff_t end_off = start_off + end - start;
> > > +
> > > +     return gmem_clear_mappable(inode, start_off, end_off);
> > > +}
> > > +
> > > +bool kvm_slot_gmem_is_mappable(struct kvm_memory_slot *slot, gfn_t gfn)
> > > +{
> > > +     struct inode *inode = file_inode(slot->gmem.file);
> > > +     unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> > > +
> > > +     return gmem_is_mappable(inode, pgoff);
> > > +}
> > > +
> > > +static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> > > +{
> > > +     struct inode *inode = file_inode(vmf->vma->vm_file);
> > > +     struct folio *folio;
> > > +     vm_fault_t ret = VM_FAULT_LOCKED;
> > > +
> > > +     /*
> > > +      * Holds the folio lock until after checking whether it can be faulted
> > > +      * in, to avoid races with paths that change a folio's mappability.
> > > +      */
> > > +     folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> > > +     if (!folio)
> > > +             return VM_FAULT_SIGBUS;
> > > +
> > > +     if (folio_test_hwpoison(folio)) {
> > > +             ret = VM_FAULT_HWPOISON;
> > > +             goto out;
> > > +     }
> > > +
> > > +     if (!gmem_is_mappable(inode, vmf->pgoff)) {
> > > +             ret = VM_FAULT_SIGBUS;
> > > +             goto out;
> > > +     }
> > > +
> > > +     if (!folio_test_uptodate(folio)) {
> > > +             unsigned long nr_pages = folio_nr_pages(folio);
> > > +             unsigned long i;
> > > +
> > > +             for (i = 0; i < nr_pages; i++)
> > > +                     clear_highpage(folio_page(folio, i));
> > > +
> > > +             folio_mark_uptodate(folio);
> > > +     }
> > > +
> > > +     vmf->page = folio_file_page(folio, vmf->pgoff);
> > > +out:
> > > +     if (ret != VM_FAULT_LOCKED) {
> > > +             folio_put(folio);
> > > +             folio_unlock(folio);
> > > +     }
> > > +
> > > +     return ret;
> > > +}
> > > +
> > > +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> > > +     .fault = kvm_gmem_fault,
> > > +};
> > > +
> > > +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> > > +{
> > > +     if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> > > +         (VM_SHARED | VM_MAYSHARE)) {
> > > +             return -EINVAL;
> > > +     }
> > > +
> > > +     file_accessed(file);
> > > +     vm_flags_set(vma, VM_DONTDUMP);
> > > +     vma->vm_ops = &kvm_gmem_vm_ops;
> > > +
> > > +     return 0;
> > > +}
> > > +#else
> > > +static int gmem_set_mappable(struct inode *inode, pgoff_t start, pgoff_t end)
> > > +{
> > > +     WARN_ON_ONCE(1);
> > > +     return -EINVAL;
> > > +}
> > > +#define kvm_gmem_mmap NULL
> > > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > > +
> > >  static struct file_operations kvm_gmem_fops = {
> > > +     .mmap           = kvm_gmem_mmap,
> > >       .open           = generic_file_open,
> > >       .release        = kvm_gmem_release,
> > >       .fallocate      = kvm_gmem_fallocate,
> > > @@ -557,6 +734,14 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> > >               goto err_gmem;
> > >       }
> > >
> > > +     if (IS_ENABLED(CONFIG_KVM_GMEM_MAPPABLE)) {
> > > +             err = gmem_set_mappable(file_inode(file), 0, size >> PAGE_SHIFT);
> > > +             if (err) {
> > > +                     fput(file);
> > > +                     goto err_gmem;
> > > +             }
> > > +     }
> > > +
> > >       kvm_get_kvm(kvm);
> > >       gmem->kvm = kvm;
> > >       xa_init(&gmem->bindings);
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 05cbb2548d99..aed9cf2f1685 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -3263,6 +3263,144 @@ static int next_segment(unsigned long len, int offset)
> > >               return len;
> > >  }
> > >
> > > +#ifdef CONFIG_KVM_GMEM_MAPPABLE
> > > +static bool __kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     struct kvm_memslot_iter iter;
> > > +
> > > +     lockdep_assert_held(&kvm->slots_lock);
> > > +
> > > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > > +             struct kvm_memory_slot *memslot = iter.slot;
> > > +             gfn_t gfn_start, gfn_end, i;
> > > +
> > > +             gfn_start = max(start, memslot->base_gfn);
> > > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > > +             if (WARN_ON_ONCE(gfn_start >= gfn_end))
> > > +                     continue;
> > > +
> > > +             for (i = gfn_start; i < gfn_end; i++) {
> > > +                     if (!kvm_slot_gmem_is_mappable(memslot, i))
> > > +                             return false;
> > > +             }
> > > +     }
> > > +
> > > +     return true;
> > > +}
> > > +
> > > +bool kvm_gmem_is_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     bool r;
> > > +
> > > +     mutex_lock(&kvm->slots_lock);
> > > +     r = __kvm_gmem_is_mappable(kvm, start, end);
> > > +     mutex_unlock(&kvm->slots_lock);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +static bool kvm_gmem_is_pfn_mapped(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn_idx)
> > > +{
> > > +     struct page *page;
> > > +     bool is_mapped;
> > > +     kvm_pfn_t pfn;
> > > +
> > > +     /*
> > > +      * Holds the folio lock until after checking its refcount,
> > > +      * to avoid races with paths that fault in the folio.
> > > +      */
> > > +     if (WARN_ON_ONCE(kvm_gmem_get_pfn_locked(kvm, memslot, gfn_idx, &pfn, NULL)))
> > > +             return false;
> > > +
> > > +     page = pfn_to_page(pfn);
> > > +
> > > +     /* Two references are expected because of kvm_gmem_get_pfn_locked(). */
> > > +     is_mapped = page_ref_count(page) > 2;
> > > +
> > > +     put_page(page);
> > > +     unlock_page(page);
> > > +
> > > +     return is_mapped;
> > > +}
> > > +
> > > +static bool __kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     struct kvm_memslot_iter iter;
> > > +
> > > +     lockdep_assert_held(&kvm->slots_lock);
> > > +
> > > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > > +             struct kvm_memory_slot *memslot = iter.slot;
> > > +             gfn_t gfn_start, gfn_end, i;
> > > +
> > > +             gfn_start = max(start, memslot->base_gfn);
> > > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > > +             if (WARN_ON_ONCE(gfn_start >= gfn_end))
> > > +                     continue;
> > > +
> > > +             for (i = gfn_start; i < gfn_end; i++) {
> > > +                     if (kvm_gmem_is_pfn_mapped(kvm, memslot, i))
> > > +                             return true;
> > > +             }
> > > +     }
> > > +
> > > +     return false;
> > > +}
> > > +
> > > +bool kvm_gmem_is_mapped(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     bool r;
> > > +
> > > +     mutex_lock(&kvm->slots_lock);
> > > +     r = __kvm_gmem_is_mapped(kvm, start, end);
> > > +     mutex_unlock(&kvm->slots_lock);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +static int kvm_gmem_toggle_mappable(struct kvm *kvm, gfn_t start, gfn_t end,
> > > +                                 bool is_mappable)
> > > +{
> > > +     struct kvm_memslot_iter iter;
> > > +     int r = 0;
> > > +
> > > +     mutex_lock(&kvm->slots_lock);
> > > +
> > > +     kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > > +             struct kvm_memory_slot *memslot = iter.slot;
> > > +             gfn_t gfn_start, gfn_end;
> > > +
> > > +             gfn_start = max(start, memslot->base_gfn);
> > > +             gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > > +             if (WARN_ON_ONCE(start >= end))
> > > +                     continue;
> > > +
> > > +             if (is_mappable)
> > > +                     r = kvm_slot_gmem_set_mappable(memslot, gfn_start, gfn_end);
> > > +             else
> > > +                     r = kvm_slot_gmem_clear_mappable(memslot, gfn_start, gfn_end);
> > > +
> > > +             if (WARN_ON_ONCE(r))
> > > +                     break;
> > > +     }
> > > +
> > > +     mutex_unlock(&kvm->slots_lock);
> > > +
> > > +     return r;
> > > +}
> > > +
> > > +int kvm_gmem_set_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     return kvm_gmem_toggle_mappable(kvm, start, end, true);
> > > +}
> > > +
> > > +int kvm_gmem_clear_mappable(struct kvm *kvm, gfn_t start, gfn_t end)
> > > +{
> > > +     return kvm_gmem_toggle_mappable(kvm, start, end, false);
> > > +}
> > > +
> > > +#endif /* CONFIG_KVM_GMEM_MAPPABLE */
> > > +
> > >  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
> > >  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
> > >                                void *data, int offset, int len)
> > > --
> > > 2.47.0.rc0.187.ge670bccf7e-goog
> > >




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux