Re: [PATCH v2 1/6] mm: userfaultfd: add new UFFDIO_POISON ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jul 4, 2023 at 1:38 PM Peter Xu <peterx@xxxxxxxxxx> wrote:
>
> On Thu, Jun 29, 2023 at 01:50:35PM -0700, Axel Rasmussen wrote:
> > The basic idea here is to "simulate" memory poisoning for VMs. A VM
> > running on some host might encounter a memory error, after which some
> > page(s) are poisoned (i.e., future accesses SIGBUS). They expect that
> > once poisoned, pages can never become "un-poisoned". So, when we live
> > migrate the VM, we need to preserve the poisoned status of these pages.
> >
> > When live migrating, we try to get the guest running on its new host as
> > quickly as possible. So, we start it running before all memory has been
> > copied, and before we're certain which pages should be poisoned or not.
> >
> > So the basic way to use this new feature is:
> >
> > - On the new host, the guest's memory is registered with userfaultfd, in
> >   either MISSING or MINOR mode (doesn't really matter for this purpose).
> > - On any first access, we get a userfaultfd event. At this point we can
> >   communicate with the old host to find out if the page was poisoned.
> > - If so, we can respond with a UFFDIO_POISON - this places a swap marker
> >   so any future accesses will SIGBUS. Because the pte is now "present",
> >   future accesses won't generate more userfaultfd events, they'll just
> >   SIGBUS directly.
> >
> > UFFDIO_POISON does not handle unmapping previously-present PTEs. This
> > isn't needed, because during live migration we want to intercept
> > all accesses with userfaultfd (not just writes, so WP mode isn't useful
> > for this). So whether minor or missing mode is being used (or both), the
> > PTE won't be present in any case, so handling that case isn't needed.
> >
> > Why return VM_FAULT_HWPOISON instead of VM_FAULT_SIGBUS when one of
> > these markers is encountered? For "normal" userspace programs there
> > isn't a big difference, both yield a SIGBUS. The difference for KVM is
> > key though: VM_FAULT_HWPOISON will result in an MCE being injected into
> > the guest (which is the behavior we want). With VM_FAULT_SIGBUS, the
> > hypervisor would need to catch the SIGBUS and deal with the MCE
> > injection itself.
> >
> > Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
>
> Maybe have a cover letter for the next version?
>
> > ---
> >  fs/userfaultfd.c                 | 63 ++++++++++++++++++++++++++++++++
> >  include/linux/swapops.h          |  3 +-
> >  include/linux/userfaultfd_k.h    |  4 ++
> >  include/uapi/linux/userfaultfd.h | 25 +++++++++++--
> >  mm/memory.c                      |  4 ++
> >  mm/userfaultfd.c                 | 62 ++++++++++++++++++++++++++++++-
> >  6 files changed, 156 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index 7cecd49e078b..c26a883399c9 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -1965,6 +1965,66 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
> >       return ret;
> >  }
> >
> > +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
> > +{
> > +     __s64 ret;
> > +     struct uffdio_poison uffdio_poison;
> > +     struct uffdio_poison __user *user_uffdio_poison;
> > +     struct userfaultfd_wake_range range;
> > +
> > +     user_uffdio_poison = (struct uffdio_poison __user *)arg;
> > +
> > +     ret = -EAGAIN;
> > +     if (atomic_read(&ctx->mmap_changing))
> > +             goto out;
> > +
> > +     ret = -EFAULT;
> > +     if (copy_from_user(&uffdio_poison, user_uffdio_poison,
> > +                        /* don't copy the output fields */
> > +                        sizeof(uffdio_poison) - (sizeof(__s64))))
> > +             goto out;
> > +
> > +     ret = validate_range(ctx->mm, uffdio_poison.range.start,
> > +                          uffdio_poison.range.len);
> > +     if (ret)
> > +             goto out;
> > +
> > +     ret = -EINVAL;
> > +     /* double check for wraparound just in case. */
> > +     if (uffdio_poison.range.start + uffdio_poison.range.len <=
> > +         uffdio_poison.range.start) {
>
> Brackets not needed here.
>
> > +             goto out;
> > +     }
> > +     if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
> > +             goto out;
> > +
> > +     if (mmget_not_zero(ctx->mm)) {
> > +             ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> > +                                       uffdio_poison.range.len,
> > +                                       &ctx->mmap_changing, 0);
> > +             mmput(ctx->mm);
> > +     } else {
> > +             return -ESRCH;
> > +     }
> > +
> > +     if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
> > +             return -EFAULT;
> > +     if (ret < 0)
> > +             goto out;
> > +
> > +     /* len == 0 would wake all */
> > +     BUG_ON(!ret);
> > +     range.len = ret;
> > +     if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
> > +             range.start = uffdio_poison.range.start;
> > +             wake_userfault(ctx, &range);
> > +     }
> > +     ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
> > +
> > +out:
> > +     return ret;
> > +}
> > +
> >  static inline unsigned int uffd_ctx_features(__u64 user_features)
> >  {
> >       /*
> > @@ -2066,6 +2126,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
> >       case UFFDIO_CONTINUE:
> >               ret = userfaultfd_continue(ctx, arg);
> >               break;
> > +     case UFFDIO_POISON:
> > +             ret = userfaultfd_poison(ctx, arg);
> > +             break;
> >       }
> >       return ret;
> >  }
> > diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> > index 4c932cb45e0b..8259fee32421 100644
> > --- a/include/linux/swapops.h
> > +++ b/include/linux/swapops.h
> > @@ -394,7 +394,8 @@ typedef unsigned long pte_marker;
> >
> >  #define  PTE_MARKER_UFFD_WP                  BIT(0)
> >  #define  PTE_MARKER_SWAPIN_ERROR             BIT(1)
> > -#define  PTE_MARKER_MASK                     (BIT(2) - 1)
> > +#define  PTE_MARKER_UFFD_POISON                      BIT(2)
>
> One more tab.
>
> Though I remembered the last time we discussed IIRC we plan to rename
> SWAPIN_ERROR and reuse it, could you explain why a new bit is still needed?
>
> I think I commented this but I'll do it again: IIUC any existing host
> swapin errors for guest pages should be reported as MCE too, afaict,
> happened in kvm context.

I think swapin errors are treated differently than poison. Swapin
errors get VM_FAULT_SIGBUS, and poison gets VM_FAULT_HWPOISON, so
UFFDIO_POISON should also get VM_FAULT_HWPOISON (so that's what Axel
has implemented). And I think that needs a separate PTE marker.

>
> > +#define  PTE_MARKER_MASK                     (BIT(3) - 1)
> >
> >  static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
> >  {
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index ac7b0c96d351..ac8c6854097c 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -46,6 +46,7 @@ enum mfill_atomic_mode {
> >       MFILL_ATOMIC_COPY,
> >       MFILL_ATOMIC_ZEROPAGE,
> >       MFILL_ATOMIC_CONTINUE,
> > +     MFILL_ATOMIC_POISON,
> >       NR_MFILL_ATOMIC_MODES,
> >  };
> >
> > @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> >  extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> >                                    unsigned long len, atomic_t *mmap_changing,
> >                                    uffd_flags_t flags);
> > +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > +                                unsigned long len, atomic_t *mmap_changing,
> > +                                uffd_flags_t flags);
> >  extern int mwriteprotect_range(struct mm_struct *dst_mm,
> >                              unsigned long start, unsigned long len,
> >                              bool enable_wp, atomic_t *mmap_changing);
> > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> > index 66dd4cd277bd..62151706c5a3 100644
> > --- a/include/uapi/linux/userfaultfd.h
> > +++ b/include/uapi/linux/userfaultfd.h
> > @@ -39,7 +39,8 @@
> >                          UFFD_FEATURE_MINOR_SHMEM |           \
> >                          UFFD_FEATURE_EXACT_ADDRESS |         \
> >                          UFFD_FEATURE_WP_HUGETLBFS_SHMEM |    \
> > -                        UFFD_FEATURE_WP_UNPOPULATED)
> > +                        UFFD_FEATURE_WP_UNPOPULATED |        \
> > +                        UFFD_FEATURE_POISON)
> >  #define UFFD_API_IOCTLS                              \
> >       ((__u64)1 << _UFFDIO_REGISTER |         \
> >        (__u64)1 << _UFFDIO_UNREGISTER |       \
> > @@ -49,12 +50,14 @@
> >        (__u64)1 << _UFFDIO_COPY |             \
> >        (__u64)1 << _UFFDIO_ZEROPAGE |         \
> >        (__u64)1 << _UFFDIO_WRITEPROTECT |     \
> > -      (__u64)1 << _UFFDIO_CONTINUE)
> > +      (__u64)1 << _UFFDIO_CONTINUE |         \
> > +      (__u64)1 << _UFFDIO_POISON)
> >  #define UFFD_API_RANGE_IOCTLS_BASIC          \
> >       ((__u64)1 << _UFFDIO_WAKE |             \
> >        (__u64)1 << _UFFDIO_COPY |             \
> > +      (__u64)1 << _UFFDIO_WRITEPROTECT |     \
> >        (__u64)1 << _UFFDIO_CONTINUE |         \
> > -      (__u64)1 << _UFFDIO_WRITEPROTECT)
> > +      (__u64)1 << _UFFDIO_POISON)
>
> May not be a large deal, but it's still better to declare the feature &
> ioctls after all things implemented.  Maybe make these few lines
> (UFFD_API*, and the new feature bit) as the last patch to enable the
> feature?

I agree. Another option would be to have a separate feature for
UFFDIO_POISON for hugetlb, but I don't think we should do that. :)

>
> >
> >  /*
> >   * Valid ioctl command number range with this API is from 0x00 to
> > @@ -71,6 +74,7 @@
> >  #define _UFFDIO_ZEROPAGE             (0x04)
> >  #define _UFFDIO_WRITEPROTECT         (0x06)
> >  #define _UFFDIO_CONTINUE             (0x07)
> > +#define _UFFDIO_POISON                       (0x08)
> >  #define _UFFDIO_API                  (0x3F)
> >
> >  /* userfaultfd ioctl ids */
> > @@ -91,6 +95,8 @@
> >                                     struct uffdio_writeprotect)
> >  #define UFFDIO_CONTINUE              _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
> >                                     struct uffdio_continue)
> > +#define UFFDIO_POISON                _IOWR(UFFDIO, _UFFDIO_POISON, \
> > +                                   struct uffdio_poison)
> >
> >  /* read() structure */
> >  struct uffd_msg {
> > @@ -225,6 +231,7 @@ struct uffdio_api {
> >  #define UFFD_FEATURE_EXACT_ADDRESS           (1<<11)
> >  #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM              (1<<12)
> >  #define UFFD_FEATURE_WP_UNPOPULATED          (1<<13)
> > +#define UFFD_FEATURE_POISON                  (1<<14)
> >       __u64 features;
> >
> >       __u64 ioctls;
> > @@ -321,6 +328,18 @@ struct uffdio_continue {
> >       __s64 mapped;
> >  };
> >
> > +struct uffdio_poison {
> > +     struct uffdio_range range;
> > +#define UFFDIO_POISON_MODE_DONTWAKE          ((__u64)1<<0)
> > +     __u64 mode;
> > +
> > +     /*
> > +      * Fields below here are written by the ioctl and must be at the end:
> > +      * the copy_from_user will not read past here.
> > +      */
> > +     __s64 updated;
> > +};
> > +
> >  /*
> >   * Flags for the userfaultfd(2) system call itself.
> >   */
> > diff --git a/mm/memory.c b/mm/memory.c
> > index d8a9a770b1f1..7fbda39e060d 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3692,6 +3692,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
> >       if (WARN_ON_ONCE(!marker))
> >               return VM_FAULT_SIGBUS;
> >
> > +     /* Poison emulation explicitly requested for this PTE. */
> > +     if (marker & PTE_MARKER_UFFD_POISON)
> > +             return VM_FAULT_HWPOISON;
> > +
> >       /* Higher priority than uffd-wp when data corrupted */
> >       if (marker & PTE_MARKER_SWAPIN_ERROR)
> >               return VM_FAULT_SIGBUS;
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index a2bf37ee276d..87b62ca1e09e 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -286,6 +286,51 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
> >       goto out;
> >  }
> >
> > +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
> > +static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
> > +                                struct vm_area_struct *dst_vma,
> > +                                unsigned long dst_addr,
> > +                                uffd_flags_t flags)
> > +{
> > +     int ret;
> > +     struct mm_struct *dst_mm = dst_vma->vm_mm;
> > +     pte_t _dst_pte, *dst_pte;
> > +     spinlock_t *ptl;
> > +
> > +     _dst_pte = make_pte_marker(PTE_MARKER_UFFD_POISON);
> > +     dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> > +
> > +     if (vma_is_shmem(dst_vma)) {
> > +             struct inode *inode;
> > +             pgoff_t offset, max_off;
> > +
> > +             /* serialize against truncate with the page table lock */
> > +             inode = dst_vma->vm_file->f_inode;
> > +             offset = linear_page_index(dst_vma, dst_addr);
> > +             max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> > +             ret = -EFAULT;
> > +             if (unlikely(offset >= max_off))
> > +                     goto out_unlock;
> > +     }
>
> Maybe good chance to have a mfill_file_over_size() helper?  Other potential
> users are mfill_atomic_pte_zeropage() and mfill_atomic_install_pte().
>
> > +
> > +     ret = -EEXIST;
> > +     /*
> > +      * For now, we don't handle unmapping pages, so only support filling in
> > +      * none PTEs, or replacing PTE markers.
> > +      */
> > +     if (!pte_none_mostly(*dst_pte))
> > +             goto out_unlock;
> > +
> > +     set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> > +
> > +     /* No need to invalidate - it was non-present before */
> > +     update_mmu_cache(dst_vma, dst_addr, dst_pte);
> > +     ret = 0;
> > +out_unlock:
> > +     pte_unmap_unlock(dst_pte, ptl);
> > +     return ret;
> > +}
> > +
> >  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> >  {
> >       pgd_t *pgd;
> > @@ -336,8 +381,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> >        * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
> >        * by THP.  Since we can not reliably insert a zero page, this
> >        * feature is not supported.
> > +      *
> > +      * PTE marker handling for hugetlb is a bit special, so for now
> > +      * UFFDIO_POISON is not supported.
> >        */
> > -     if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> > +     if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) ||
> > +         uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
> >               mmap_read_unlock(dst_mm);
> >               return -EINVAL;
> >       }
> > @@ -481,6 +530,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
> >       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
> >               return mfill_atomic_pte_continue(dst_pmd, dst_vma,
> >                                                dst_addr, flags);
> > +     } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
> > +             return mfill_atomic_pte_poison(dst_pmd, dst_vma,
> > +                                            dst_addr, flags);
> >       }
> >
> >       /*
> > @@ -702,6 +754,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> >                           uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
> >  }
> >
> > +ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> > +                         unsigned long len, atomic_t *mmap_changing,
> > +                         uffd_flags_t flags)
> > +{
> > +     return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> > +                         uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
> > +}
> > +
> >  long uffd_wp_range(struct vm_area_struct *dst_vma,
> >                  unsigned long start, unsigned long len, bool enable_wp)
> >  {
> > --
> > 2.41.0.255.g8b1d071c50-goog
> >
>
> --
> Peter Xu
>




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux