Re: [PATCH v2 1/6] mm: userfaultfd: add new UFFDIO_POISON ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Jun 29, 2023 at 01:50:35PM -0700, Axel Rasmussen wrote:
> The basic idea here is to "simulate" memory poisoning for VMs. A VM
> running on some host might encounter a memory error, after which some
> page(s) are poisoned (i.e., future accesses SIGBUS). They expect that
> once poisoned, pages can never become "un-poisoned". So, when we live
> migrate the VM, we need to preserve the poisoned status of these pages.
> 
> When live migrating, we try to get the guest running on its new host as
> quickly as possible. So, we start it running before all memory has been
> copied, and before we're certain which pages should be poisoned or not.
> 
> So the basic way to use this new feature is:
> 
> - On the new host, the guest's memory is registered with userfaultfd, in
>   either MISSING or MINOR mode (doesn't really matter for this purpose).
> - On any first access, we get a userfaultfd event. At this point we can
>   communicate with the old host to find out if the page was poisoned.
> - If so, we can respond with a UFFDIO_POISON - this places a swap marker
>   so any future accesses will SIGBUS. Because the pte is now "present",
>   future accesses won't generate more userfaultfd events, they'll just
>   SIGBUS directly.
> 
> UFFDIO_POISON does not handle unmapping previously-present PTEs. This
> isn't needed, because during live migration we want to intercept
> all accesses with userfaultfd (not just writes, so WP mode isn't useful
> for this). So whether minor or missing mode is being used (or both), the
> PTE won't be present in any case, so handling that case isn't needed.
> 
> Why return VM_FAULT_HWPOISON instead of VM_FAULT_SIGBUS when one of
> these markers is encountered? For "normal" userspace programs there
> isn't a big difference, both yield a SIGBUS. The difference for KVM is
> key though: VM_FAULT_HWPOISON will result in an MCE being injected into
> the guest (which is the behavior we want). With VM_FAULT_SIGBUS, the
> hypervisor would need to catch the SIGBUS and deal with the MCE
> injection itself.
> 
> Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>

Maybe have a cover letter for the next version?

> ---
>  fs/userfaultfd.c                 | 63 ++++++++++++++++++++++++++++++++
>  include/linux/swapops.h          |  3 +-
>  include/linux/userfaultfd_k.h    |  4 ++
>  include/uapi/linux/userfaultfd.h | 25 +++++++++++--
>  mm/memory.c                      |  4 ++
>  mm/userfaultfd.c                 | 62 ++++++++++++++++++++++++++++++-
>  6 files changed, 156 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 7cecd49e078b..c26a883399c9 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1965,6 +1965,66 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  	return ret;
>  }
>  
> +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
> +{
> +	__s64 ret;
> +	struct uffdio_poison uffdio_poison;
> +	struct uffdio_poison __user *user_uffdio_poison;
> +	struct userfaultfd_wake_range range;
> +
> +	user_uffdio_poison = (struct uffdio_poison __user *)arg;
> +
> +	ret = -EAGAIN;
> +	if (atomic_read(&ctx->mmap_changing))
> +		goto out;
> +
> +	ret = -EFAULT;
> +	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
> +			   /* don't copy the output fields */
> +			   sizeof(uffdio_poison) - (sizeof(__s64))))
> +		goto out;
> +
> +	ret = validate_range(ctx->mm, uffdio_poison.range.start,
> +			     uffdio_poison.range.len);
> +	if (ret)
> +		goto out;
> +
> +	ret = -EINVAL;
> +	/* double check for wraparound just in case. */
> +	if (uffdio_poison.range.start + uffdio_poison.range.len <=
> +	    uffdio_poison.range.start) {

Brackets not needed here.

> +		goto out;
> +	}
> +	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
> +		goto out;
> +
> +	if (mmget_not_zero(ctx->mm)) {
> +		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> +					  uffdio_poison.range.len,
> +					  &ctx->mmap_changing, 0);
> +		mmput(ctx->mm);
> +	} else {
> +		return -ESRCH;
> +	}
> +
> +	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
> +		return -EFAULT;
> +	if (ret < 0)
> +		goto out;
> +
> +	/* len == 0 would wake all */
> +	BUG_ON(!ret);
> +	range.len = ret;
> +	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
> +		range.start = uffdio_poison.range.start;
> +		wake_userfault(ctx, &range);
> +	}
> +	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
> +
> +out:
> +	return ret;
> +}
> +
>  static inline unsigned int uffd_ctx_features(__u64 user_features)
>  {
>  	/*
> @@ -2066,6 +2126,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
>  	case UFFDIO_CONTINUE:
>  		ret = userfaultfd_continue(ctx, arg);
>  		break;
> +	case UFFDIO_POISON:
> +		ret = userfaultfd_poison(ctx, arg);
> +		break;
>  	}
>  	return ret;
>  }
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index 4c932cb45e0b..8259fee32421 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -394,7 +394,8 @@ typedef unsigned long pte_marker;
>  
>  #define  PTE_MARKER_UFFD_WP			BIT(0)
>  #define  PTE_MARKER_SWAPIN_ERROR		BIT(1)
> -#define  PTE_MARKER_MASK			(BIT(2) - 1)
> +#define  PTE_MARKER_UFFD_POISON			BIT(2)

One more tab.

Though I remembered the last time we discussed IIRC we plan to rename
SWAPIN_ERROR and reuse it, could you explain why a new bit is still needed?

I think I commented this but I'll do it again: IIUC any existing host
swapin errors for guest pages should be reported as MCE too, afaict,
happened in kvm context.

> +#define  PTE_MARKER_MASK			(BIT(3) - 1)
>  
>  static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
>  {
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index ac7b0c96d351..ac8c6854097c 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -46,6 +46,7 @@ enum mfill_atomic_mode {
>  	MFILL_ATOMIC_COPY,
>  	MFILL_ATOMIC_ZEROPAGE,
>  	MFILL_ATOMIC_CONTINUE,
> +	MFILL_ATOMIC_POISON,
>  	NR_MFILL_ATOMIC_MODES,
>  };
>  
> @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
>  extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
>  				     unsigned long len, atomic_t *mmap_changing,
>  				     uffd_flags_t flags);
> +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> +				   unsigned long len, atomic_t *mmap_changing,
> +				   uffd_flags_t flags);
>  extern int mwriteprotect_range(struct mm_struct *dst_mm,
>  			       unsigned long start, unsigned long len,
>  			       bool enable_wp, atomic_t *mmap_changing);
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 66dd4cd277bd..62151706c5a3 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -39,7 +39,8 @@
>  			   UFFD_FEATURE_MINOR_SHMEM |		\
>  			   UFFD_FEATURE_EXACT_ADDRESS |		\
>  			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
> -			   UFFD_FEATURE_WP_UNPOPULATED)
> +			   UFFD_FEATURE_WP_UNPOPULATED |	\
> +			   UFFD_FEATURE_POISON)
>  #define UFFD_API_IOCTLS				\
>  	((__u64)1 << _UFFDIO_REGISTER |		\
>  	 (__u64)1 << _UFFDIO_UNREGISTER |	\
> @@ -49,12 +50,14 @@
>  	 (__u64)1 << _UFFDIO_COPY |		\
>  	 (__u64)1 << _UFFDIO_ZEROPAGE |		\
>  	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
> -	 (__u64)1 << _UFFDIO_CONTINUE)
> +	 (__u64)1 << _UFFDIO_CONTINUE |		\
> +	 (__u64)1 << _UFFDIO_POISON)
>  #define UFFD_API_RANGE_IOCTLS_BASIC		\
>  	((__u64)1 << _UFFDIO_WAKE |		\
>  	 (__u64)1 << _UFFDIO_COPY |		\
> +	 (__u64)1 << _UFFDIO_WRITEPROTECT |	\
>  	 (__u64)1 << _UFFDIO_CONTINUE |		\
> -	 (__u64)1 << _UFFDIO_WRITEPROTECT)
> +	 (__u64)1 << _UFFDIO_POISON)

May not be a large deal, but it's still better to declare the feature &
ioctls after all things implemented.  Maybe make these few lines
(UFFD_API*, and the new feature bit) as the last patch to enable the
feature?

>  
>  /*
>   * Valid ioctl command number range with this API is from 0x00 to
> @@ -71,6 +74,7 @@
>  #define _UFFDIO_ZEROPAGE		(0x04)
>  #define _UFFDIO_WRITEPROTECT		(0x06)
>  #define _UFFDIO_CONTINUE		(0x07)
> +#define _UFFDIO_POISON			(0x08)
>  #define _UFFDIO_API			(0x3F)
>  
>  /* userfaultfd ioctl ids */
> @@ -91,6 +95,8 @@
>  				      struct uffdio_writeprotect)
>  #define UFFDIO_CONTINUE		_IOWR(UFFDIO, _UFFDIO_CONTINUE,	\
>  				      struct uffdio_continue)
> +#define UFFDIO_POISON		_IOWR(UFFDIO, _UFFDIO_POISON, \
> +				      struct uffdio_poison)
>  
>  /* read() structure */
>  struct uffd_msg {
> @@ -225,6 +231,7 @@ struct uffdio_api {
>  #define UFFD_FEATURE_EXACT_ADDRESS		(1<<11)
>  #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM		(1<<12)
>  #define UFFD_FEATURE_WP_UNPOPULATED		(1<<13)
> +#define UFFD_FEATURE_POISON			(1<<14)
>  	__u64 features;
>  
>  	__u64 ioctls;
> @@ -321,6 +328,18 @@ struct uffdio_continue {
>  	__s64 mapped;
>  };
>  
> +struct uffdio_poison {
> +	struct uffdio_range range;
> +#define UFFDIO_POISON_MODE_DONTWAKE		((__u64)1<<0)
> +	__u64 mode;
> +
> +	/*
> +	 * Fields below here are written by the ioctl and must be at the end:
> +	 * the copy_from_user will not read past here.
> +	 */
> +	__s64 updated;
> +};
> +
>  /*
>   * Flags for the userfaultfd(2) system call itself.
>   */
> diff --git a/mm/memory.c b/mm/memory.c
> index d8a9a770b1f1..7fbda39e060d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3692,6 +3692,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (WARN_ON_ONCE(!marker))
>  		return VM_FAULT_SIGBUS;
>  
> +	/* Poison emulation explicitly requested for this PTE. */
> +	if (marker & PTE_MARKER_UFFD_POISON)
> +		return VM_FAULT_HWPOISON;
> +
>  	/* Higher priority than uffd-wp when data corrupted */
>  	if (marker & PTE_MARKER_SWAPIN_ERROR)
>  		return VM_FAULT_SIGBUS;
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index a2bf37ee276d..87b62ca1e09e 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -286,6 +286,51 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  	goto out;
>  }
>  
> +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
> +static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
> +				   struct vm_area_struct *dst_vma,
> +				   unsigned long dst_addr,
> +				   uffd_flags_t flags)
> +{
> +	int ret;
> +	struct mm_struct *dst_mm = dst_vma->vm_mm;
> +	pte_t _dst_pte, *dst_pte;
> +	spinlock_t *ptl;
> +
> +	_dst_pte = make_pte_marker(PTE_MARKER_UFFD_POISON);
> +	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
> +
> +	if (vma_is_shmem(dst_vma)) {
> +		struct inode *inode;
> +		pgoff_t offset, max_off;
> +
> +		/* serialize against truncate with the page table lock */
> +		inode = dst_vma->vm_file->f_inode;
> +		offset = linear_page_index(dst_vma, dst_addr);
> +		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
> +		ret = -EFAULT;
> +		if (unlikely(offset >= max_off))
> +			goto out_unlock;
> +	}

Maybe good chance to have a mfill_file_over_size() helper?  Other potential
users are mfill_atomic_pte_zeropage() and mfill_atomic_install_pte().

> +
> +	ret = -EEXIST;
> +	/*
> +	 * For now, we don't handle unmapping pages, so only support filling in
> +	 * none PTEs, or replacing PTE markers.
> +	 */
> +	if (!pte_none_mostly(*dst_pte))
> +		goto out_unlock;
> +
> +	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> +
> +	/* No need to invalidate - it was non-present before */
> +	update_mmu_cache(dst_vma, dst_addr, dst_pte);
> +	ret = 0;
> +out_unlock:
> +	pte_unmap_unlock(dst_pte, ptl);
> +	return ret;
> +}
> +
>  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>  {
>  	pgd_t *pgd;
> @@ -336,8 +381,12 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
>  	 * by THP.  Since we can not reliably insert a zero page, this
>  	 * feature is not supported.
> +	 *
> +	 * PTE marker handling for hugetlb is a bit special, so for now
> +	 * UFFDIO_POISON is not supported.
>  	 */
> -	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> +	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) ||
> +	    uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
>  		mmap_read_unlock(dst_mm);
>  		return -EINVAL;
>  	}
> @@ -481,6 +530,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
>  		return mfill_atomic_pte_continue(dst_pmd, dst_vma,
>  						 dst_addr, flags);
> +	} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
> +		return mfill_atomic_pte_poison(dst_pmd, dst_vma,
> +					       dst_addr, flags);
>  	}
>  
>  	/*
> @@ -702,6 +754,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
>  }
>  
> +ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> +			    unsigned long len, atomic_t *mmap_changing,
> +			    uffd_flags_t flags)
> +{
> +	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
> +}
> +
>  long uffd_wp_range(struct vm_area_struct *dst_vma,
>  		   unsigned long start, unsigned long len, bool enable_wp)
>  {
> -- 
> 2.41.0.255.g8b1d071c50-goog
> 

-- 
Peter Xu




[Index of Archives]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Share Photos]     [IDE]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux ATA RAID]     [Samba]     [Device Mapper]

  Powered by Linux