Re: [PATCH v2 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaultfd_ctx

On Mon, Jan 29, 2024 at 11:35:11AM -0800, Lokesh Gidra wrote:
> Increments and loads of mmap_changing are always done within mmap_lock
> critical sections. This ensures that if userspace requests event
> notification for non-cooperative operations (e.g. mremap), userfaultfd
> operations don't run concurrently with them.
> 
> The same guarantee can be achieved with a separate read-write semaphore
> in userfaultfd_ctx such that increments are done in write-mode and
> loads in read-mode, thereby eliminating the dependency on mmap_lock
> for this purpose.
> 
> This is a preparatory step before we replace mmap_lock usage with
> per-vma locks in fill/move ioctls.
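
For readers following the thread, the pattern being introduced is: the
non-cooperative event paths take the new rwsem in write mode just around
the mmap_changing increment, while uffd operations hold it in read mode
across both the mmap_changing check and the operation itself. An
operation racing with an event therefore either sees mmap_changing != 0
and bails with -EAGAIN, or completes before the event's write-mode
section can begin. Below is a rough userspace sketch of that pattern
(pthread rwlock standing in for the kernel rw_semaphore; the names are
illustrative, not from the patch, and the decrement performed when the
event is consumed is omitted):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct uffd_ctx_sketch {
	pthread_rwlock_t map_changing_lock;
	atomic_int mmap_changing;
};

/* Non-cooperative event (e.g. mremap): increment in write mode. */
static void event_begin(struct uffd_ctx_sketch *ctx)
{
	pthread_rwlock_wrlock(&ctx->map_changing_lock);
	atomic_fetch_add(&ctx->mmap_changing, 1);
	pthread_rwlock_unlock(&ctx->map_changing_lock);
}

/* uffd operation (fill/move/wp): check and operate in read mode. */
static int uffd_op(struct uffd_ctx_sketch *ctx)
{
	int ret = 0;

	pthread_rwlock_rdlock(&ctx->map_changing_lock);
	if (atomic_load(&ctx->mmap_changing)) {
		ret = -1;	/* the kernel returns -EAGAIN here */
	} else {
		/* ... do the fill/move/wp work under the read lock ... */
	}
	pthread_rwlock_unlock(&ctx->map_changing_lock);
	return ret;
}

int main(void)
{
	struct uffd_ctx_sketch ctx = { .mmap_changing = 0 };

	pthread_rwlock_init(&ctx.map_changing_lock, NULL);
	printf("op before event: %d\n", uffd_op(&ctx));	/* prints 0 */
	event_begin(&ctx);
	printf("op during event: %d\n", uffd_op(&ctx));	/* prints -1 */
	return 0;
}

Since fill/move/wp callers vastly outnumber the rare events, a rwsem
keeps the common paths concurrent with one another while still
serializing them against mapping changes.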
> 
> Signed-off-by: Lokesh Gidra <lokeshgidra@xxxxxxxxxx>

Reviewed-by: Mike Rapoport (IBM) <rppt@xxxxxxxxxx>

> ---
>  fs/userfaultfd.c              | 40 ++++++++++++----------
>  include/linux/userfaultfd_k.h | 31 ++++++++++--------
>  mm/userfaultfd.c              | 62 ++++++++++++++++++++---------------
>  3 files changed, 75 insertions(+), 58 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 58331b83d648..c00a021bcce4 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
>  		ctx->flags = octx->flags;
>  		ctx->features = octx->features;
>  		ctx->released = false;
> +		init_rwsem(&ctx->map_changing_lock);
>  		atomic_set(&ctx->mmap_changing, 0);
>  		ctx->mm = vma->vm_mm;
>  		mmgrab(ctx->mm);
>  
>  		userfaultfd_ctx_get(octx);
> +		down_write(&octx->map_changing_lock);
>  		atomic_inc(&octx->mmap_changing);
> +		up_write(&octx->map_changing_lock);
>  		fctx->orig = octx;
>  		fctx->new = ctx;
>  		list_add_tail(&fctx->list, fcs);
> @@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
>  	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
>  		vm_ctx->ctx = ctx;
>  		userfaultfd_ctx_get(ctx);
> +		down_write(&ctx->map_changing_lock);
>  		atomic_inc(&ctx->mmap_changing);
> +		up_write(&ctx->map_changing_lock);
>  	} else {
>  		/* Drop uffd context if remap feature not enabled */
>  		vma_start_write(vma);
> @@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
>  		return true;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	mmap_read_unlock(mm);
>  
>  	msg_init(&ewq.msg);
> @@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
>  		return -ENOMEM;
>  
>  	userfaultfd_ctx_get(ctx);
> +	down_write(&ctx->map_changing_lock);
>  	atomic_inc(&ctx->mmap_changing);
> +	up_write(&ctx->map_changing_lock);
>  	unmap_ctx->ctx = ctx;
>  	unmap_ctx->start = start;
>  	unmap_ctx->end = end;
> @@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
>  		flags |= MFILL_ATOMIC_WP;
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> -					uffdio_copy.len, &ctx->mmap_changing,
> -					flags);
> +		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
> +					uffdio_copy.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
> -					   uffdio_zeropage.range.len,
> -					   &ctx->mmap_changing);
> +		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
> +					   uffdio_zeropage.range.len);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
>  		return -EINVAL;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> -					  uffdio_wp.range.len, mode_wp,
> -					  &ctx->mmap_changing);
> +		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
> +					  uffdio_wp.range.len, mode_wp);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  		flags |= MFILL_ATOMIC_WP;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
> -					    uffdio_continue.range.len,
> -					    &ctx->mmap_changing, flags);
> +		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
> +					    uffdio_continue.range.len, flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
>  		goto out;
>  
>  	if (mmget_not_zero(ctx->mm)) {
> -		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
> -					  uffdio_poison.range.len,
> -					  &ctx->mmap_changing, 0);
> +		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
> +					  uffdio_poison.range.len, 0);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> @@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
>  	if (mmget_not_zero(mm)) {
>  		mmap_read_lock(mm);
>  
> -		/* Re-check after taking mmap_lock */
> +		/* Re-check after taking map_changing_lock */
> +		down_read(&ctx->map_changing_lock);
>  		if (likely(!atomic_read(&ctx->mmap_changing)))
>  			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
>  					 uffdio_move.len, uffdio_move.mode);
>  		else
>  			ret = -EAGAIN;
> -
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(mm);
>  		mmput(mm);
>  	} else {
> @@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
>  	ctx->flags = flags;
>  	ctx->features = 0;
>  	ctx->released = false;
> +	init_rwsem(&ctx->map_changing_lock);
>  	atomic_set(&ctx->mmap_changing, 0);
>  	ctx->mm = current->mm;
>  	/* prevent the mm struct to be freed */
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 691d928ee864..3210c3552976 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -69,6 +69,13 @@ struct userfaultfd_ctx {
>  	unsigned int features;
>  	/* released */
>  	bool released;
> +	/*
> +	 * Prevents userfaultfd operations (fill/move/wp) from happening while
> +	 * some non-cooperative event(s) is taking place. Increments are done
> +	 * in write-mode, whereas userfaultfd operations, which include
> +	 * reading mmap_changing, are done in read-mode.
> +	 */
> +	struct rw_semaphore map_changing_lock;
>  	/* memory mappings are changing because of non-cooperative event */
>  	atomic_t mmap_changing;
> 	/* mm with one or more vmas attached to this userfaultfd_ctx */
> @@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
>  				    unsigned long dst_addr, struct page *page,
>  				    bool newly_allocated, uffd_flags_t flags);
>  
> -extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  				 unsigned long src_start, unsigned long len,
> -				 atomic_t *mmap_changing, uffd_flags_t flags);
> -extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
> +				 uffd_flags_t flags);
> +extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
>  				     unsigned long dst_start,
> -				     unsigned long len,
> -				     atomic_t *mmap_changing);
> -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
> -				     unsigned long len, atomic_t *mmap_changing,
> -				     uffd_flags_t flags);
> -extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -				   unsigned long len, atomic_t *mmap_changing,
> -				   uffd_flags_t flags);
> -extern int mwriteprotect_range(struct mm_struct *dst_mm,
> -			       unsigned long start, unsigned long len,
> -			       bool enable_wp, atomic_t *mmap_changing);
> +				     unsigned long len);
> +extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
> +				     unsigned long len, uffd_flags_t flags);
> +extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +				   unsigned long len, uffd_flags_t flags);
> +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			       unsigned long len, bool enable_wp);
>  extern long uffd_wp_range(struct vm_area_struct *vma,
>  			  unsigned long start, unsigned long len, bool enable_wp);
>  
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e3a91871462a..6e2ca04ab04d 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
>   * called with mmap_lock held, it will release mmap_lock before returning.
>   */
>  static __always_inline ssize_t mfill_atomic_hugetlb(
> +					      struct userfaultfd_ctx *ctx,
>  					      struct vm_area_struct *dst_vma,
>  					      unsigned long dst_start,
>  					      unsigned long src_start,
>  					      unsigned long len,
> -					      atomic_t *mmap_changing,
>  					      uffd_flags_t flags)
>  {
>  	struct mm_struct *dst_mm = dst_vma->vm_mm;
> @@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	 * feature is not supported.
>  	 */
>  	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> +		up_read(&ctx->map_changing_lock);
>  		mmap_read_unlock(dst_mm);
>  		return -EINVAL;
>  	}
> @@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  		cond_resched();
>  
>  		if (unlikely(err == -ENOENT)) {
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  				goto out;
>  			}
>  			mmap_read_lock(dst_mm);
> +			down_read(&ctx->map_changing_lock);
>  			/*
>  			 * If memory mappings are changing because of non-cooperative
>  			 * operation (e.g. mremap) running in parallel, bail out and
>  			 * request the user to retry later
>  			 */
> -			if (mmap_changing && atomic_read(mmap_changing)) {
> +			if (atomic_read(&ctx->mmap_changing)) {
>  				err = -EAGAIN;
>  				break;
>  			}
> @@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>  }
>  #else /* !CONFIG_HUGETLB_PAGE */
>  /* fail at build time if gcc attempts to use this */
> -extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
> +extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
> +				    struct vm_area_struct *dst_vma,
>  				    unsigned long dst_start,
>  				    unsigned long src_start,
>  				    unsigned long len,
> -				    atomic_t *mmap_changing,
>  				    uffd_flags_t flags);
>  #endif /* CONFIG_HUGETLB_PAGE */
>  
> @@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>  	return err;
>  }
>  
> -static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
> +static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  					    unsigned long dst_start,
>  					    unsigned long src_start,
>  					    unsigned long len,
> -					    atomic_t *mmap_changing,
>  					    uffd_flags_t flags)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	struct vm_area_struct *dst_vma;
>  	ssize_t err;
>  	pmd_t *dst_pmd;
> @@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	/*
> @@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	 * If this is a HUGETLB vma, pass off to appropriate routine
>  	 */
>  	if (is_vm_hugetlb_page(dst_vma))
> -		return  mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
> -					     len, mmap_changing, flags);
> +		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> +					     src_start, len, flags);
>  
>  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
>  		goto out_unlock;
> @@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  		if (unlikely(err == -ENOENT)) {
>  			void *kaddr;
>  
> +			up_read(&ctx->map_changing_lock);
>  			mmap_read_unlock(dst_mm);
>  			BUG_ON(!folio);
>  
> @@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	}
>  
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  out:
>  	if (folio)
> @@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
>  	return copied ? copied : err;
>  }
>  
> -ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
> +ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
>  			  unsigned long src_start, unsigned long len,
> -			  atomic_t *mmap_changing, uffd_flags_t flags)
> +			  uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
> +	return mfill_atomic(ctx, dst_start, src_start, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
>  }
>  
> -ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing)
> +ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
> +			      unsigned long start,
> +			      unsigned long len)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
>  }
>  
> -ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
> -			      unsigned long len, atomic_t *mmap_changing,
> -			      uffd_flags_t flags)
> +ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
> +			      unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
>  }
>  
> -ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
> -			    unsigned long len, atomic_t *mmap_changing,
> -			    uffd_flags_t flags)
> +ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
> +			    unsigned long len, uffd_flags_t flags)
>  {
> -	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
> +	return mfill_atomic(ctx, start, 0, len,
>  			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
>  }
>  
> @@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
>  	return ret;
>  }
>  
> -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
> -			unsigned long len, bool enable_wp,
> -			atomic_t *mmap_changing)
> +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
> +			unsigned long len, bool enable_wp)
>  {
> +	struct mm_struct *dst_mm = ctx->mm;
>  	unsigned long end = start + len;
>  	unsigned long _start, _end;
>  	struct vm_area_struct *dst_vma;
> @@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  	 * operation (e.g. mremap) running in parallel, bail out and
>  	 * request the user to retry later
>  	 */
> +	down_read(&ctx->map_changing_lock);
>  	err = -EAGAIN;
> -	if (mmap_changing && atomic_read(mmap_changing))
> +	if (atomic_read(&ctx->mmap_changing))
>  		goto out_unlock;
>  
>  	err = -ENOENT;
> @@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
>  		err = 0;
>  	}
>  out_unlock:
> +	up_read(&ctx->map_changing_lock);
>  	mmap_read_unlock(dst_mm);
>  	return err;
>  }
> -- 
> 2.43.0.429.g432eaa2c6b-goog
> 

-- 
Sincerely yours,
Mike.
