Re: [RFC PATCH v2 2/5] userfaultfd: introduce access-likely mode for copy/wp operations

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 20.06.22 01:34, Nadav Amit wrote:
> From: Nadav Amit <namit@xxxxxxxxxx>
> 
> Using a PTE on x86 with cleared access-bit (aka young-bit)
> takes ~600 cycles more than when the access bit is set. At the same
> time, setting the access-bit for memory that is not used (e.g.,
> prefetched) can introduce greater overheads, as the prefetched memory is
> reclaimed later than it should be.
> 
> Userfaultfd currently does not set the access-bit (excluding the
> huge-pages case). Arguably, it is best to let the user control whether
> the access bit should be set or not. The expected use is to request
> userfaultfd to set the access-bit when the copy/wp operation is done to
> resolve a page-fault, and not to set the access-bit when the memory is
> prefetched.
> 
> Introduce UFFDIO_COPY_MODE_ACCESS_LIKELY and
> UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY to enable userspace to request
> the young bit to be set. Set for UFFDIO_CONTINUE and UFFDIO_ZEROPAGE the
> bit unconditionally since the former is only used to resolve page-faults
> and the latter would not benefit from not setting the access-bit.
> 
> Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
> Cc: Peter Xu <peterx@xxxxxxxxxx>
> Cc: David Hildenbrand <david@xxxxxxxxxx>
> Cc: Mike Rapoport <rppt@xxxxxxxxxxxxx>
> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
> ---
>  fs/userfaultfd.c                 | 23 ++++++++++++++++-------
>  include/linux/userfaultfd_k.h    |  1 +
>  include/uapi/linux/userfaultfd.h | 20 +++++++++++++++++++-
>  mm/userfaultfd.c                 | 18 ++++++++++++++++--
>  4 files changed, 52 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 5daafa54eb3f..35a8c4347c54 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1700,7 +1700,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	struct uffdio_copy uffdio_copy;
>  	struct uffdio_copy __user *user_uffdio_copy;
>  	struct userfaultfd_wake_range range;
> -	bool mode_wp;
> +	bool mode_wp, mode_access_likely;
>  	uffd_flags_t uffd_flags;
>  
>  	user_uffdio_copy = (struct uffdio_copy __user *) arg;
> @@ -1726,12 +1726,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>  	ret = -EINVAL;
>  	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
>  		goto out;
> -	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
> +	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
> +				 UFFDIO_COPY_MODE_ACCESS_LIKELY))
>  		goto out;
>  
>  	mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
> +	mode_access_likely = uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY;

I *relly* prefer just

if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
	uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY
[...]

>  
> -	uffd_flags = (mode_wp ? UFFD_FLAGS_WP : 0);
> +	uffd_flags = (mode_wp ? UFFD_FLAGS_WP : 0) |
> +		     (mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0);
>  

Dito.

>  	if (mmget_not_zero(ctx->mm)) {
>  		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
> @@ -1871,6 +1877,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  	struct uffdio_continue uffdio_continue;
>  	struct uffdio_continue __user *user_uffdio_continue;
>  	struct userfaultfd_wake_range range;
> +	uffd_flags_t uffd_flags;
>  
>  	user_uffdio_continue = (struct uffdio_continue __user *)arg;
>  
> @@ -1898,10 +1905,12 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
>  	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
>  		goto out;
>  
> +	uffd_flags = UFFD_FLAGS_ACCESS_LIKELY;

Can we add a comment why that makes sense? I think I know why -- someone
is stuck waiting for that continue to happen :)

> +
>  	if (mmget_not_zero(ctx->mm)) {
>  		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
>  				     uffdio_continue.range.len,
> -				     &ctx->mmap_changing, 0);
> +				     &ctx->mmap_changing, uffd_flags);
>  		mmput(ctx->mm);
>  	} else {
>  		return -ESRCH;
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 6331148023c1..e6ac165ec044 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -58,6 +58,7 @@ enum mcopy_atomic_mode {
>  typedef unsigned int __bitwise uffd_flags_t;
>  
>  #define UFFD_FLAGS_WP			((__force uffd_flags_t)BIT(0))
> +#define UFFD_FLAGS_ACCESS_LIKELY	((__force uffd_flags_t)BIT(1))
>  
>  extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
>  				    struct vm_area_struct *dst_vma,
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index 005e5e306266..d9c8ce9ba777 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -38,7 +38,8 @@
>  			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
>  			   UFFD_FEATURE_MINOR_SHMEM |		\
>  			   UFFD_FEATURE_EXACT_ADDRESS |		\
> -			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
> +			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
> +			   UFFD_FEATURE_ACCESS_HINTS)
>  #define UFFD_API_IOCTLS				\
>  	((__u64)1 << _UFFDIO_REGISTER |		\
>  	 (__u64)1 << _UFFDIO_UNREGISTER |	\
> @@ -203,6 +204,10 @@ struct uffdio_api {
>  	 *
>  	 * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
>  	 * write-protection mode is supported on both shmem and hugetlbfs.
> +	 *
> +	 * UFFD_FEATURE_ACCESS_HINTS indicates that the copy supports
> +	 * UFFDIO_COPY_MODE_ACCESS_LIKELY supports
> +	 * UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY.

I think that sentence doesn't make sense.

>  	 */
>  #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
>  #define UFFD_FEATURE_EVENT_FORK			(1<<1)
> @@ -217,6 +222,7 @@ struct uffdio_api {
>  #define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
>  #define UFFD_FEATURE_EXACT_ADDRESS		(1<<11)
>  #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM		(1<<12)
> +#define UFFD_FEATURE_ACCESS_HINTS		(1<<13)
>  	__u64 features;
>  
>  	__u64 ioctls;
> @@ -260,6 +266,13 @@ struct uffdio_copy {
>  	 * copy_from_user will not read the last 8 bytes.
>  	 */
>  	__s64 copy;
> +	/*
> +	 * UFFDIO_COPY_MODE_ACCESS_LIKELY will set the mapped page as young.

Setting the page young is an implementation detail. Can you phrase it
more generically what the effect of that hint might be?

> +	 * This can reduce the time that the first access to the page takes.
> +	 * Yet, if set opportunistically to memory that is not used, it might
> +	 * extend the time before the unused memory pages are reclaimed.
> +	 */
> +#define UFFDIO_COPY_MODE_ACCESS_LIKELY		((__u64)1<<3)
>  };
>  
>  struct uffdio_zeropage {
> @@ -284,6 +297,10 @@ struct uffdio_writeprotect {
>   * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
>   * any wait thread after the operation succeeds.
>   *
> + * UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY: set the flag to mark the modified
> + * memory as young, which can reduce the time that the first access
> + * to the page takes.

Dito.

> + *
>   * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
>   * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
>   * protection (WP=0) in response to a page fault wakes the faulting
> @@ -291,6 +308,7 @@ struct uffdio_writeprotect {
>   */
>  #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
>  #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
> +#define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY	((__u64)1<<2)
>  	__u64 mode;
>  };


[...]

> @@ -691,6 +699,9 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
>  		       unsigned long len, atomic_t *mmap_changing,
>  		       uffd_flags_t uffd_flags)
>  {
> +	/* There is no cost for setting the access bit of a zeropage */
> +	uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
> +
>  	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
>  			      mmap_changing, 0);
>  }
> @@ -699,6 +710,9 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
>  		       unsigned long len, atomic_t *mmap_changing,
>  		       uffd_flags_t uffd_flags)
>  {
> +	/* The page is likely to be accessed */
> +	uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;

Shoouldn't that be set by the caller already?

> +
>  	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
>  			      mmap_changing, 0);
>  }


In general, LGTM.


-- 
Thanks,

David / dhildenb





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux