From: Nadav Amit <namit@xxxxxxxxxx>

Accessing memory through a PTE with a cleared access bit (aka young bit)
on x86 takes ~600 more cycles than when the access bit is set. At the
same time, setting the access bit for memory that is not actually used
(e.g., prefetched memory) can introduce even greater overhead, since
that memory is then reclaimed later than it should be.

Userfaultfd currently does not set the access bit (excluding the
huge-pages case). Arguably, it is best to let the user control whether
the access bit should be set or not. The expected use is to ask
userfaultfd to set the access bit when a copy/wp operation resolves a
page fault, and not to set it when the memory is merely prefetched.

Introduce the UFFDIO_[op]_MODE_ACCESS_LIKELY flags to let userspace
request that the young bit be set.
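As a rough illustration (not part of this patch; the userfaultfd setup,
registration and the UFFDIO_API handshake advertising
UFFD_FEATURE_ACCESS_HINTS are assumed to already be in place), a fault
handler that wants the installed PTE to be young could do something
like:

	#include <err.h>
	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	/* Resolve a fault at @dst with the page-sized buffer at @src and
	 * hint that the page will be accessed soon, so the PTE is
	 * installed with the access bit set. */
	static void resolve_fault(int uffd, unsigned long dst,
				  unsigned long src, unsigned long len)
	{
		struct uffdio_copy copy = {
			.dst	= dst,
			.src	= src,
			.len	= len,
			.mode	= UFFDIO_COPY_MODE_ACCESS_LIKELY,
		};

		if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
			err(1, "UFFDIO_COPY");
	}

A prefetching path would issue the same ioctl with the flag left clear,
so the prefetched pages remain easy to reclaim.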
Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
Cc: Peter Xu <peterx@xxxxxxxxxx>
Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Mike Rapoport <rppt@xxxxxxxxxxxxx>
Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
---
 fs/userfaultfd.c                 | 25 ++++++++++++++++++++-----
 include/linux/userfaultfd_k.h    |  1 +
 include/uapi/linux/userfaultfd.h | 20 +++++++++++++++++++-
 mm/userfaultfd.c                 | 16 ++++++++++++----
 4 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index a44e46f8249f..abf176bd0349 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1726,12 +1726,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	ret = -EINVAL;
 	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
 		goto out;
-	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
+	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
+				 UFFDIO_COPY_MODE_ACCESS_LIKELY))
 		goto out;
 
 	mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
 
 	uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
+	if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY)
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
@@ -1783,9 +1786,13 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 	if (ret)
 		goto out;
 	ret = -EINVAL;
-	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+	if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE|
+				     UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY))
 		goto out;
 
+	if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
 				     uffdio_zeropage.range.len,
@@ -1835,7 +1842,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return ret;
 
 	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
-			       UFFDIO_WRITEPROTECT_MODE_WP))
+			       UFFDIO_WRITEPROTECT_MODE_WP |
+			       UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
 		return -EINVAL;
 
 	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
@@ -1845,6 +1853,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	uffd_flags = mode_wp ? UFFD_FLAGS_WP : UFFD_FLAGS_NONE;
+	if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
 
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
@@ -1872,6 +1882,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	struct uffdio_continue uffdio_continue;
 	struct uffdio_continue __user *user_uffdio_continue;
 	struct userfaultfd_wake_range range;
+	uffd_flags_t uffd_flags = UFFD_FLAGS_NONE;
 
 	user_uffdio_continue = (struct uffdio_continue __user *)arg;
 
@@ -1896,13 +1907,17 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 	    uffdio_continue.range.start) {
 		goto out;
 	}
-	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE|
+				     UFFDIO_CONTINUE_MODE_ACCESS_LIKELY))
 		goto out;
 
+	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)
+		uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;
+
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
 				     uffdio_continue.range.len,
-				     &ctx->mmap_changing, 0);
+				     &ctx->mmap_changing, uffd_flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d5b3dff48a87..af268b2c2b27 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -59,6 +59,7 @@ typedef unsigned int __bitwise uffd_flags_t;
 
 #define UFFD_FLAGS_NONE			((__force uffd_flags_t)0)
 #define UFFD_FLAGS_WP			((__force uffd_flags_t)BIT(0))
+#define UFFD_FLAGS_ACCESS_LIKELY	((__force uffd_flags_t)BIT(1))
 
 extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 				    struct vm_area_struct *dst_vma,
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 005e5e306266..ff7150c878bb 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -38,7 +38,8 @@
 			   UFFD_FEATURE_MINOR_HUGETLBFS |	\
 			   UFFD_FEATURE_MINOR_SHMEM |		\
 			   UFFD_FEATURE_EXACT_ADDRESS |		\
-			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
+			   UFFD_FEATURE_WP_HUGETLBFS_SHMEM |	\
+			   UFFD_FEATURE_ACCESS_HINTS)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -203,6 +204,9 @@ struct uffdio_api {
 	 *
 	 * UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
 	 * write-protection mode is supported on both shmem and hugetlbfs.
+	 *
+	 * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations
+	 * support the UFFDIO_*_MODE_ACCESS_LIKELY hints.
 	 */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
 #define UFFD_FEATURE_EVENT_FORK			(1<<1)
@@ -217,6 +221,7 @@
 #define UFFD_FEATURE_MINOR_SHMEM		(1<<10)
 #define UFFD_FEATURE_EXACT_ADDRESS		(1<<11)
 #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM		(1<<12)
+#define UFFD_FEATURE_ACCESS_HINTS		(1<<13)
 	__u64 features;
 
 	__u64 ioctls;
@@ -251,8 +256,14 @@ struct uffdio_copy {
 	 * the fly. UFFDIO_COPY_MODE_WP is available only if the
 	 * write protected ioctl is implemented for the range
 	 * according to the uffdio_register.ioctls.
+	 *
+	 * UFFDIO_COPY_MODE_ACCESS_LIKELY provides a hint to the kernel that the
+	 * page is likely to be accessed in the near future. Providing the hint
+	 * properly can improve performance.
+	 *
 	 */
 #define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
+#define UFFDIO_COPY_MODE_ACCESS_LIKELY		((__u64)1<<2)
 	__u64 mode;
 
 	/*
@@ -265,6 +276,7 @@ struct uffdio_copy {
 struct uffdio_zeropage {
 	struct uffdio_range range;
 #define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
+#define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY	((__u64)1<<1)
 	__u64 mode;
 
 	/*
@@ -284,6 +296,10 @@ struct uffdio_writeprotect {
 	 * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
 	 * any wait thread after the operation succeeds.
 	 *
+	 * UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY provides a hint to the kernel
+	 * that the page is likely to be accessed in the near future. Providing
+	 * the hint properly can improve performance.
+	 *
 	 * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
 	 * therefore DONTWAKE flag is meaningless with WP=1. Removing write
 	 * protection (WP=0) in response to a page fault wakes the faulting
@@ -291,12 +307,14 @@ struct uffdio_writeprotect {
 	 */
 #define UFFDIO_WRITEPROTECT_MODE_WP		((__u64)1<<0)
 #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE	((__u64)1<<1)
+#define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY	((__u64)1<<2)
 	__u64 mode;
 };
 
 struct uffdio_continue {
 	struct uffdio_range range;
 #define UFFDIO_CONTINUE_MODE_DONTWAKE		((__u64)1<<0)
+#define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY	((__u64)1<<1)
 	__u64 mode;
 
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 734de6aa0b8e..5051b9028722 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -92,6 +92,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	 */
 	_dst_pte = pte_wrprotect(_dst_pte);
 
+	if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
+		_dst_pte = pte_mkyoung(_dst_pte);
+
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 
 	if (vma_is_shmem(dst_vma)) {
@@ -202,7 +205,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 			      pmd_t *dst_pmd,
 			      struct vm_area_struct *dst_vma,
-			      unsigned long dst_addr)
+			      unsigned long dst_addr,
+			      uffd_flags_t uffd_flags)
 {
 	pte_t _dst_pte, *dst_pte;
 	spinlock_t *ptl;
@@ -225,6 +229,10 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
 	ret = -EEXIST;
 	if (!pte_none(*dst_pte))
 		goto out_unlock;
+
+	if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
+		_dst_pte = pte_mkyoung(_dst_pte);
+
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
@@ -498,7 +506,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
 						 uffd_flags);
 		else
 			err = mfill_zeropage_pte(dst_mm, dst_pmd,
-						 dst_vma, dst_addr);
+						 dst_vma, dst_addr, uffd_flags);
 	} else {
 		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd,
 					     dst_vma, dst_addr, src_addr,
@@ -692,7 +700,7 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
 		       uffd_flags_t uffd_flags)
 {
 	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
-			      mmap_changing, 0);
+			      mmap_changing, uffd_flags);
 }
 
@@ -700,7 +708,7 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
 		       uffd_flags_t uffd_flags)
 {
 	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
-			      mmap_changing, 0);
+			      mmap_changing, uffd_flags);
 }
 
 int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-- 
2.25.1