From: Nadav Amit <namit@xxxxxxxxxx> Introduce write-likely hints for uffd. These hints will be used in a future patch to decide whether to attempt to map pages in the page table or to only mark them logically as writable. This lets userspace choose between making a page faster to access and potentially allowing the page to be removed later without writeback and a TLB flush. Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Axel Rasmussen <axelrasmussen@xxxxxxxxxx> Cc: Peter Xu <peterx@xxxxxxxxxx> Cc: David Hildenbrand <david@xxxxxxxxxx> Cc: Mike Rapoport <rppt@xxxxxxxxxxxxx> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> --- fs/userfaultfd.c | 32 ++++++++++++++++++++++++-------- include/linux/userfaultfd_k.h | 1 + include/uapi/linux/userfaultfd.h | 13 ++++++++++++- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8d8792b27c53..3027d228550a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1709,7 +1709,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP| - UFFDIO_COPY_MODE_ACCESS_LIKELY)) + UFFDIO_COPY_MODE_ACCESS_LIKELY| + UFFDIO_COPY_MODE_WRITE_LIKELY)) goto out; mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP; @@ -1719,8 +1720,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) { if (uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY) uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WRITE_LIKELY) + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY; } else { - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY | + UFFD_FLAGS_WRITE_LIKELY; } if (mmget_not_zero(ctx->mm)) { @@ -1774,14 +1778,18 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx 
*ctx, goto out; ret = -EINVAL; if (uffdio_zeropage.mode & ~(UFFDIO_ZEROPAGE_MODE_DONTWAKE| - UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY)) + UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY| + UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY)) goto out; if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) { if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY) uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + if (uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY) + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY; } else { - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY | + UFFD_FLAGS_WRITE_LIKELY; } if (mmget_not_zero(ctx->mm)) { @@ -1834,7 +1842,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | UFFDIO_WRITEPROTECT_MODE_WP | - UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY)) + UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY | + UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY)) return -EINVAL; mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; @@ -1847,8 +1856,11 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) { if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY) uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + if (uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY) + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY; } else { - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY | + UFFD_FLAGS_WRITE_LIKELY; } if (mmget_not_zero(ctx->mm)) { @@ -1903,14 +1915,18 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; } if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE| - UFFDIO_CONTINUE_MODE_ACCESS_LIKELY)) + UFFDIO_CONTINUE_MODE_ACCESS_LIKELY| + UFFDIO_CONTINUE_MODE_WRITE_LIKELY)) goto out; if (ctx->features & UFFD_FEATURE_ACCESS_HINTS) { if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_ACCESS_LIKELY) uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + if (uffdio_continue.mode & 
UFFDIO_CONTINUE_MODE_WRITE_LIKELY) + uffd_flags |= UFFD_FLAGS_WRITE_LIKELY; } else { - uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY; + uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY | + UFFD_FLAGS_WRITE_LIKELY; } if (mmget_not_zero(ctx->mm)) { diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index b326798b5677..4968c86938b2 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -60,6 +60,7 @@ typedef unsigned int __bitwise uffd_flags_t; #define UFFD_FLAGS_NONE ((__force uffd_flags_t)0) #define UFFD_FLAGS_WP ((__force uffd_flags_t)BIT(0)) #define UFFD_FLAGS_ACCESS_LIKELY ((__force uffd_flags_t)BIT(1)) +#define UFFD_FLAGS_WRITE_LIKELY ((__force uffd_flags_t)BIT(2)) extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 02e0c1f56939..f52cbe4c9c44 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -202,7 +202,7 @@ struct uffdio_api { * write-protection mode is supported on both shmem and hugetlbfs. * * UFFD_FEATURE_ACCESS_HINTS indicates that the ioctl operations - * support the UFFDIO_*_MODE_ACCESS_LIKELY hints. + * support the UFFDIO_*_MODE_[ACCESS|WRITE]_LIKELY hints. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -257,9 +257,13 @@ struct uffdio_copy { * page is likely to be access in the near future. Providing the hint * properly can improve performance. * + * UFFDIO_COPY_MODE_WRITE_LIKELY provides a hint to the kernel that the + * page is likely to be written in the near future. Providing the hint + * properly can improve performance. 
*/ #define UFFDIO_COPY_MODE_WP ((__u64)1<<1) #define UFFDIO_COPY_MODE_ACCESS_LIKELY ((__u64)1<<2) +#define UFFDIO_COPY_MODE_WRITE_LIKELY ((__u64)1<<3) __u64 mode; /* @@ -273,6 +277,7 @@ struct uffdio_zeropage { struct uffdio_range range; #define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) #define UFFDIO_ZEROPAGE_MODE_ACCESS_LIKELY ((__u64)1<<1) +#define UFFDIO_ZEROPAGE_MODE_WRITE_LIKELY ((__u64)1<<2) __u64 mode; /* @@ -296,6 +301,10 @@ struct uffdio_writeprotect { * that the page is likely to be access in the near future. Providing * the hint properly can improve performance. * + * UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY: provides a hint to the kernel + * that the page is likely to be written in the near future. Providing + * the hint properly can improve performance. + * * NOTE: Write protecting a region (WP=1) is unrelated to page faults, * therefore DONTWAKE flag is meaningless with WP=1. Removing write * protection (WP=0) in response to a page fault wakes the faulting @@ -304,6 +313,7 @@ struct uffdio_writeprotect { #define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0) #define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1) #define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY ((__u64)1<<2) +#define UFFDIO_WRITEPROTECT_MODE_WRITE_LIKELY ((__u64)1<<3) __u64 mode; }; @@ -311,6 +321,7 @@ struct uffdio_continue { struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) #define UFFDIO_CONTINUE_MODE_ACCESS_LIKELY ((__u64)1<<1) +#define UFFDIO_CONTINUE_MODE_WRITE_LIKELY ((__u64)1<<2) __u64 mode; /* -- 2.25.1