From: Nadav Amit <namit@xxxxxxxxxx> Userfaultfd is supposed to provide the full address (i.e., unmasked) of the faulting access back to userspace. However, that is not the case for quite some time. Even running "userfaultfd_demo" from the userfaultfd man page provides the wrong output (and contradicts the man page). Notice that "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and not the first read address (0x7fc5e30b300f). Address returned by mmap() = 0x7fc5e30b3000 fault_handler_thread(): poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 (uffdio_copy.copy returned 4096) Read address 0x7fc5e30b300f in main(): A Read address 0x7fc5e30b340f in main(): A Read address 0x7fc5e30b380f in main(): A Read address 0x7fc5e30b3c0f in main(): A The exact address is useful for various reasons and specifically for prefetching decisions. If it is known that the memory is populated by certain objects whose size is not page-aligned, then based on the faulting address, the uffd-monitor can decide whether to prefetch and prefault the adjacent page. This bug has been for quite some time in the kernel: since commit 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") vmf->virtual_address"), which dates back to 2016. A concern has been raised that existing userspace application might rely on the old/wrong behavior in which the address is masked. Therefore, it was suggested to provide the masked address unless the user explicitly asks for the exact address. Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct userfaultfd to provide the exact address. Add a new "real_address" field to vmf to hold the unmasked address. Provide the address to userspace accordingly. Initialize real_address in various code-paths to be consistent with address, even when it is not used, to be on the safe side. Acked-by: Peter Xu <peterx@xxxxxxxxxx> Reviewed-by: David Hildenbrand <david@xxxxxxxxxx> Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Cc: Jan Kara <jack@xxxxxxx> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> --- v2->v3: * Initialize real_address on all code paths [Jan] v1->v2: * Add uffd feature to selectively enable [David, Andrea] --- fs/userfaultfd.c | 5 ++++- include/linux/mm.h | 3 ++- include/uapi/linux/userfaultfd.h | 8 +++++++- mm/hugetlb.c | 6 ++++-- mm/memory.c | 1 + mm/swapfile.c | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e26b10132d47..826927026fe7 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, struct uffd_msg msg; msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; + + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) + address &= PAGE_MASK; msg.arg.pagefault.address = address; /* * These flags indicate why the userfault occurred: @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..27df0ca0a36a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -478,7 +478,8 @@ struct vm_fault { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - unsigned long address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address - masked */ + unsigned long real_address; /* Faulting virtual address - unmaked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 05b31d60acf6..ef739054cb1c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -32,7 +32,8 @@ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ UFFD_FEATURE_MINOR_HUGETLBFS | \ - UFFD_FEATURE_MINOR_SHMEM) + UFFD_FEATURE_MINOR_SHMEM | \ + UFFD_FEATURE_EXACT_ADDRESS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -189,6 +190,10 @@ struct uffdio_api { * * UFFD_FEATURE_MINOR_SHMEM indicates the same support as * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. + * + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page + * faults would be provided and the offset within the page would not be + * masked. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -201,6 +206,7 @@ struct uffdio_api { #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) #define UFFD_FEATURE_MINOR_SHMEM (1<<10) +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61895cc01d09..16017f90568b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5342,6 +5342,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, pgoff_t idx, unsigned int flags, unsigned long haddr, + unsigned long addr, unsigned long reason) { vm_fault_t ret; @@ -5349,6 +5350,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = haddr, + .real_address = addr, .flags = flags, /* @@ -5417,7 +5419,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MISSING); goto out; } @@ -5481,7 +5483,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); put_page(page); ret = hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, + flags, haddr, address, VM_UFFD_MINOR); goto out; } diff --git a/mm/memory.c b/mm/memory.c index c125c4969913..aae53fde13d9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4622,6 +4622,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), diff --git a/mm/swapfile.c b/mm/swapfile.c index bf0df7aa7158..33c7abb16610 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct vm_fault vmf = { .vma = vma, .address = addr, + .real_address = addr, .pmd = pmd, }; -- 2.25.1