On Sat, Feb 26, 2022 at 02:26:55AM +0000, Nadav Amit wrote: > From: Nadav Amit <namit@xxxxxxxxxx> > > Userfaultfd is supposed to provide the full address (i.e., unmasked) of > the faulting access back to userspace. However, that is not the case for > quite some time. > > Even running "userfaultfd_demo" from the userfaultfd man page provides > the wrong output (and contradicts the man page). Notice that > "UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) > and not the first read address (0x7fc5e30b300f). > > Address returned by mmap() = 0x7fc5e30b3000 > > fault_handler_thread(): > poll() returns: nready = 1; POLLIN = 1; POLLERR = 0 > UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000 > (uffdio_copy.copy returned 4096) > Read address 0x7fc5e30b300f in main(): A > Read address 0x7fc5e30b340f in main(): A > Read address 0x7fc5e30b380f in main(): A > Read address 0x7fc5e30b3c0f in main(): A > > The exact address is useful for various reasons and specifically for > prefetching decisions. If it is known that the memory is populated by > certain objects whose size is not page-aligned, then based on the > faulting address, the uffd-monitor can decide whether to prefetch and > prefault the adjacent page. > > This bug has been for quite some time in the kernel: since commit > 1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address") > vmf->virtual_address"), which dates back to 2016. A concern has been > raised that existing userspace application might rely on the old/wrong > behavior in which the address is masked. Therefore, it was suggested to > provide the masked address unless the user explicitly asks for the exact > address. > > Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct > userfaultfd to provide the exact address. Add a new "real_address" field > to vmf to hold the unmasked address. Provide the address to userspace > accordingly. > > Initialize real_address in various code-paths to be consistent with > address, even when it is not used, to be on the safe side. > > Acked-by: Peter Xu <peterx@xxxxxxxxxx> > Reviewed-by: David Hildenbrand <david@xxxxxxxxxx> > Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> > Cc: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> > Cc: Jan Kara <jack@xxxxxxx> > Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> Acked-by: Mike Rapoport <rppt@xxxxxxxxxxxxx> > > --- > > v2->v3: > * Initialize real_address on all code paths [Jan] > > v1->v2: > * Add uffd feature to selectively enable [David, Andrea] > --- > fs/userfaultfd.c | 5 ++++- > include/linux/mm.h | 3 ++- > include/uapi/linux/userfaultfd.h | 8 +++++++- > mm/hugetlb.c | 6 ++++-- > mm/memory.c | 1 + > mm/swapfile.c | 1 + > 6 files changed, 19 insertions(+), 5 deletions(-) > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > index e26b10132d47..826927026fe7 100644 > --- a/fs/userfaultfd.c > +++ b/fs/userfaultfd.c > @@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, > struct uffd_msg msg; > msg_init(&msg); > msg.event = UFFD_EVENT_PAGEFAULT; > + > + if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) > + address &= PAGE_MASK; > msg.arg.pagefault.address = address; > /* > * These flags indicate why the userfault occurred: > @@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) > > init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); > uwq.wq.private = current; > - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, > + uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, > ctx->features); > uwq.ctx = ctx; > uwq.waken = false; > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 213cc569b192..27df0ca0a36a 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -478,7 +478,8 @@ struct vm_fault { > struct vm_area_struct *vma; /* Target VMA */ > gfp_t gfp_mask; /* gfp mask to be used for allocations */ > pgoff_t pgoff; /* Logical page offset based on vma */ > - unsigned long address; /* Faulting virtual address */ > + unsigned long address; /* Faulting virtual address - masked */ > + unsigned long real_address; /* Faulting virtual address - unmaked */ > }; > enum fault_flag flags; /* FAULT_FLAG_xxx flags > * XXX: should really be 'const' */ > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > index 05b31d60acf6..ef739054cb1c 100644 > --- a/include/uapi/linux/userfaultfd.h > +++ b/include/uapi/linux/userfaultfd.h > @@ -32,7 +32,8 @@ > UFFD_FEATURE_SIGBUS | \ > UFFD_FEATURE_THREAD_ID | \ > UFFD_FEATURE_MINOR_HUGETLBFS | \ > - UFFD_FEATURE_MINOR_SHMEM) > + UFFD_FEATURE_MINOR_SHMEM | \ > + UFFD_FEATURE_EXACT_ADDRESS) > #define UFFD_API_IOCTLS \ > ((__u64)1 << _UFFDIO_REGISTER | \ > (__u64)1 << _UFFDIO_UNREGISTER | \ > @@ -189,6 +190,10 @@ struct uffdio_api { > * > * UFFD_FEATURE_MINOR_SHMEM indicates the same support as > * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead. > + * > + * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page > + * faults would be provided and the offset within the page would not be > + * masked. > */ > #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) > #define UFFD_FEATURE_EVENT_FORK (1<<1) > @@ -201,6 +206,7 @@ struct uffdio_api { > #define UFFD_FEATURE_THREAD_ID (1<<8) > #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) > #define UFFD_FEATURE_MINOR_SHMEM (1<<10) > +#define UFFD_FEATURE_EXACT_ADDRESS (1<<11) > __u64 features; > > __u64 ioctls; > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 61895cc01d09..16017f90568b 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5342,6 +5342,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, > pgoff_t idx, > unsigned int flags, > unsigned long haddr, > + unsigned long addr, > unsigned long reason) > { > vm_fault_t ret; > @@ -5349,6 +5350,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, > struct vm_fault vmf = { > .vma = vma, > .address = haddr, > + .real_address = addr, > .flags = flags, > > /* > @@ -5417,7 +5419,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > /* Check for page in userfault range */ > if (userfaultfd_missing(vma)) { > ret = hugetlb_handle_userfault(vma, mapping, idx, > - flags, haddr, > + flags, haddr, address, > VM_UFFD_MISSING); > goto out; > } > @@ -5481,7 +5483,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, > unlock_page(page); > put_page(page); > ret = hugetlb_handle_userfault(vma, mapping, idx, > - flags, haddr, > + flags, haddr, address, > VM_UFFD_MINOR); > goto out; > } > diff --git a/mm/memory.c b/mm/memory.c > index c125c4969913..aae53fde13d9 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -4622,6 +4622,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, > struct vm_fault vmf = { > .vma = vma, > .address = address & PAGE_MASK, > + .real_address = address, > .flags = flags, > .pgoff = linear_page_index(vma, address), > .gfp_mask = __get_fault_gfp_mask(vma), > diff --git a/mm/swapfile.c b/mm/swapfile.c > index bf0df7aa7158..33c7abb16610 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, > struct vm_fault vmf = { > .vma = vma, > .address = addr, > + .real_address = addr, > .pmd = pmd, > }; > > -- > 2.25.1 > -- Sincerely yours, Mike.