On Monday, 15 November 2021 6:55:05 PM AEDT Peter Xu wrote:

[...]

> diff --git a/mm/memory.c b/mm/memory.c
> index d5966d9e24c3..e8557d43a87d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3452,6 +3452,43 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
>  	return 0;
>  }
>  
> +static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
> +{
> +	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
> +				       vmf->address, &vmf->ptl);
> +	/*
> +	 * Be careful so that we will only recover a special uffd-wp pte into a
> +	 * none pte.  Otherwise it means the pte could have changed, so retry.
> +	 */
> +	if (is_pte_marker(*vmf->pte))
> +		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
> +	pte_unmap_unlock(vmf->pte, vmf->ptl);
> +	return 0;
> +}
> +
> +/*
> + * This is actually a page-missing access, but with uffd-wp special pte
> + * installed.  It means this pte was wr-protected before being unmapped.
> + */
> +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
> +{
> +	/* Careful!  vmf->pte unmapped after return */
> +	if (!pte_unmap_same(vmf))

Hasn't vmf->pte already been unmapped by do_swap_page() by the time we get
here? (See the call-path sketch at the end of this mail.)

> +		return 0;
> +
> +	/*
> +	 * Just in case there're leftover special ptes even after the region
> +	 * got unregistered - we can simply clear them.  We can also do that
> +	 * proactively, e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
> +	 * ranges, but it should be more efficient to be done lazily here.
> +	 */
> +	if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
> +		return pte_marker_clear(vmf);
> +
> +	/* do_fault() can handle pte markers too like none pte */
> +	return do_fault(vmf);
> +}
> +
>  static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  {
>  	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
> @@ -3465,8 +3502,11 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
>  	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
>  		return VM_FAULT_SIGBUS;
>  
> -	/* TODO: handle pte markers */
> -	return 0;
> +	if (marker & PTE_MARKER_UFFD_WP)

Can we make this check `marker == PTE_MARKER_UFFD_WP`? There is currently
only one user of pte markers, and from what I can tell
pte_marker_handle_uffd_wp() wouldn't do the correct thing if other users were
added, because it could clear non-uffd-wp markers. I don't think it's worth
making it do the right thing now, but a comment noting that would be helpful.
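
i.e. something like the below (an untested sketch on top of this patch -
behaviour is unchanged while PTE_MARKER_UFFD_WP is the only marker bit, and
the comment is the part I'm really after):

	/*
	 * NOTE: pte_marker_handle_uffd_wp() may zap the pte, which is only
	 * safe while PTE_MARKER_UFFD_WP is the sole marker bit.  Any future
	 * marker users need to be handled explicitly here first.
	 */
	if (marker == PTE_MARKER_UFFD_WP)
		return pte_marker_handle_uffd_wp(vmf);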
> +		return pte_marker_handle_uffd_wp(vmf);
> +
> +	/* This is an unknown pte marker */
> +	return VM_FAULT_SIGBUS;
>  }
>  
>  /*
> @@ -3968,6 +4008,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> +	bool uffd_wp = is_pte_marker_uffd_wp(vmf->orig_pte);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>  	bool prefault = vmf->address != addr;
>  	pte_t entry;
> @@ -3982,6 +4023,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>  
>  	if (write)
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +	if (unlikely(uffd_wp))
> +		entry = pte_mkuffd_wp(pte_wrprotect(entry));
>  	/* copy-on-write page */
>  	if (write && !(vma->vm_flags & VM_SHARED)) {
>  		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
> @@ -4155,9 +4198,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
>  	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
>  }
>  
> +/* Return true if we should do read fault-around, false otherwise */
> +static inline bool should_fault_around(struct vm_fault *vmf)
> +{
> +	/* No ->map_pages?  No way to fault around... */
> +	if (!vmf->vma->vm_ops->map_pages)
> +		return false;
> +
> +	if (uffd_disable_fault_around(vmf->vma))
> +		return false;
> +
> +	return fault_around_bytes >> PAGE_SHIFT > 1;
> +}
> +
>  static vm_fault_t do_read_fault(struct vm_fault *vmf)
>  {
> -	struct vm_area_struct *vma = vmf->vma;
>  	vm_fault_t ret = 0;
>  
>  	/*
> @@ -4165,12 +4220,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
>  	 * if page by the offset is not ready to be mapped (cold cache or
>  	 * something).
>  	 */
> -	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
> -		if (likely(!userfaultfd_minor(vmf->vma))) {
> -			ret = do_fault_around(vmf);
> -			if (ret)
> -				return ret;
> -		}
> +	if (should_fault_around(vmf)) {
> +		ret = do_fault_around(vmf);
> +		if (ret)
> +			return ret;
>  	}
>  
>  	ret = __do_fault(vmf);
> 
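
For reference, this is the ordering behind my pte_unmap_same() question
above. A rough paraphrase of do_swap_page() as I read it with this series
applied (apologies if I've misread the resulting tree):

	vm_fault_t do_swap_page(struct vm_fault *vmf)
	{
		[...]
		if (!pte_unmap_same(vmf))	/* vmf->pte unmapped here */
			goto out;

		entry = pte_to_swp_entry(vmf->orig_pte);
		if (unlikely(non_swap_entry(entry))) {
			[...]
			else if (is_pte_marker_entry(entry))
				/* vmf->pte already NULL by this point? */
				ret = handle_pte_marker(vmf);
			[...]

If that reading is right, the pte_unmap_same() call in
pte_marker_handle_uffd_wp() would be looking at an already-unmapped vmf->pte.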