Muhammad,

Sorry, I can probably only review the non-interface part, and leave the
interface/buffer handling etc. review for others and real potential users
of it.

On Thu, May 25, 2023 at 01:55:14PM +0500, Muhammad Usama Anjum wrote:
> +static inline void make_uffd_wp_huge_pte(struct vm_area_struct *vma,
> +					  unsigned long addr, pte_t *ptep,
> +					  pte_t ptent)
> +{
> +	pte_t old_pte;
> +
> +	if (!huge_pte_none(ptent)) {
> +		old_pte = huge_ptep_modify_prot_start(vma, addr, ptep);
> +		ptent = huge_pte_mkuffd_wp(old_pte);
> +		ptep_modify_prot_commit(vma, addr, ptep, old_pte, ptent);

huge_ptep_modify_prot_commit()?

The other thing is what if it's a pte marker already?  What if a hugetlb
migration entry?  Please check hugetlb_change_protection() (a rough sketch
of the cases it covers is further below).

> +	} else {
> +		set_huge_pte_at(vma->vm_mm, addr, ptep,
> +				make_pte_marker(PTE_MARKER_UFFD_WP));
> +	}
> +}
> +#endif

[...]

> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long addr = end;
> +	pte_t *pte, *orig_pte;
> +	spinlock_t *ptl;
> +	bool is_written;
> +	int ret = 0;
> +
> +	arch_enter_lazy_mmu_mode();
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (ptl) {
> +		unsigned long n_pages = (end - start)/PAGE_SIZE;
> +
> +		if (p->max_pages && n_pages > p->max_pages - p->found_pages)
> +			n_pages = p->max_pages - p->found_pages;
> +
> +		is_written = !is_pmd_uffd_wp(*pmd);
> +
> +		/*
> +		 * Break huge page into small pages if the WP operation need to
> +		 * be performed is on a portion of the huge page.
> +		 */
> +		if (is_written && IS_PM_SCAN_WP(p->flags) &&
> +		    n_pages < HPAGE_SIZE/PAGE_SIZE) {
> +			spin_unlock(ptl);
> +
> +			split_huge_pmd(vma, pmd, start);
> +			goto process_smaller_pages;
> +		}
> +
> +		if (IS_PM_SCAN_GET(p->flags))
> +			ret = pagemap_scan_output(is_written, vma->vm_file,
> +						  pmd_present(*pmd),
> +						  is_swap_pmd(*pmd),
> +						  p, start, n_pages);
> +
> +		if (ret >= 0 && is_written && IS_PM_SCAN_WP(p->flags))
> +			make_uffd_wp_pmd(vma, addr, pmd);
> +
> +		if (IS_PM_SCAN_WP(p->flags))
> +			flush_tlb_range(vma, start, end);
> +
> +		spin_unlock(ptl);
> +
> +		arch_leave_lazy_mmu_mode();
> +		return ret;
> +	}
> +
> +process_smaller_pages:
> +	if (pmd_trans_unstable(pmd)) {
> +		arch_leave_lazy_mmu_mode();
> +		return 0;

I'm not sure whether this is right.  Shouldn't you return -EAGAIN and let
the user retry?  Returning 0 means you'll move on to the next pmd afaict,
ignoring this one.
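IOW, something like below is what I had in mind (untested, and whether
-EAGAIN is the right errno is debatable; the point is not to silently skip
this pmd):

	if (pmd_trans_unstable(pmd)) {
		arch_leave_lazy_mmu_mode();
		/* The pmd is unstable (e.g. under splitting); let the caller retry */
		return -EAGAIN;
	}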
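Going back to the make_uffd_wp_huge_pte() comment above: roughly, the cases
hugetlb_change_protection() covers on the wr-protect path look like below
(paraphrased and untested, using the function's vma/addr/ptep arguments;
some of these helpers are local to mm/hugetlb.c, and I'm skipping the
writable-migration-entry downgrade for brevity):

	pte_t pte = huge_ptep_get(ptep);

	if (is_hugetlb_entry_hwpoisoned(pte)) {
		/* Poisoned entry: nothing to do */
	} else if (is_hugetlb_entry_migration(pte)) {
		/* Non-present but not none: set uffd-wp on the swap entry */
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				pte_swp_mkuffd_wp(pte));
	} else if (is_pte_marker(pte)) {
		/* Already a (uffd-wp) marker: nothing to do */
	} else if (!huge_pte_none(pte)) {
		/* Present pte: the path the patch already handles */
		pte_t old_pte = huge_ptep_modify_prot_start(vma, addr, ptep);

		huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte,
					     huge_pte_mkuffd_wp(old_pte));
	} else {
		/* None pte: install a uffd-wp marker */
		set_huge_pte_at(vma->vm_mm, addr, ptep,
				make_pte_marker(PTE_MARKER_UFFD_WP));
	}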
> +	}
> +#endif
> +
> +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);

Just a heads-up that this may start to fail at some point if Hugh's work
lands earlier:

https://lore.kernel.org/linux-mm/68a97fbe-5c1e-7ac6-72c-7b9c6290b370@xxxxxxxxxx/

> +	for (addr = start; addr < end && !ret; pte++, addr += PAGE_SIZE) {
> +		is_written = !is_pte_uffd_wp(*pte);
> +
> +		if (IS_PM_SCAN_GET(p->flags))
> +			ret = pagemap_scan_output(is_written, vma->vm_file,
> +						  pte_present(*pte),
> +						  is_swap_pte(*pte),
> +						  p, addr, 1);
> +
> +		if (ret >= 0 && is_written && IS_PM_SCAN_WP(p->flags))
> +			make_uffd_wp_pte(vma, addr, pte);
> +	}
> +
> +	if (IS_PM_SCAN_WP(p->flags))
> +		flush_tlb_range(vma, start, addr);
> +
> +	pte_unmap_unlock(orig_pte, ptl);
> +	arch_leave_lazy_mmu_mode();
> +
> +	cond_resched();
> +	return ret;
> +}
> +
> +#ifdef CONFIG_HUGETLB_PAGE
> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
> +				      unsigned long start, unsigned long end,
> +				      struct mm_walk *walk)
> +{
> +	unsigned long n_pages = (end - start)/PAGE_SIZE;
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct hstate *h = hstate_vma(vma);
> +	spinlock_t *ptl;
> +	bool is_written;
> +	int ret = 0;
> +	pte_t pte;
> +
> +	arch_enter_lazy_mmu_mode();

This _seems_ not to be needed for hugetlb entries.

> +
> +	if (p->max_pages && n_pages > p->max_pages - p->found_pages)
> +		n_pages = p->max_pages - p->found_pages;
> +
> +	if (IS_PM_SCAN_WP(p->flags)) {
> +		i_mmap_lock_write(vma->vm_file->f_mapping);
> +		ptl = huge_pte_lock(h, vma->vm_mm, ptep);
> +	}
> +
> +	pte = huge_ptep_get(ptep);
> +	is_written = !is_huge_pte_uffd_wp(pte);
> +
> +	/*
> +	 * Partial hugetlb page clear isn't supported
> +	 */
> +	if (is_written && IS_PM_SCAN_WP(p->flags) &&
> +	    n_pages < HPAGE_SIZE/PAGE_SIZE) {
> +		ret = -EPERM;
> +		goto unlock_and_return;
> +	}
> +
> +	if (IS_PM_SCAN_GET(p->flags)) {
> +		ret = pagemap_scan_output(is_written, vma->vm_file,
> +					  pte_present(pte), is_swap_pte(pte),
> +					  p, start, n_pages);
> +		if (ret < 0)
> +			goto unlock_and_return;
> +	}
> +
> +	if (is_written && IS_PM_SCAN_WP(p->flags)) {
> +		make_uffd_wp_huge_pte(vma, start, ptep, pte);
> +		flush_hugetlb_tlb_range(vma, start, end);
> +	}
> +
> +unlock_and_return:
> +	if (IS_PM_SCAN_WP(p->flags)) {
> +		spin_unlock(ptl);
> +		i_mmap_unlock_write(vma->vm_file->f_mapping);
> +	}
> +
> +	arch_leave_lazy_mmu_mode();

Same here.

> +
> +	return ret;
> +}

[...]

-- 
Peter Xu