On Thu, Jul 27, 2023 at 02:36:34PM +0500, Muhammad Usama Anjum wrote: <snip> > + > +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, > + unsigned long addr, unsigned long end, > + unsigned long end_addr) It hard to figure out what difference between end and end_addr. I would add a comment here. > +{ > + struct page_region *cur_buf = &p->cur_buf; > + > + if (cur_buf->start != addr) > + cur_buf->end = addr; > + else > + cur_buf->start = cur_buf->end = 0; > + > + p->end_addr = end_addr; > + p->found_pages -= (end - addr) / PAGE_SIZE; > +} > + > +static int pagemap_scan_output(unsigned long categories, > + struct pagemap_scan_private *p, > + unsigned long addr, unsigned long *end) > +{ > + unsigned long n_pages, total_pages; > + int ret = 0; > + > + if (!pagemap_scan_is_interesting_page(categories, p)) { > + *end = addr; > + return 0; > + } > + > + if (!p->vec_buf) > + return 0; > + > + categories &= p->arg.return_mask; > + > + n_pages = (*end - addr) / PAGE_SIZE; > + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || > + total_pages > p->arg.max_pages) { why do we need to use check_add_overflow here? > + size_t n_too_much = total_pages - p->arg.max_pages; it is unsafe to use total_pages if check_add_overflow returns non-zero. > + *end -= n_too_much * PAGE_SIZE; > + n_pages -= n_too_much; > + ret = -ENOSPC; > + } > + > + if (!pagemap_scan_push_range(categories, p, addr, *end)) { > + *end = addr; > + n_pages = 0; > + ret = -ENOSPC; > + } > + > + p->found_pages += n_pages; > + if (ret) > + p->end_addr = *end; > + > + return ret; > +} > + > +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, > + unsigned long end, struct mm_walk *walk) > +{ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + struct pagemap_scan_private *p = walk->private; > + struct vm_area_struct *vma = walk->vma; > + unsigned long categories; > + spinlock_t *ptl; > + int ret = 0; > + > + ptl = pmd_trans_huge_lock(pmd, vma); > + if (!ptl) > + return -ENOENT; > + > + categories = p->cur_vma_category | pagemap_thp_category(*pmd); > + > + ret = pagemap_scan_output(categories, p, start, &end); > + if (start == end) > + goto out_unlock; > + > + if (~p->arg.flags & PM_SCAN_WP_MATCHING) > + goto out_unlock; > + if (~categories & PAGE_IS_WRITTEN) > + goto out_unlock; > + > + /* > + * Break huge page into small pages if the WP operation > + * need to be performed is on a portion of the huge page. > + */ > + if (end != start + HPAGE_SIZE) { > + spin_unlock(ptl); > + split_huge_pmd(vma, pmd, start); > + pagemap_scan_backout_range(p, start, end, 0); pagemap_scan_backout_range looks "weird"... imho, it makes the code harder for understanding. > + return -ENOENT; I think you need to add a comment that this ENOENT is a special case. > + } > + > + make_uffd_wp_pmd(vma, start, pmd); > + flush_tlb_range(vma, start, end); > +out_unlock: > + spin_unlock(ptl); > + return ret; > +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ > + return -ENOENT; > +#endif > +} > + > +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, > + unsigned long end, struct mm_walk *walk) > +{ > + struct pagemap_scan_private *p = walk->private; > + struct vm_area_struct *vma = walk->vma; > + pte_t *pte, *start_pte; > + unsigned long addr; > + bool flush = false; > + spinlock_t *ptl; > + int ret; > + > + arch_enter_lazy_mmu_mode(); > + > + ret = pagemap_scan_thp_entry(pmd, start, end, walk); > + if (ret != -ENOENT) { > + arch_leave_lazy_mmu_mode(); > + return ret; > + } > + > + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); > + if (!pte) { > + arch_leave_lazy_mmu_mode(); > + walk->action = ACTION_AGAIN; > + return 0; > + } > + > + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { > + unsigned long categories = p->cur_vma_category | > + pagemap_page_category(vma, addr, ptep_get(pte)); > + unsigned long next = addr + PAGE_SIZE; > + > + ret = pagemap_scan_output(categories, p, addr, &next); > + if (next == addr) { > + if (!ret) > + continue; > + break; > + } > + > + if (~p->arg.flags & PM_SCAN_WP_MATCHING) > + continue; > + if (~categories & PAGE_IS_WRITTEN) > + continue; > + > + make_uffd_wp_pte(vma, addr, pte); > + if (!flush) { > + start = addr; > + flush = true; > + } > + } > + > + if (flush) > + flush_tlb_range(vma, start, addr); > + > + pte_unmap_unlock(start_pte, ptl); > + arch_leave_lazy_mmu_mode(); > + > + cond_resched(); > + return ret; > +} > + > +#ifdef CONFIG_HUGETLB_PAGE > +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, > + unsigned long start, unsigned long end, > + struct mm_walk *walk) > +{ > + struct pagemap_scan_private *p = walk->private; > + struct vm_area_struct *vma = walk->vma; > + unsigned long categories; > + spinlock_t *ptl; > + int ret = 0; > + pte_t pte; > + > + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { > + /* Go the short route when not write-protecting pages. */ > + > + pte = huge_ptep_get(ptep); > + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); > + > + return pagemap_scan_output(categories, p, start, &end); > + } > + > + i_mmap_lock_write(vma->vm_file->f_mapping); > + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); > + > + pte = huge_ptep_get(ptep); > + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); > + > + ret = pagemap_scan_output(categories, p, start, &end); > + if (start == end) > + goto out_unlock; > + > + if (~categories & PAGE_IS_WRITTEN) > + goto out_unlock; > + > + if (end != start + HPAGE_SIZE) { > + /* Partial HugeTLB page WP isn't possible. */ > + pagemap_scan_backout_range(p, start, end, start); > + ret = -EINVAL; Why is it EINVAL in this case? > + goto out_unlock; > + } > + > + make_uffd_wp_huge_pte(vma, start, ptep, pte); > + flush_hugetlb_tlb_range(vma, start, end); > + > +out_unlock: > + spin_unlock(ptl); > + i_mmap_unlock_write(vma->vm_file->f_mapping); > + > + return ret; > +} > +#else > +#define pagemap_scan_hugetlb_entry NULL > +#endif > + > +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, > + int depth, struct mm_walk *walk) > +{ > + struct pagemap_scan_private *p = walk->private; > + struct vm_area_struct *vma = walk->vma; > + int ret, err; > + > + if (!vma) > + return 0; > + > + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); > + if (addr == end) > + return ret; > + > + if (~p->arg.flags & PM_SCAN_WP_MATCHING) > + return ret; > + > + err = uffd_wp_range(vma, addr, end - addr, true); > + if (err < 0) > + ret = err; > + > + return ret; > +} > + > +static const struct mm_walk_ops pagemap_scan_ops = { > + .test_walk = pagemap_scan_test_walk, > + .pmd_entry = pagemap_scan_pmd_entry, > + .pte_hole = pagemap_scan_pte_hole, > + .hugetlb_entry = pagemap_scan_hugetlb_entry, > +}; > + > +static int pagemap_scan_get_args(struct pm_scan_arg *arg, > + unsigned long uarg) > +{ > + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) > + return -EFAULT; > + > + if (arg->size != sizeof(struct pm_scan_arg)) > + return -EINVAL; > + > + /* Validate requested features */ > + if (arg->flags & ~PM_SCAN_FLAGS) > + return -EINVAL; > + if ((arg->category_inverted | arg->category_mask | > + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) > + return -EINVAL; > + > + arg->start = untagged_addr((unsigned long)arg->start); > + arg->end = untagged_addr((unsigned long)arg->end); > + arg->vec = untagged_addr((unsigned long)arg->vec); > + > + /* Validate memory pointers */ > + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) > + return -EINVAL; > + if (!access_ok((void __user *)arg->start, arg->end - arg->start)) > + return -EFAULT; > + if (!arg->vec && arg->vec_len) > + return -EFAULT; > + if (arg->vec && !access_ok((void __user *)arg->vec, > + arg->vec_len * sizeof(struct page_region))) > + return -EFAULT; > + > + /* Fixup default values */ > + arg->end = ALIGN(arg->end, PAGE_SIZE); > + if (!arg->max_pages) > + arg->max_pages = ULONG_MAX; > + > + return 0; > +} > + > +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, > + unsigned long uargl) > +{ > + struct pm_scan_arg __user *uarg = (void __user *)uargl; > + > + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) > + return -EFAULT; > + > + return 0; > +} > + > +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) > +{ > + if (!p->arg.vec_len) { > + /* > + * An arbitrary non-page-aligned sentinel value for > + * pagemap_scan_push_range(). > + */ > + p->cur_buf.start = p->cur_buf.end = ULLONG_MAX; > + if (p->arg.vec) > + p->vec_buf = ZERO_SIZE_PTR; > + return 0; > + } > + > + /* > + * Allocate a smaller buffer to get output from inside the page > + * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks. > + * The last range is always stored in p.cur_buf to allow coalescing > + * consecutive ranges that have the same categories returned across > + * walk_page_range() calls. > + */ > + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, > + p->arg.vec_len - 1); > + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), > + GFP_KERNEL); > + if (!p->vec_buf) > + return -ENOMEM; > + > + p->vec_out = (struct page_region __user *)p->arg.vec; > + > + return 0; > +} > + > +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p) > +{ > + const struct page_region *buf = p->vec_buf; > + int n = (int)p->vec_buf_index; > + > + if (!n) > + return 0; > + > + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) > + return -EFAULT; > + > + p->arg.vec_len -= n; > + p->vec_out += n; > + > + p->vec_buf_index = 0; > + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len - 1); > + > + return n; > +} > + > +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) > +{ > + unsigned long walk_start, walk_end; > + struct mmu_notifier_range range; > + struct pagemap_scan_private p; > + size_t n_ranges_out = 0; > + int ret; > + > + memset(&p, 0, sizeof(p)); > + ret = pagemap_scan_get_args(&p.arg, uarg); > + if (ret) > + return ret; > + > + ret = pagemap_scan_init_bounce_buffer(&p); > + if (ret) > + return ret; > + > + /* Protection change for the range is going to happen. */ > + if (p.arg.flags & PM_SCAN_WP_MATCHING) { > + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, > + mm, p.arg.start, p.arg.end); > + mmu_notifier_invalidate_range_start(&range); > + } > + > + walk_start = walk_end = p.arg.start; > + for (; walk_end != p.arg.end; walk_start = walk_end) { > + int n_out; > + > + walk_end = min_t(unsigned long, > + (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK, > + p.arg.end); > + if (fatal_signal_pending(current)) { ret = EINTR; break; } > + ret = mmap_read_lock_killable(mm); > + if (ret) > + break; > + ret = walk_page_range(mm, walk_start, walk_end, > + &pagemap_scan_ops, &p); > + mmap_read_unlock(mm); > + > + n_out = pagemap_scan_flush_buffer(&p); > + if (n_out < 0) > + ret = n_out; > + else > + n_ranges_out += n_out; > + > + if (ret) > + break; > + } > + > + if (p.cur_buf.start != p.cur_buf.end) { > + if (copy_to_user(p.vec_out, &p.cur_buf, sizeof(p.cur_buf))) > + ret = -EFAULT; > + else > + ++n_ranges_out; > + } > + > + /* ENOSPC signifies early stop (buffer full) from the walk. */ > + if (!ret || ret == -ENOSPC) > + ret = n_ranges_out; > + > + p.arg.walk_end = p.end_addr ? p.end_addr : walk_start; > + if (pagemap_scan_writeback_args(&p.arg, uarg)) > + ret = -EFAULT; > + > + if (p.arg.flags & PM_SCAN_WP_MATCHING) > + mmu_notifier_invalidate_range_end(&range); > + > + kfree(p.vec_buf); > + return ret; > +} > + > +static long do_pagemap_cmd(struct file *file, unsigned int cmd, > + unsigned long arg) > +{ > + struct mm_struct *mm = file->private_data; > + > + switch (cmd) { > + case PAGEMAP_SCAN: > + return do_pagemap_scan(mm, arg); > + > + default: > + return -EINVAL; > + } > +} > + > const struct file_operations proc_pagemap_operations = { > .llseek = mem_lseek, /* borrow this */ > .read = pagemap_read, > .open = pagemap_open, > .release = pagemap_release, > + .unlocked_ioctl = do_pagemap_cmd, > + .compat_ioctl = do_pagemap_cmd, > }; > #endif /* CONFIG_PROC_PAGE_MONITOR */ > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 0a393bc02f25b..8f8ff07453f22 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -259,6 +259,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, > unsigned long cp_flags); > > bool is_hugetlb_entry_migration(pte_t pte); > +bool is_hugetlb_entry_hwpoisoned(pte_t pte); > void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); > > #else /* !CONFIG_HUGETLB_PAGE */ > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h > index b7b56871029c5..1bb3c625c2381 100644 > --- a/include/uapi/linux/fs.h > +++ b/include/uapi/linux/fs.h > @@ -305,4 +305,62 @@ typedef int __bitwise __kernel_rwf_t; > #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ > RWF_APPEND) > > +/* Pagemap ioctl */ > +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) > + > +/* Bits are set in flags of the page_region and masks in pm_scan_args */ > +#define PAGE_IS_WPALLOWED (1 << 0) > +#define PAGE_IS_WRITTEN (1 << 1) > +#define PAGE_IS_FILE (1 << 2) > +#define PAGE_IS_PRESENT (1 << 3) > +#define PAGE_IS_SWAPPED (1 << 4) > +#define PAGE_IS_PFNZERO (1 << 5) > + > +/* > + * struct page_region - Page region with flags > + * @start: Start of the region > + * @end: End of the region (exclusive) > + * @categories: PAGE_IS_* category bitmask for the region > + */ > +struct page_region { > + __u64 start; > + __u64 end; > + __u64 categories; > +}; > + > +/* Flags for PAGEMAP_SCAN ioctl */ > +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ > +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ > + > +/* > + * struct pm_scan_arg - Pagemap ioctl argument > + * @size: Size of the structure > + * @flags: Flags for the IOCTL > + * @start: Starting address of the region > + * @end: Ending address of the region > + * @walk_end: Ending address of the visited memory is returned > + * (This helps if entire range hasn't been visited) > + * @vec: Address of page_region struct array for output > + * @vec_len: Length of the page_region struct array > + * @max_pages: Optional limit for number of returned pages (0 = disabled) > + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 > + * @category_mask: Skip pages for which any category doesn't match > + * @category_anyof_mask: Skip pages for which no category matches > + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned > + */ > +struct pm_scan_arg { > + __u64 size; > + __u64 flags; > + __u64 start; > + __u64 end; > + __u64 walk_end; > + __u64 vec; > + __u64 vec_len; > + __u64 max_pages; > + __u64 category_inverted; > + __u64 category_mask; > + __u64 category_anyof_mask; > + __u64 return_mask; > +}; > + > #endif /* _UAPI_LINUX_FS_H */ > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index a073e6ed8900b..3b07db0a4f2d9 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5008,7 +5008,7 @@ bool is_hugetlb_entry_migration(pte_t pte) > return false; > } > > -static bool is_hugetlb_entry_hwpoisoned(pte_t pte) > +bool is_hugetlb_entry_hwpoisoned(pte_t pte) > { > swp_entry_t swp; > > -- > 2.39.2 >