On 8/3/23 8:08 PM, Andrei Vagin wrote:
> On Thu, Jul 27, 2023 at 02:36:34PM +0500, Muhammad Usama Anjum wrote:
> 
> <snip>
> 
>> +
>> +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
>> +				       unsigned long addr, unsigned long end,
>> +				       unsigned long end_addr)
> 
> It hard to figure out what difference between end and end_addr. I would
> add a comment here.

I'll fix these in next version.

> 
>> +{
>> +	struct page_region *cur_buf = &p->cur_buf;
>> +
>> +	if (cur_buf->start != addr)
>> +		cur_buf->end = addr;
>> +	else
>> +		cur_buf->start = cur_buf->end = 0;
>> +
>> +	p->end_addr = end_addr;
>> +	p->found_pages -= (end - addr) / PAGE_SIZE;
>> +}
>> +
>> +static int pagemap_scan_output(unsigned long categories,
>> +			       struct pagemap_scan_private *p,
>> +			       unsigned long addr, unsigned long *end)
>> +{
>> +	unsigned long n_pages, total_pages;
>> +	int ret = 0;
>> +
>> +	if (!pagemap_scan_is_interesting_page(categories, p)) {
>> +		*end = addr;
>> +		return 0;
>> +	}
>> +
>> +	if (!p->vec_buf)
>> +		return 0;
>> +
>> +	categories &= p->arg.return_mask;
>> +
>> +	n_pages = (*end - addr) / PAGE_SIZE;
>> +	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
>> +	    total_pages > p->arg.max_pages) {
> 
> why do we need to use check_add_overflow here?
> 
>> +		size_t n_too_much = total_pages - p->arg.max_pages;
> 
> it is unsafe to use total_pages if check_add_overflow returns non-zero.
> 
>> +		*end -= n_too_much * PAGE_SIZE;
>> +		n_pages -= n_too_much;
>> +		ret = -ENOSPC;
>> +	}
>> +
>> +	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
>> +		*end = addr;
>> +		n_pages = 0;
>> +		ret = -ENOSPC;
>> +	}
>> +
>> +	p->found_pages += n_pages;
>> +	if (ret)
>> +		p->end_addr = *end;
>> +
>> +	return ret;
>> +}
>> +
>> +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
>> +				  unsigned long end, struct mm_walk *walk)
>> +{
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	unsigned long categories;
>> +	spinlock_t *ptl;
>> +	int ret = 0;
>> +
>> +	ptl = pmd_trans_huge_lock(pmd, vma);
>> +	if (!ptl)
>> +		return -ENOENT;
>> +
>> +	categories = p->cur_vma_category | pagemap_thp_category(*pmd);
>> +
>> +	ret = pagemap_scan_output(categories, p, start, &end);
>> +	if (start == end)
>> +		goto out_unlock;
>> +
>> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> +		goto out_unlock;
>> +	if (~categories & PAGE_IS_WRITTEN)
>> +		goto out_unlock;
>> +
>> +	/*
>> +	 * Break huge page into small pages if the WP operation
>> +	 * need to be performed is on a portion of the huge page.
>> +	 */
>> +	if (end != start + HPAGE_SIZE) {
>> +		spin_unlock(ptl);
>> +		split_huge_pmd(vma, pmd, start);
>> +		pagemap_scan_backout_range(p, start, end, 0);
> 
> pagemap_scan_backout_range looks "weird"... imho, it makes the code
> harder for understanding.
> 
>> +		return -ENOENT;
> 
> I think you need to add a comment that this ENOENT is a special case.
> 
>> +	}
>> +
>> +	make_uffd_wp_pmd(vma, start, pmd);
>> +	flush_tlb_range(vma, start, end);
>> +out_unlock:
>> +	spin_unlock(ptl);
>> +	return ret;
>> +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
>> +	return -ENOENT;
>> +#endif
>> +}
>> +
>> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
>> +				  unsigned long end, struct mm_walk *walk)
>> +{
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	pte_t *pte, *start_pte;
>> +	unsigned long addr;
>> +	bool flush = false;
>> +	spinlock_t *ptl;
>> +	int ret;
>> +
>> +	arch_enter_lazy_mmu_mode();
>> +
>> +	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
>> +	if (ret != -ENOENT) {
>> +		arch_leave_lazy_mmu_mode();
>> +		return ret;
>> +	}
>> +
>> +	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
>> +	if (!pte) {
>> +		arch_leave_lazy_mmu_mode();
>> +		walk->action = ACTION_AGAIN;
>> +		return 0;
>> +	}
>> +
>> +	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
>> +		unsigned long categories = p->cur_vma_category |
>> +					   pagemap_page_category(vma, addr, ptep_get(pte));
>> +		unsigned long next = addr + PAGE_SIZE;
>> +
>> +		ret = pagemap_scan_output(categories, p, addr, &next);
>> +		if (next == addr) {
>> +			if (!ret)
>> +				continue;
>> +			break;
>> +		}
>> +
>> +		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> +			continue;
>> +		if (~categories & PAGE_IS_WRITTEN)
>> +			continue;
>> +
>> +		make_uffd_wp_pte(vma, addr, pte);
>> +		if (!flush) {
>> +			start = addr;
>> +			flush = true;
>> +		}
>> +	}
>> +
>> +	if (flush)
>> +		flush_tlb_range(vma, start, addr);
>> +
>> +	pte_unmap_unlock(start_pte, ptl);
>> +	arch_leave_lazy_mmu_mode();
>> +
>> +	cond_resched();
>> +	return ret;
>> +}
>> +
>> +#ifdef CONFIG_HUGETLB_PAGE
>> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
>> +				      unsigned long start, unsigned long end,
>> +				      struct mm_walk *walk)
>> +{
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	unsigned long categories;
>> +	spinlock_t *ptl;
>> +	int ret = 0;
>> +	pte_t pte;
>> +
>> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
>> +		/* Go the short route when not write-protecting pages. */
>> +
>> +		pte = huge_ptep_get(ptep);
>> +		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
>> +
>> +		return pagemap_scan_output(categories, p, start, &end);
>> +	}
>> +
>> +	i_mmap_lock_write(vma->vm_file->f_mapping);
>> +	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
>> +
>> +	pte = huge_ptep_get(ptep);
>> +	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
>> +
>> +	ret = pagemap_scan_output(categories, p, start, &end);
>> +	if (start == end)
>> +		goto out_unlock;
>> +
>> +	if (~categories & PAGE_IS_WRITTEN)
>> +		goto out_unlock;
>> +
>> +	if (end != start + HPAGE_SIZE) {
>> +		/* Partial HugeTLB page WP isn't possible. */
>> +		pagemap_scan_backout_range(p, start, end, start);
>> +		ret = -EINVAL;
> 
> Why is it EINVAL in this case?
> 
>> +		goto out_unlock;
>> +	}
>> +
>> +	make_uffd_wp_huge_pte(vma, start, ptep, pte);
>> +	flush_hugetlb_tlb_range(vma, start, end);
>> +
>> +out_unlock:
>> +	spin_unlock(ptl);
>> +	i_mmap_unlock_write(vma->vm_file->f_mapping);
>> +
>> +	return ret;
>> +}
>> +#else
>> +#define pagemap_scan_hugetlb_entry NULL
>> +#endif
>> +
>> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
>> +				 int depth, struct mm_walk *walk)
>> +{
>> +	struct pagemap_scan_private *p = walk->private;
>> +	struct vm_area_struct *vma = walk->vma;
>> +	int ret, err;
>> +
>> +	if (!vma)
>> +		return 0;
>> +
>> +	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
>> +	if (addr == end)
>> +		return ret;
>> +
>> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
>> +		return ret;
>> +
>> +	err = uffd_wp_range(vma, addr, end - addr, true);
>> +	if (err < 0)
>> +		ret = err;
>> +
>> +	return ret;
>> +}
>> +
>> +static const struct mm_walk_ops pagemap_scan_ops = {
>> +	.test_walk = pagemap_scan_test_walk,
>> +	.pmd_entry = pagemap_scan_pmd_entry,
>> +	.pte_hole = pagemap_scan_pte_hole,
>> +	.hugetlb_entry = pagemap_scan_hugetlb_entry,
>> +};
>> +
>> +static int pagemap_scan_get_args(struct pm_scan_arg *arg,
>> +				 unsigned long uarg)
>> +{
>> +	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
>> +		return -EFAULT;
>> +
>> +	if (arg->size != sizeof(struct pm_scan_arg))
>> +		return -EINVAL;
>> +
>> +	/* Validate requested features */
>> +	if (arg->flags & ~PM_SCAN_FLAGS)
>> +		return -EINVAL;
>> +	if ((arg->category_inverted | arg->category_mask |
>> +	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
>> +		return -EINVAL;
>> +
>> +	arg->start = untagged_addr((unsigned long)arg->start);
>> +	arg->end = untagged_addr((unsigned long)arg->end);
>> +	arg->vec = untagged_addr((unsigned long)arg->vec);
>> +
>> +	/* Validate memory pointers */
>> +	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
>> +		return -EINVAL;
>> +	if (!access_ok((void __user *)arg->start, arg->end - arg->start))
>> +		return -EFAULT;
>> +	if (!arg->vec && arg->vec_len)
>> +		return -EFAULT;
>> +	if (arg->vec && !access_ok((void __user *)arg->vec,
>> +				   arg->vec_len * sizeof(struct page_region)))
>> +		return -EFAULT;
>> +
>> +	/* Fixup default values */
>> +	arg->end = ALIGN(arg->end, PAGE_SIZE);
>> +	if (!arg->max_pages)
>> +		arg->max_pages = ULONG_MAX;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
>> +				       unsigned long uargl)
>> +{
>> +	struct pm_scan_arg __user *uarg = (void __user *)uargl;
>> +
>> +	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
>> +		return -EFAULT;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
>> +{
>> +	if (!p->arg.vec_len) {
>> +		/*
>> +		 * An arbitrary non-page-aligned sentinel value for
>> +		 * pagemap_scan_push_range().
>> +		 */
>> +		p->cur_buf.start = p->cur_buf.end = ULLONG_MAX;
>> +		if (p->arg.vec)
>> +			p->vec_buf = ZERO_SIZE_PTR;
>> +		return 0;
>> +	}
>> +
>> +	/*
>> +	 * Allocate a smaller buffer to get output from inside the page
>> +	 * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
>> +	 * The last range is always stored in p.cur_buf to allow coalescing
>> +	 * consecutive ranges that have the same categories returned across
>> +	 * walk_page_range() calls.
>> +	 */
>> +	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
>> +			       p->arg.vec_len - 1);
>> +	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
>> +				   GFP_KERNEL);
>> +	if (!p->vec_buf)
>> +		return -ENOMEM;
>> +
>> +	p->vec_out = (struct page_region __user *)p->arg.vec;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
>> +{
>> +	const struct page_region *buf = p->vec_buf;
>> +	int n = (int)p->vec_buf_index;
>> +
>> +	if (!n)
>> +		return 0;
>> +
>> +	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
>> +		return -EFAULT;
>> +
>> +	p->arg.vec_len -= n;
>> +	p->vec_out += n;
>> +
>> +	p->vec_buf_index = 0;
>> +	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len - 1);
>> +
>> +	return n;
>> +}
>> +
>> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
>> +{
>> +	unsigned long walk_start, walk_end;
>> +	struct mmu_notifier_range range;
>> +	struct pagemap_scan_private p;
>> +	size_t n_ranges_out = 0;
>> +	int ret;
>> +
>> +	memset(&p, 0, sizeof(p));
>> +	ret = pagemap_scan_get_args(&p.arg, uarg);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = pagemap_scan_init_bounce_buffer(&p);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Protection change for the range is going to happen. */
>> +	if (p.arg.flags & PM_SCAN_WP_MATCHING) {
>> +		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
>> +					mm, p.arg.start, p.arg.end);
>> +		mmu_notifier_invalidate_range_start(&range);
>> +	}
>> +
>> +	walk_start = walk_end = p.arg.start;
>> +	for (; walk_end != p.arg.end; walk_start = walk_end) {
>> +		int n_out;
>> +
>> +		walk_end = min_t(unsigned long,
>> +				 (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK,
>> +				 p.arg.end);
>> +
> 
> 	if (fatal_signal_pending(current)) {
> 		ret = EINTR;
> 		break;
> 	}
> 
>> +		ret = mmap_read_lock_killable(mm);
>> +		if (ret)
>> +			break;
>> +		ret = walk_page_range(mm, walk_start, walk_end,
>> +				      &pagemap_scan_ops, &p);
>> +		mmap_read_unlock(mm);
>> +
>> +		n_out = pagemap_scan_flush_buffer(&p);
>> +		if (n_out < 0)
>> +			ret = n_out;
>> +		else
>> +			n_ranges_out += n_out;
>> +
>> +		if (ret)
>> +			break;
>> +	}
>> +
>> +	if (p.cur_buf.start != p.cur_buf.end) {
>> +		if (copy_to_user(p.vec_out, &p.cur_buf, sizeof(p.cur_buf)))
>> +			ret = -EFAULT;
>> +		else
>> +			++n_ranges_out;
>> +	}
>> +
>> +	/* ENOSPC signifies early stop (buffer full) from the walk. */
>> +	if (!ret || ret == -ENOSPC)
>> +		ret = n_ranges_out;
>> +
>> +	p.arg.walk_end = p.end_addr ? p.end_addr : walk_start;
>> +	if (pagemap_scan_writeback_args(&p.arg, uarg))
>> +		ret = -EFAULT;
>> +
>> +	if (p.arg.flags & PM_SCAN_WP_MATCHING)
>> +		mmu_notifier_invalidate_range_end(&range);
>> +
>> +	kfree(p.vec_buf);
>> +	return ret;
>> +}
>> +
>> +static long do_pagemap_cmd(struct file *file, unsigned int cmd,
>> +			   unsigned long arg)
>> +{
>> +	struct mm_struct *mm = file->private_data;
>> +
>> +	switch (cmd) {
>> +	case PAGEMAP_SCAN:
>> +		return do_pagemap_scan(mm, arg);
>> +
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +}
>> +
>>  const struct file_operations proc_pagemap_operations = {
>>  	.llseek		= mem_lseek, /* borrow this */
>>  	.read		= pagemap_read,
>>  	.open		= pagemap_open,
>>  	.release	= pagemap_release,
>> +	.unlocked_ioctl = do_pagemap_cmd,
>> +	.compat_ioctl	= do_pagemap_cmd,
>>  };
>>  #endif /* CONFIG_PROC_PAGE_MONITOR */
>>  
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 0a393bc02f25b..8f8ff07453f22 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -259,6 +259,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
>>  		unsigned long cp_flags);
>>  
>>  bool is_hugetlb_entry_migration(pte_t pte);
>> +bool is_hugetlb_entry_hwpoisoned(pte_t pte);
>>  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
>>  
>>  #else /* !CONFIG_HUGETLB_PAGE */
>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>> index b7b56871029c5..1bb3c625c2381 100644
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
>> @@ -305,4 +305,62 @@ typedef int __bitwise __kernel_rwf_t;
>>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
>>  			 RWF_APPEND)
>>  
>> +/* Pagemap ioctl */
>> +#define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
>> +
>> +/* Bits are set in flags of the page_region and masks in pm_scan_args */
>> +#define PAGE_IS_WPALLOWED	(1 << 0)
>> +#define PAGE_IS_WRITTEN		(1 << 1)
>> +#define PAGE_IS_FILE		(1 << 2)
>> +#define PAGE_IS_PRESENT		(1 << 3)
>> +#define PAGE_IS_SWAPPED		(1 << 4)
>> +#define PAGE_IS_PFNZERO		(1 << 5)
>> +
>> +/*
>> + * struct page_region - Page region with flags
>> + * @start:	Start of the region
>> + * @end:	End of the region (exclusive)
>> + * @categories:	PAGE_IS_* category bitmask for the region
>> + */
>> +struct page_region {
>> +	__u64 start;
>> +	__u64 end;
>> +	__u64 categories;
>> +};
>> +
>> +/* Flags for PAGEMAP_SCAN ioctl */
>> +#define PM_SCAN_WP_MATCHING	(1 << 0)	/* Write protect the pages matched. */
>> +#define PM_SCAN_CHECK_WPASYNC	(1 << 1)	/* Abort the scan when a non-WP-enabled page is found. */
>> +
>> +/*
>> + * struct pm_scan_arg - Pagemap ioctl argument
>> + * @size:		Size of the structure
>> + * @flags:		Flags for the IOCTL
>> + * @start:		Starting address of the region
>> + * @end:		Ending address of the region
>> + * @walk_end:		Ending address of the visited memory is returned
>> + *			(This helps if entire range hasn't been visited)
>> + * @vec:		Address of page_region struct array for output
>> + * @vec_len:		Length of the page_region struct array
>> + * @max_pages:		Optional limit for number of returned pages (0 = disabled)
>> + * @category_inverted:	PAGE_IS_* categories which values match if 0 instead of 1
>> + * @category_mask:	Skip pages for which any category doesn't match
>> + * @category_anyof_mask: Skip pages for which no category matches
>> + * @return_mask:	PAGE_IS_* categories that are to be reported in `page_region`s returned
>> + */
>> +struct pm_scan_arg {
>> +	__u64 size;
>> +	__u64 flags;
>> +	__u64 start;
>> +	__u64 end;
>> +	__u64 walk_end;
>> +	__u64 vec;
>> +	__u64 vec_len;
>> +	__u64 max_pages;
>> +	__u64 category_inverted;
>> +	__u64 category_mask;
>> +	__u64 category_anyof_mask;
>> +	__u64 return_mask;
>> +};
>> +
>>  #endif /* _UAPI_LINUX_FS_H */
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index a073e6ed8900b..3b07db0a4f2d9 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -5008,7 +5008,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
>>  	return false;
>>  }
>>  
>> -static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>> +bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>>  {
>>  	swp_entry_t swp;
>>  
>> -- 
>> 2.39.2
>> 

-- 
BR,
Muhammad Usama Anjum