Re: [PATCH v26 2/5] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Jul 27, 2023 at 02:36:34PM +0500, Muhammad Usama Anjum wrote:

<snip>

> +
> +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
> +				       unsigned long addr, unsigned long end,
> +				       unsigned long end_addr)

It hard to figure out what difference between end and end_addr. I would
add a comment here.

> +{
> +	struct page_region *cur_buf = &p->cur_buf;
> +
> +	if (cur_buf->start != addr)
> +		cur_buf->end = addr;
> +	else
> +		cur_buf->start = cur_buf->end = 0;
> +
> +	p->end_addr = end_addr;
> +	p->found_pages -= (end - addr) / PAGE_SIZE;
> +}
> +
> +static int pagemap_scan_output(unsigned long categories,
> +			       struct pagemap_scan_private *p,
> +			       unsigned long addr, unsigned long *end)
> +{
> +	unsigned long n_pages, total_pages;
> +	int ret = 0;
> +
> +	if (!pagemap_scan_is_interesting_page(categories, p)) {
> +		*end = addr;
> +		return 0;
> +	}
> +
> +	if (!p->vec_buf)
> +		return 0;
> +
> +	categories &= p->arg.return_mask;
> +
> +	n_pages = (*end - addr) / PAGE_SIZE;
> +	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
> +	    total_pages > p->arg.max_pages) {

why do we need to use check_add_overflow here?

> +		size_t n_too_much = total_pages - p->arg.max_pages;

it is unsafe to use total_pages if check_add_overflow returns non-zero.

> +		*end -= n_too_much * PAGE_SIZE;
> +		n_pages -= n_too_much;
> +		ret = -ENOSPC;
> +	}
> +
> +	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
> +		*end = addr;
> +		n_pages = 0;
> +		ret = -ENOSPC;
> +	}
> +
> +	p->found_pages += n_pages;
> +	if (ret)
> +		p->end_addr = *end;
> +
> +	return ret;
> +}
> +
> +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long categories;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +
> +	ptl = pmd_trans_huge_lock(pmd, vma);
> +	if (!ptl)
> +		return -ENOENT;
> +
> +	categories = p->cur_vma_category | pagemap_thp_category(*pmd);
> +
> +	ret = pagemap_scan_output(categories, p, start, &end);
> +	if (start == end)
> +		goto out_unlock;
> +
> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
> +		goto out_unlock;
> +	if (~categories & PAGE_IS_WRITTEN)
> +		goto out_unlock;
> +
> +	/*
> +	 * Break huge page into small pages if the WP operation
> +	 * need to be performed is on a portion of the huge page.
> +	 */
> +	if (end != start + HPAGE_SIZE) {
> +		spin_unlock(ptl);
> +		split_huge_pmd(vma, pmd, start);
> +		pagemap_scan_backout_range(p, start, end, 0);

pagemap_scan_backout_range looks "weird"... imho, it makes the code
harder for understanding.

> +		return -ENOENT;

I think you need to add a comment that this ENOENT is a special case.

> +	}
> +
> +	make_uffd_wp_pmd(vma, start, pmd);
> +	flush_tlb_range(vma, start, end);
> +out_unlock:
> +	spin_unlock(ptl);
> +	return ret;
> +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
> +	return -ENOENT;
> +#endif
> +}
> +
> +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
> +				  unsigned long end, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	pte_t *pte, *start_pte;
> +	unsigned long addr;
> +	bool flush = false;
> +	spinlock_t *ptl;
> +	int ret;
> +
> +	arch_enter_lazy_mmu_mode();
> +
> +	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
> +	if (ret != -ENOENT) {
> +		arch_leave_lazy_mmu_mode();
> +		return ret;
> +	}
> +
> +	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
> +	if (!pte) {
> +		arch_leave_lazy_mmu_mode();
> +		walk->action = ACTION_AGAIN;
> +		return 0;
> +	}
> +
> +	for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
> +		unsigned long categories = p->cur_vma_category |
> +					   pagemap_page_category(vma, addr, ptep_get(pte));
> +		unsigned long next = addr + PAGE_SIZE;
> +
> +		ret = pagemap_scan_output(categories, p, addr, &next);
> +		if (next == addr) {
> +			if (!ret)
> +				continue;
> +			break;
> +		}
> +
> +		if (~p->arg.flags & PM_SCAN_WP_MATCHING)
> +			continue;
> +		if (~categories & PAGE_IS_WRITTEN)
> +			continue;
> +
> +		make_uffd_wp_pte(vma, addr, pte);
> +		if (!flush) {
> +			start = addr;
> +			flush = true;
> +		}
> +	}
> +
> +	if (flush)
> +		flush_tlb_range(vma, start, addr);
> +
> +	pte_unmap_unlock(start_pte, ptl);
> +	arch_leave_lazy_mmu_mode();
> +
> +	cond_resched();
> +	return ret;
> +}
> +
> +#ifdef CONFIG_HUGETLB_PAGE
> +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
> +				      unsigned long start, unsigned long end,
> +				      struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long categories;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +	pte_t pte;
> +
> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
> +		/* Go the short route when not write-protecting pages. */
> +
> +		pte = huge_ptep_get(ptep);
> +		categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
> +
> +		return pagemap_scan_output(categories, p, start, &end);
> +	}
> +
> +	i_mmap_lock_write(vma->vm_file->f_mapping);
> +	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
> +
> +	pte = huge_ptep_get(ptep);
> +	categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
> +
> +	ret = pagemap_scan_output(categories, p, start, &end);
> +	if (start == end)
> +		goto out_unlock;
> +
> +	if (~categories & PAGE_IS_WRITTEN)
> +		goto out_unlock;
> +
> +	if (end != start + HPAGE_SIZE) {
> +		/* Partial HugeTLB page WP isn't possible. */
> +		pagemap_scan_backout_range(p, start, end, start);
> +		ret = -EINVAL;

Why is it EINVAL in this case?

> +		goto out_unlock;
> +	}
> +
> +	make_uffd_wp_huge_pte(vma, start, ptep, pte);
> +	flush_hugetlb_tlb_range(vma, start, end);
> +
> +out_unlock:
> +	spin_unlock(ptl);
> +	i_mmap_unlock_write(vma->vm_file->f_mapping);
> +
> +	return ret;
> +}
> +#else
> +#define pagemap_scan_hugetlb_entry NULL
> +#endif
> +
> +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end,
> +				 int depth, struct mm_walk *walk)
> +{
> +	struct pagemap_scan_private *p = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	int ret, err;
> +
> +	if (!vma)
> +		return 0;
> +
> +	ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end);
> +	if (addr == end)
> +		return ret;
> +
> +	if (~p->arg.flags & PM_SCAN_WP_MATCHING)
> +		return ret;
> +
> +	err = uffd_wp_range(vma, addr, end - addr, true);
> +	if (err < 0)
> +		ret = err;
> +
> +	return ret;
> +}
> +
> +static const struct mm_walk_ops pagemap_scan_ops = {
> +	.test_walk = pagemap_scan_test_walk,
> +	.pmd_entry = pagemap_scan_pmd_entry,
> +	.pte_hole = pagemap_scan_pte_hole,
> +	.hugetlb_entry = pagemap_scan_hugetlb_entry,
> +};
> +
> +static int pagemap_scan_get_args(struct pm_scan_arg *arg,
> +				 unsigned long uarg)
> +{
> +	if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg)))
> +		return -EFAULT;
> +
> +	if (arg->size != sizeof(struct pm_scan_arg))
> +		return -EINVAL;
> +
> +	/* Validate requested features */
> +	if (arg->flags & ~PM_SCAN_FLAGS)
> +		return -EINVAL;
> +	if ((arg->category_inverted | arg->category_mask |
> +	     arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES)
> +		return -EINVAL;
> +
> +	arg->start = untagged_addr((unsigned long)arg->start);
> +	arg->end = untagged_addr((unsigned long)arg->end);
> +	arg->vec = untagged_addr((unsigned long)arg->vec);
> +
> +	/* Validate memory pointers */
> +	if (!IS_ALIGNED(arg->start, PAGE_SIZE))
> +		return -EINVAL;
> +	if (!access_ok((void __user *)arg->start, arg->end - arg->start))
> +		return -EFAULT;
> +	if (!arg->vec && arg->vec_len)
> +		return -EFAULT;
> +	if (arg->vec && !access_ok((void __user *)arg->vec,
> +			      arg->vec_len * sizeof(struct page_region)))
> +		return -EFAULT;
> +
> +	/* Fixup default values */
> +	arg->end = ALIGN(arg->end, PAGE_SIZE);
> +	if (!arg->max_pages)
> +		arg->max_pages = ULONG_MAX;
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg,
> +				       unsigned long uargl)
> +{
> +	struct pm_scan_arg __user *uarg	= (void __user *)uargl;
> +
> +	if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
> +{
> +	if (!p->arg.vec_len) {
> +		/*
> +		 * An arbitrary non-page-aligned sentinel value for
> +		 * pagemap_scan_push_range().
> +		 */
> +		p->cur_buf.start = p->cur_buf.end = ULLONG_MAX;
> +		if (p->arg.vec)
> +			p->vec_buf = ZERO_SIZE_PTR;
> +		return 0;
> +	}
> +
> +	/*
> +	 * Allocate a smaller buffer to get output from inside the page
> +	 * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
> +	 * The last range is always stored in p.cur_buf to allow coalescing
> +	 * consecutive ranges that have the same categories returned across
> +	 * walk_page_range() calls.
> +	 */
> +	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
> +			       p->arg.vec_len - 1);
> +	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
> +				   GFP_KERNEL);
> +	if (!p->vec_buf)
> +		return -ENOMEM;
> +
> +	p->vec_out = (struct page_region __user *)p->arg.vec;
> +
> +	return 0;
> +}
> +
> +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
> +{
> +	const struct page_region *buf = p->vec_buf;
> +	int n = (int)p->vec_buf_index;
> +
> +	if (!n)
> +		return 0;
> +
> +	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
> +		return -EFAULT;
> +
> +	p->arg.vec_len -= n;
> +	p->vec_out += n;
> +
> +	p->vec_buf_index = 0;
> +	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len - 1);
> +
> +	return n;
> +}
> +
> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
> +{
> +	unsigned long walk_start, walk_end;
> +	struct mmu_notifier_range range;
> +	struct pagemap_scan_private p;
> +	size_t n_ranges_out = 0;
> +	int ret;
> +
> +	memset(&p, 0, sizeof(p));
> +	ret = pagemap_scan_get_args(&p.arg, uarg);
> +	if (ret)
> +		return ret;
> +
> +	ret = pagemap_scan_init_bounce_buffer(&p);
> +	if (ret)
> +		return ret;
> +
> +	/* Protection change for the range is going to happen. */
> +	if (p.arg.flags & PM_SCAN_WP_MATCHING) {
> +		mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
> +					mm, p.arg.start, p.arg.end);
> +		mmu_notifier_invalidate_range_start(&range);
> +	}
> +
> +	walk_start = walk_end = p.arg.start;
> +	for (; walk_end != p.arg.end; walk_start = walk_end) {
> +		int n_out;
> +
> +		walk_end = min_t(unsigned long,
> +				 (walk_start + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK,
> +				 p.arg.end);
> +

if (fatal_signal_pending(current)) {
	ret = EINTR;
	break;
}

> +		ret = mmap_read_lock_killable(mm);
> +		if (ret)
> +			break;
> +		ret = walk_page_range(mm, walk_start, walk_end,
> +				      &pagemap_scan_ops, &p);
> +		mmap_read_unlock(mm);
> +
> +		n_out = pagemap_scan_flush_buffer(&p);
> +		if (n_out < 0)
> +			ret = n_out;
> +		else
> +			n_ranges_out += n_out;
> +
> +		if (ret)
> +			break;
> +	}
> +
> +	if (p.cur_buf.start != p.cur_buf.end) {
> +		if (copy_to_user(p.vec_out, &p.cur_buf, sizeof(p.cur_buf)))
> +			ret = -EFAULT;
> +		else
> +			++n_ranges_out;
> +	}
> +
> +	/* ENOSPC signifies early stop (buffer full) from the walk. */
> +	if (!ret || ret == -ENOSPC)
> +		ret = n_ranges_out;
> +
> +	p.arg.walk_end = p.end_addr ? p.end_addr : walk_start;
> +	if (pagemap_scan_writeback_args(&p.arg, uarg))
> +		ret = -EFAULT;
> +
> +	if (p.arg.flags & PM_SCAN_WP_MATCHING)
> +		mmu_notifier_invalidate_range_end(&range);
> +
> +	kfree(p.vec_buf);
> +	return ret;
> +}
> +
> +static long do_pagemap_cmd(struct file *file, unsigned int cmd,
> +			   unsigned long arg)
> +{
> +	struct mm_struct *mm = file->private_data;
> +
> +	switch (cmd) {
> +	case PAGEMAP_SCAN:
> +		return do_pagemap_scan(mm, arg);
> +
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
>  const struct file_operations proc_pagemap_operations = {
>  	.llseek		= mem_lseek, /* borrow this */
>  	.read		= pagemap_read,
>  	.open		= pagemap_open,
>  	.release	= pagemap_release,
> +	.unlocked_ioctl = do_pagemap_cmd,
> +	.compat_ioctl	= do_pagemap_cmd,
>  };
>  #endif /* CONFIG_PROC_PAGE_MONITOR */
>  
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 0a393bc02f25b..8f8ff07453f22 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -259,6 +259,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
>  		unsigned long cp_flags);
>  
>  bool is_hugetlb_entry_migration(pte_t pte);
> +bool is_hugetlb_entry_hwpoisoned(pte_t pte);
>  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
>  
>  #else /* !CONFIG_HUGETLB_PAGE */
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index b7b56871029c5..1bb3c625c2381 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -305,4 +305,62 @@ typedef int __bitwise __kernel_rwf_t;
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
>  			 RWF_APPEND)
>  
> +/* Pagemap ioctl */
> +#define PAGEMAP_SCAN	_IOWR('f', 16, struct pm_scan_arg)
> +
> +/* Bits are set in flags of the page_region and masks in pm_scan_args */
> +#define PAGE_IS_WPALLOWED	(1 << 0)
> +#define PAGE_IS_WRITTEN		(1 << 1)
> +#define PAGE_IS_FILE		(1 << 2)
> +#define PAGE_IS_PRESENT		(1 << 3)
> +#define PAGE_IS_SWAPPED		(1 << 4)
> +#define PAGE_IS_PFNZERO		(1 << 5)
> +
> +/*
> + * struct page_region - Page region with flags
> + * @start:	Start of the region
> + * @end:	End of the region (exclusive)
> + * @categories:	PAGE_IS_* category bitmask for the region
> + */
> +struct page_region {
> +	__u64 start;
> +	__u64 end;
> +	__u64 categories;
> +};
> +
> +/* Flags for PAGEMAP_SCAN ioctl */
> +#define PM_SCAN_WP_MATCHING	(1 << 0)	/* Write protect the pages matched. */
> +#define PM_SCAN_CHECK_WPASYNC	(1 << 1)	/* Abort the scan when a non-WP-enabled page is found. */
> +
> +/*
> + * struct pm_scan_arg - Pagemap ioctl argument
> + * @size:		Size of the structure
> + * @flags:		Flags for the IOCTL
> + * @start:		Starting address of the region
> + * @end:		Ending address of the region
> + * @walk_end:		Ending address of the visited memory is returned
> + *			(This helps if entire range hasn't been visited)
> + * @vec:		Address of page_region struct array for output
> + * @vec_len:		Length of the page_region struct array
> + * @max_pages:		Optional limit for number of returned pages (0 = disabled)
> + * @category_inverted:	PAGE_IS_* categories which values match if 0 instead of 1
> + * @category_mask:	Skip pages for which any category doesn't match
> + * @category_anyof_mask: Skip pages for which no category matches
> + * @return_mask:	PAGE_IS_* categories that are to be reported in `page_region`s returned
> + */
> +struct pm_scan_arg {
> +	__u64 size;
> +	__u64 flags;
> +	__u64 start;
> +	__u64 end;
> +	__u64 walk_end;
> +	__u64 vec;
> +	__u64 vec_len;
> +	__u64 max_pages;
> +	__u64 category_inverted;
> +	__u64 category_mask;
> +	__u64 category_anyof_mask;
> +	__u64 return_mask;
> +};
> +
>  #endif /* _UAPI_LINUX_FS_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index a073e6ed8900b..3b07db0a4f2d9 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5008,7 +5008,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
>  		return false;
>  }
>  
> -static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
> +bool is_hugetlb_entry_hwpoisoned(pte_t pte)
>  {
>  	swp_entry_t swp;
>  
> -- 
> 2.39.2
> 



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux