Re: [PATCH v29 2/6] fs/proc/task_mmu: Implement IOCTL to get and optionally clear info about PTEs

On 8/12/23 8:49 PM, Michał Mirosław wrote:
> On Fri, Aug 11, 2023 at 11:08:38PM +0500, Muhammad Usama Anjum wrote:
>> The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally
>> clear the info about page table entries. The following operations are supported
>> in this IOCTL:
>> - Scan the address range and get the memory ranges matching the provided criteria.
>>   This is performed by default when the output buffer is specified.
> 
> Nit: This is actually performed always, but you can disable the output part
> by passing {NULL, 0} for the buffer.
I'll update it to:
"This is performed when the output buffer is specified."

> 
> [...]
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -19,6 +19,8 @@
>>  #include <linux/shmem_fs.h>
>>  #include <linux/uaccess.h>
>>  #include <linux/pkeys.h>
>> +#include <linux/minmax.h>
>> +#include <linux/overflow.h>
>>  
>>  #include <asm/elf.h>
>>  #include <asm/tlb.h>
>> @@ -1749,11 +1751,682 @@ static int pagemap_release(struct inode *inode, struct file *file)
>>  	return 0;
>>  }
>>  
>> +#define PM_SCAN_CATEGORIES	(PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN |	\
>> +				 PAGE_IS_FILE |	PAGE_IS_PRESENT |	\
>> +				 PAGE_IS_SWAPPED | PAGE_IS_PFNZERO |	\
>> +				 PAGE_IS_HUGE)
>> +#define PM_SCAN_FLAGS		(PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
>> +
>> +struct pagemap_scan_private {
>> +	struct pm_scan_arg arg;
>> +	unsigned long masks_of_interest, cur_vma_category;
>> +	struct page_region *vec_buf;
>> +	unsigned long vec_buf_len, vec_buf_index, found_pages, walk_end_addr;
>> +	struct page_region __user *vec_out;
>> +};
> [...]
>> +static unsigned long pagemap_thp_category(pmd_t pmd)
>> +{
>> +	unsigned long categories = PAGE_IS_HUGE;
>> +
>> +	/*
>> +	 * THPs don't support file-backed memory. So PAGE_IS_FILE
>> +	 * hasn't been checked here.
> 
> "hasn't been" -> "is not"
> (same for HugeTLB comment)
I'll update.

> 
>> +static bool pagemap_scan_push_range(unsigned long categories,
>> +				    struct pagemap_scan_private *p,
>> +				    unsigned long addr, unsigned long end)
>> +{
>> +	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
>> +
>> +	/*
>> +	 * When there is no output buffer provided at all, the sentinel values
>> +	 * won't match here. There is no other way for `cur_buf->end` to be
>> +	 * non-zero other than it being non-empty.
>> +	 */
>> +	if (addr == cur_buf->end && categories == cur_buf->categories) {
>> +		cur_buf->end = end;
>> +		return true;
>> +	}
>> +
>> +	if (cur_buf->end) {
>> +		if (p->vec_buf_index >= p->vec_buf_len - 1)
>> +			return false;
>> +
>> +		cur_buf = &p->vec_buf[++p->vec_buf_index];
>> +	}
>> +
>> +	cur_buf->start = addr;
>> +	cur_buf->end = end;
>> +	cur_buf->categories = categories;
>> +
>> +	return true;
>> +}
>> +
>> +static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
>> +				       unsigned long addr, unsigned long end)
>> +{
>> +	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];
>> +
>> +	if (cur_buf->start != addr) {
>> +		cur_buf->end = addr;
>> +	} else {
>> +		cur_buf->start = cur_buf->end = 0;
>> +		if (p->vec_buf_index > 0)
>> +			p->vec_buf_index--;
> 
> There is no need to move to the previous index, as if the walk ends at
> this moment, the flush_buffer() code will ignore the empty last range.
Yeah, I'll update.
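Something like this (untested sketch, keeping the existing names):

static void pagemap_scan_backout_range(struct pagemap_scan_private *p,
				       unsigned long addr, unsigned long end)
{
	struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index];

	/* Either trim the current range or drop it if it starts here. */
	if (cur_buf->start != addr)
		cur_buf->end = addr;
	else
		cur_buf->start = cur_buf->end = 0;

	p->found_pages -= (end - addr) / PAGE_SIZE;
}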

> 
>> +	}
>> +
>> +	p->found_pages -= (end - addr) / PAGE_SIZE;
>> +}
>> +
>> +static int pagemap_scan_output(unsigned long categories,
>> +			       struct pagemap_scan_private *p,
>> +			       unsigned long addr, unsigned long *end)
>> +{
>> +	unsigned long n_pages, total_pages;
>> +	int ret = 0;
>> +
>> +	if (!p->vec_buf)
>> +		return 0;
>> +
>> +	categories &= p->arg.return_mask;
>> +
>> +	n_pages = (*end - addr) / PAGE_SIZE;
>> +	if (check_add_overflow(p->found_pages, n_pages, &total_pages) ||
>> +	    total_pages > p->arg.max_pages) {
>> +		size_t n_too_much = total_pages - p->arg.max_pages;
>> +		*end -= n_too_much * PAGE_SIZE;
>> +		n_pages -= n_too_much;
>> +		ret = -ENOSPC;
>> +	}
>> +
>> +	if (!pagemap_scan_push_range(categories, p, addr, *end)) {
>> +		*end = addr;
>> +		n_pages = 0;
>> +		ret = -ENOSPC;
>> +	}
>> +
>> +	p->found_pages += n_pages;
>> +	if (ret)
>> +		p->walk_end_addr = *end;
>> +
>> +	return ret;
>> +}
> [...]
>> +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p)
>> +{
>> +	if (!p->arg.vec_len)
>> +		return 0;
> 
> The removal of `cur_buf` lost the case of empty non-NULL output buffer
> passed in args.  That was requesting the walk to stop at first matching
> page (with the address returned in `walk_end`).  The push_range() call
> is still checking that, but since neither the buffer nor the sentinel
> values are set, the case is not possible to invoke.
Yes, that's why I've removed all that logic here. vec_len is set to 0 and
vec_buf to NULL, which handles all the cases.

> 
>> +	/*
>> +	 * Allocate a smaller buffer to get output from inside the page
>> +	 * walk functions and walk the range in PAGEMAP_WALK_SIZE chunks.
>> +	 */
> 
> I think this is no longer true? We can now allocate arbitrary number of
> entries, but should probably have at least 512 to cover one PMD of pages.
> So it would be better to have a constant that holds the number of
> entries in the bounce buffer.
I'll remove the comment. PAGEMAP_WALK_SIZE >> PAGE_SHIFT is already a
constant, just a fancy one.

Although if we can increase 512 to a bigger number, it'll be better in
terms of performance. I'm not sure how far we can increase it.
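If we do want a named constant, it could be as simple as this (the
PM_SCAN_BUF_PAGES name is just illustrative, not something in the patch):

/* Number of entries in the bounce buffer; 512 with 4K pages and PMD-sized walk chunks. */
#define PM_SCAN_BUF_PAGES	(PAGEMAP_WALK_SIZE >> PAGE_SHIFT)

	p->vec_buf_len = min_t(size_t, PM_SCAN_BUF_PAGES, p->arg.vec_len);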

> 
>> +	p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT,
>> +			       p->arg.vec_len);
>> +	p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf),
>> +				   GFP_KERNEL);
>> +	if (!p->vec_buf)
>> +		return -ENOMEM;
>> +
>> +	p->vec_buf[0].end = 0;
> 
> p->vec_buf->start = p->vec_buf->end = 0;
Sure.

> 
>> +	p->vec_out = (struct page_region __user *)p->arg.vec;
>> +
>> +	return 0;
>> +}
>> +
>> +static int pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
>> +{
>> +	const struct page_region *buf = p->vec_buf;
>> +	int n = (int)p->vec_buf_index;
> 
> Why do you need an `int` here (requiring a cast)?
I've just looked at it: n and the return type of pagemap_scan_flush_buffer()
should be long. Changed.

> 
>> +	if (p->arg.vec_len == 0)
>> +		return 0;
> 
> This should be actually `if (!buf)` as this notes that we don't have any
> buffer allocated (due to no output requested).
I'll update. !buf seems more reasonable.

> 
>> +	if (buf[n].end && buf[n].end != buf[n].start)
>> +		n++;
> 
> Testing `buf[n].end` is redundant, as the range is nonempty if
> `end != start`.
Sure.

> 
>> +	if (!n)
>> +		return 0;
>> +
>> +	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
>> +		return -EFAULT;
>> +
>> +	p->arg.vec_len -= n;
>> +	p->vec_out += n;
>> +
>> +	p->vec_buf_index = 0;
>> +	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
>> +	p->vec_buf[0].end = 0;
> 
> buf->start = buf->end = 0;
Sure.
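With the above changes folded in, the whole helper would look roughly like
this (untested sketch):

static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
{
	const struct page_region *buf = p->vec_buf;
	long n = p->vec_buf_index;

	/* No buffer allocated means no output was requested. */
	if (!buf)
		return 0;

	/* Count the last range if it is non-empty. */
	if (buf[n].end != buf[n].start)
		n++;

	if (!n)
		return 0;

	if (copy_to_user(p->vec_out, buf, n * sizeof(*buf)))
		return -EFAULT;

	p->arg.vec_len -= n;
	p->vec_out += n;

	p->vec_buf_index = 0;
	p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len);
	p->vec_buf[0].start = p->vec_buf[0].end = 0;

	return n;
}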

> 
>> +	return n;
>> +}
>> +
>> +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
>> +{
>> +	struct mmu_notifier_range range;
>> +	struct pagemap_scan_private p = {0};
>> +	unsigned long walk_start;
>> +	size_t n_ranges_out = 0;
>> +	int ret;
>> +
>> +	ret = pagemap_scan_get_args(&p.arg, uarg);
>> +	if (ret)
>> +		return ret;
>> +
>> +	p.masks_of_interest = p.arg.category_inverted | p.arg.category_mask |
>> +			      p.arg.category_anyof_mask | p.arg.return_mask;
> 
> `category_inverted` can be left out, because if a set bit is not also in one
> of the masks, then its value is going to be ignored.
Okay.

> 
> [...]
>> +	for (walk_start = p.arg.start; walk_start < p.arg.end;
>> +			walk_start = p.arg.walk_end) {
>> +		int n_out;
>> +
>> +		if (fatal_signal_pending(current)) {
>> +			ret = -EINTR;
>> +			break;
>> +		}
>> +
>> +		ret = mmap_read_lock_killable(mm);
>> +		if (ret)
>> +			break;
>> +		ret = walk_page_range(mm, walk_start, p.arg.end,
>> +				      &pagemap_scan_ops, &p);
>> +		mmap_read_unlock(mm);
>> +
>> +		n_out = pagemap_scan_flush_buffer(&p);
>> +		if (n_out < 0)
>> +			ret = n_out;
>> +		else
>> +			n_ranges_out += n_out;
>> +
>> +		p.walk_end_addr = p.walk_end_addr ? p.walk_end_addr : p.arg.end;
> 
> Why is `p.walk_end_addr` needed? It is not used in the loop code. Shouldn't
> it be `p.arg.walk_end` as used in the `for` loop continuation statement?
It isn't needed for the loop itself, but we need to record the ending address
of the walk. We can switch to using p.arg.walk_end, which makes more sense
logically. I'll update the code.

> 
>> +		if (ret != -ENOSPC || p.arg.vec_len == 0 ||
>> +		    p.found_pages == p.arg.max_pages)
>> +			break;
> 
> Nit: I think you could split this into two or three separate `if (x)
> break;` for easier reading. The `vec_len` and `found_pages` are
> buffer-full tests, so could go along, but `ret != ENOSPC` is checking an
> error condition aborting the scan before it ends.
Can be done.
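For example (sketch, same logic as before):

		/* The walk completed or failed with a real error: stop. */
		if (ret != -ENOSPC)
			break;

		/* Output exhausted: no space for more ranges or the page limit was hit. */
		if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages)
			break;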

> 
>> +	}
>> +
>> +	/* ENOSPC signifies early stop (buffer full) from the walk. */
>> +	if (!ret || ret == -ENOSPC)
>> +		ret = n_ranges_out;
>> +
>> +	p.arg.walk_end = p.walk_end_addr ? p.walk_end_addr : walk_start;
>> +	if (pagemap_scan_writeback_args(&p.arg, uarg))
>> +		ret = -EFAULT;
> [...]
>> --- a/include/uapi/linux/fs.h
>> +++ b/include/uapi/linux/fs.h
> [...]
>> +/*
>> + * struct pm_scan_arg - Pagemap ioctl argument
>> + * @size:		Size of the structure
>> + * @flags:		Flags for the IOCTL
>> + * @start:		Starting address of the region
>> + * @end:		Ending address of the region
>> + * @walk_end		Address where the scan stopped (written by kernel).
>> + *			walk_end == end (tag removed) informs that the scan completed on entire range.
> 
> I'm not sure `tag removed` is enough to know what tag was removed.
> Maybe something like "with address tags cleared" would fit?
Okay.

> 
> Best Regards
> Michał Mirosław

-- 
BR,
Muhammad Usama Anjum



