Re: [RFC 6/7] mm: extend process_madvise syscall to support vector arrary

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[Cc linux-api]

On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> Currently, process_madvise syscall works for only one address range
> so user should call the syscall several times to give hints to
> multiple address range.

Is that a problem? How big of a problem? Any numbers?

> This patch extends process_madvise syscall to support multiple
> hints, address ranges and return vaules so user could give hints
> all at once.
> 
> struct pr_madvise_param {
>     int size;                       /* the size of this structure */
>     const struct iovec __user *vec; /* address range array */
> }
> 
> int process_madvise(int pidfd, ssize_t nr_elem,
> 		    int *behavior,
> 		    struct pr_madvise_param *results,
> 		    struct pr_madvise_param *ranges,
> 		    unsigned long flags);
> 
> - pidfd
> 
> target process fd
> 
> - nr_elem
> 
> the number of elemenent of array behavior, results, ranges
> 
> - behavior
> 
> hints for each address range in remote process so that user could
> give different hints for each range.

What is the guarantee of a single call? Do all hints get applied or the
first failure backs of? What are the atomicity guarantees?

> 
> - results
> 
> array of buffers to get results for associated remote address range
> action.
> 
> - ranges
> 
> array to buffers to have remote process's address ranges to be
> processed
> 
> - flags
> 
> extra argument for the future. It should be zero this moment.
> 
> Example)
> 
> struct pr_madvise_param {
>         int size;
>         const struct iovec *vec;
> };
> 
> int main(int argc, char *argv[])
> {
>         struct pr_madvise_param retp, rangep;
>         struct iovec result_vec[2], range_vec[2];
>         int hints[2];
>         long ret[2];
>         void *addr[2];
> 
>         pid_t pid;
>         char cmd[64] = {0,};
>         addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[0])
>                 return 1;
> 
>         addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[1])
>                 return 1;
> 
>         hints[0] = MADV_COLD;
> 	range_vec[0].iov_base = addr[0];
>         range_vec[0].iov_len = ALLOC_SIZE;
>         result_vec[0].iov_base = &ret[0];
>         result_vec[0].iov_len = sizeof(long);
> 	retp.vec = result_vec;
>         retp.size = sizeof(struct pr_madvise_param);
> 
>         hints[1] = MADV_COOL;
>         range_vec[1].iov_base = addr[1];
>         range_vec[1].iov_len = ALLOC_SIZE;
>         result_vec[1].iov_base = &ret[1];
>         result_vec[1].iov_len = sizeof(long);
>         rangep.vec = range_vec;
>         rangep.size = sizeof(struct pr_madvise_param);
> 
>         pid = fork();
>         if (!pid) {
>                 sleep(10);
>         } else {
>                 int pidfd = open(cmd,  O_DIRECTORY | O_CLOEXEC);
>                 if (pidfd < 0)
>                         return 1;
> 
>                 /* munmap to make pages private for the child */
>                 munmap(addr[0], ALLOC_SIZE);
>                 munmap(addr[1], ALLOC_SIZE);
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>                 if (syscall(__NR_process_madvise, pidfd, 2, behaviors,
> 						&retp, &rangep, 0))
>                         perror("process_madvise fail\n");
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>         }
> 
>         return 0;
> }
> 
> Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> ---
>  include/uapi/asm-generic/mman-common.h |   5 +
>  mm/madvise.c                           | 184 +++++++++++++++++++++----
>  2 files changed, 166 insertions(+), 23 deletions(-)
> 
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index b9b51eeb8e1a..b8e230de84a6 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -74,4 +74,9 @@
>  #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
>  				 PKEY_DISABLE_WRITE)
>  
> +struct pr_madvise_param {
> +	int size;			/* the size of this structure */
> +	const struct iovec __user *vec;	/* address range array */
> +};
> +
>  #endif /* __ASM_GENERIC_MMAN_COMMON_H */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index af02aa17e5c1..f4f569dac2bd 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  	struct page *page;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  		ptep_test_and_clear_young(vma, addr, pte);
>  		deactivate_page(page);
> +		nr_pages++;
> +
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> +	*(long *)walk->private += nr_pages;
>  	cond_resched();
>  
>  	return 0;
> @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  static void madvise_cool_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk cool_walk = {
>  		.pmd_entry = madvise_cool_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
>  }
>  
>  static long madvise_cool(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	int isolated = 0;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  		list_add(&page->lru, &page_list);
>  		if (isolated >= SWAP_CLUSTER_MAX) {
>  			pte_unmap_unlock(orig_pte, ptl);
> -			reclaim_pages(&page_list);
> +			nr_pages += reclaim_pages(&page_list);
>  			isolated = 0;
>  			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  			orig_pte = pte;
> @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> -	reclaim_pages(&page_list);
> +	nr_pages += reclaim_pages(&page_list);
>  	cond_resched();
>  
> +	*(long *)walk->private += nr_pages;
>  	return 0;
>  }
>  
>  static void madvise_cold_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk warm_walk = {
>  		.pmd_entry = madvise_cold_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages,
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
>  
>  
>  static long madvise_cold(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
>  static long
>  madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  		struct vm_area_struct **prev, unsigned long start,
> -		unsigned long end, int behavior)
> +		unsigned long end, int behavior, long *nr_pages)
>  {
>  	switch (behavior) {
>  	case MADV_REMOVE:
> @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  	case MADV_WILLNEED:
>  		return madvise_willneed(vma, prev, start, end);
>  	case MADV_COOL:
> -		return madvise_cool(vma, start, end);
> +		return madvise_cool(vma, start, end, nr_pages);
>  	case MADV_COLD:
> -		return madvise_cold(vma, start, end);
> +		return madvise_cold(vma, start, end, nr_pages);
>  	case MADV_FREE:
>  	case MADV_DONTNEED:
>  		return madvise_dontneed_free(tsk, vma, prev, start,
> @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
>  }
>  
>  static int madvise_core(struct task_struct *tsk, unsigned long start,
> -			size_t len_in, int behavior)
> +			size_t len_in, int behavior, long *nr_pages)
>  {
>  	unsigned long end, tmp;
>  	struct vm_area_struct *vma, *prev;
> @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  
>  	if (start & ~PAGE_MASK)
>  		return error;
> +
>  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
>  
>  	/* Check to see whether len was rounded up from small -ve to zero */
> @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  	blk_start_plug(&plug);
>  	for (;;) {
>  		/* Still start < end. */
> +		long pages = 0;
> +
>  		error = -ENOMEM;
>  		if (!vma)
>  			goto out;
> @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  			tmp = end;
>  
>  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
> -		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
> +		error = madvise_vma(tsk, vma, &prev, start, tmp,
> +					behavior, &pages);
>  		if (error)
>  			goto out;
> +		*nr_pages += pages;
>  		start = tmp;
>  		if (prev && start < prev->vm_end)
>  			start = prev->vm_end;
> @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>   */
>  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>  {
> -	return madvise_core(current, start, len_in, behavior);
> +	unsigned long dummy;
> +
> +	return madvise_core(current, start, len_in, behavior, &dummy);
>  }
>  
> -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> -		size_t, len_in, int, behavior)
> +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
> +		struct pr_madvise_param *param)
> +{
> +	u32 size;
> +	int ret;
> +
> +	memset(param, 0, sizeof(*param));
> +
> +	ret = get_user(size, &u_param->size);
> +	if (ret)
> +		return ret;
> +
> +	if (size > PAGE_SIZE)
> +		return -E2BIG;
> +
> +	if (!size || size > sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	ret = copy_from_user(param, u_param, size);
> +	if (ret)
> +		return -EFAULT;
> +
> +	return ret;
> +}
> +
> +static int process_madvise_core(struct task_struct *tsk, int *behaviors,
> +				struct iov_iter *iter,
> +				const struct iovec *range_vec,
> +				unsigned long riovcnt,
> +				unsigned long flags)
> +{
> +	int i;
> +	long err;
> +
> +	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
> +		long ret = 0;
> +
> +		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
> +				range_vec[i].iov_len, behaviors[i],
> +				&ret);
> +		if (err)
> +			ret = err;
> +
> +		if (copy_to_iter(&ret, sizeof(long), iter) !=
> +				sizeof(long)) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		err = 0;
> +	}
> +
> +	return err;
> +}
> +
> +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
> +			const int __user *, hints,
> +			struct pr_madvise_param __user *, results,
> +			struct pr_madvise_param __user *, ranges,
> +			unsigned long, flags)
>  {
>  	int ret;
>  	struct fd f;
>  	struct pid *pid;
>  	struct task_struct *tsk;
>  	struct mm_struct *mm;
> +	struct pr_madvise_param result_p, range_p;
> +	const struct iovec __user *result_vec, __user *range_vec;
> +	int *behaviors;
> +	struct iovec iovstack_result[UIO_FASTIOV];
> +	struct iovec iovstack_r[UIO_FASTIOV];
> +	struct iovec *iov_l = iovstack_result;
> +	struct iovec *iov_r = iovstack_r;
> +	struct iov_iter iter;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	ret = pr_madvise_copy_param(results, &result_p);
> +	if (ret)
> +		return ret;
> +
> +	ret = pr_madvise_copy_param(ranges, &range_p);
> +	if (ret)
> +		return ret;
> +
> +	result_vec = result_p.vec;
> +	range_vec = range_p.vec;
> +
> +	if (result_p.size != sizeof(struct pr_madvise_param) ||
> +			range_p.size != sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
> +	if (!behaviors)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
> +				&iov_l, &iter);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	if (!iov_iter_count(&iter)) {
> +		ret = -EINVAL;
> +		goto free_iovecs;
> +	}
> +
> +	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
> +				UIO_FASTIOV, iovstack_r, &iov_r);
> +	if (ret <= 0)
> +		goto free_iovecs;
>  
>  	f = fdget(pidfd);
> -	if (!f.file)
> -		return -EBADF;
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto free_iovecs;
> +	}
>  
>  	pid = pidfd_to_pid(f.file);
>  	if (IS_ERR(pid)) {
>  		ret = PTR_ERR(pid);
> -		goto err;
> +		goto put_fd;
>  	}
>  
>  	ret = -EINVAL;
> @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  	tsk = pid_task(pid, PIDTYPE_PID);
>  	if (!tsk) {
>  		rcu_read_unlock();
> -		goto err;
> +		goto put_fd;
>  	}
>  	get_task_struct(tsk);
>  	rcu_read_unlock();
> @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
>  		if (ret == -EACCES)
>  			ret = -EPERM;
> -		goto err;
> +		goto put_task;
>  	}
> -	ret = madvise_core(tsk, start, len_in, behavior);
> +
> +	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
> +					nr_elem, flags);
>  	mmput(mm);
> +put_task:
>  	put_task_struct(tsk);
> -err:
> +put_fd:
>  	fdput(f);
> +free_iovecs:
> +	if (iov_r != iovstack_r)
> +		kfree(iov_r);
> +	kfree(iov_l);
> +free_behavior_vec:
> +	kfree(behaviors);
> +
>  	return ret;
>  }
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux