[RFC 6/7] mm: extend process_madvise syscall to support vector array

Currently, the process_madvise syscall works on only a single address
range, so a user has to call the syscall several times to give hints for
multiple address ranges.

This patch extends the process_madvise syscall to support multiple
hints, address ranges and return values, so that a user can give all the
hints at once.

struct pr_madvise_param {
    int size;                       /* the size of this structure */
    const struct iovec __user *vec; /* address range array */
};

int process_madvise(int pidfd, ssize_t nr_elem,
		    const int *behavior,
		    struct pr_madvise_param *results,
		    struct pr_madvise_param *ranges,
		    unsigned long flags);

- pidfd

file descriptor referring to the target process (in the example below it
is obtained by opening /proc/<pid>)

- nr_elem

the number of elements in the behavior, results and ranges arrays

- behavior

array of hints, one per address range in the remote process, so that the
user can give a different hint to each range.

- results

array of buffers that receive the result for the hint applied to the
associated remote address range (a long per range; for MADV_COOL and
MADV_COLD it is the number of pages processed).

- ranges

array of iovecs describing the remote process's address ranges to be
processed.

- flags

reserved for future use. It must be zero for now.
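
There is no libc wrapper for this RFC syscall, so the prototype above has
to be built on top of syscall(2). A minimal sketch, assuming
__NR_process_madvise is the number assigned by the patched kernel and
struct pr_madvise_param mirrors the UAPI definition above:

static inline int process_madvise(int pidfd, ssize_t nr_elem,
                                  const int *behavior,
                                  struct pr_madvise_param *results,
                                  struct pr_madvise_param *ranges,
                                  unsigned long flags)
{
        return syscall(__NR_process_madvise, pidfd, nr_elem, behavior,
                       results, ranges, flags);
}

The example below skips the wrapper and issues the raw syscall directly.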

Example)

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>

/*
 * MADV_COOL, MADV_COLD and __NR_process_madvise come from the headers of
 * a kernel with this series applied; ALLOC_SIZE is an arbitrary size for
 * the demo.
 */
#define ALLOC_SIZE      (2 * 1024 * 1024)

/* mirrors the UAPI definition added by this patch */
struct pr_madvise_param {
        int size;
        const struct iovec *vec;
};

int main(int argc, char *argv[])
{
        struct pr_madvise_param retp, rangep;
        struct iovec result_vec[2], range_vec[2];
        int hints[2];
        long ret[2];
        void *addr[2];

        pid_t pid;
        char cmd[64] = {0,};

        addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
                       MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (MAP_FAILED == addr[0])
                return 1;

        addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
                       MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (MAP_FAILED == addr[1])
                return 1;

        /* MADV_COLD pages the first range out, MADV_COOL deactivates the second */
        hints[0] = MADV_COLD;
        range_vec[0].iov_base = addr[0];
        range_vec[0].iov_len = ALLOC_SIZE;
        result_vec[0].iov_base = &ret[0];
        result_vec[0].iov_len = sizeof(long);
        retp.vec = result_vec;
        retp.size = sizeof(struct pr_madvise_param);

        hints[1] = MADV_COOL;
        range_vec[1].iov_base = addr[1];
        range_vec[1].iov_len = ALLOC_SIZE;
        result_vec[1].iov_base = &ret[1];
        result_vec[1].iov_len = sizeof(long);
        rangep.vec = range_vec;
        rangep.size = sizeof(struct pr_madvise_param);

        pid = fork();
        if (!pid) {
                sleep(10);
        } else {
                /* the pidfd is an fd of the child's /proc/<pid> directory */
                snprintf(cmd, sizeof(cmd), "/proc/%d", pid);
                int pidfd = open(cmd, O_DIRECTORY | O_CLOEXEC);
                if (pidfd < 0)
                        return 1;

                /* munmap to make pages private for the child */
                munmap(addr[0], ALLOC_SIZE);
                munmap(addr[1], ALLOC_SIZE);
                system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
                if (syscall(__NR_process_madvise, pidfd, 2, hints,
                            &retp, &rangep, 0))
                        perror("process_madvise fail");
                system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
        }

        return 0;
}

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
 include/uapi/asm-generic/mman-common.h |   5 +
 mm/madvise.c                           | 184 +++++++++++++++++++++----
 2 files changed, 166 insertions(+), 23 deletions(-)

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index b9b51eeb8e1a..b8e230de84a6 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -74,4 +74,9 @@
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
 				 PKEY_DISABLE_WRITE)
 
+struct pr_madvise_param {
+	int size;			/* the size of this structure */
+	const struct iovec __user *vec;	/* address range array */
+};
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index af02aa17e5c1..f4f569dac2bd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 	struct page *page;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 		ptep_test_and_clear_young(vma, addr, pte);
 		deactivate_page(page);
+		nr_pages++;
+
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
+	*(long *)walk->private += nr_pages;
 	cond_resched();
 
 	return 0;
@@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
 
 static void madvise_cool_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
+			     unsigned long addr, unsigned long end,
+			     long *nr_pages)
 {
 	struct mm_walk cool_walk = {
 		.pmd_entry = madvise_cool_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
 }
 
 static long madvise_cool(struct vm_area_struct *vma,
-			unsigned long start_addr, unsigned long end_addr)
+			unsigned long start_addr, unsigned long end_addr,
+			long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	int isolated = 0;
 	struct vm_area_struct *vma = walk->vma;
 	unsigned long next;
+	long nr_pages = 0;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd)) {
@@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 		list_add(&page->lru, &page_list);
 		if (isolated >= SWAP_CLUSTER_MAX) {
 			pte_unmap_unlock(orig_pte, ptl);
-			reclaim_pages(&page_list);
+			nr_pages += reclaim_pages(&page_list);
 			isolated = 0;
 			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 			orig_pte = pte;
@@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 
 	pte_unmap_unlock(orig_pte, ptl);
-	reclaim_pages(&page_list);
+	nr_pages += reclaim_pages(&page_list);
 	cond_resched();
 
+	*(long *)walk->private += nr_pages;
 	return 0;
 }
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
 			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
+			     unsigned long addr, unsigned long end,
+			     long *nr_pages)
 {
 	struct mm_walk warm_walk = {
 		.pmd_entry = madvise_cold_pte_range,
 		.mm = vma->vm_mm,
+		.private = nr_pages,
 	};
 
 	tlb_start_vma(tlb, vma);
@@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
 
 
 static long madvise_cold(struct vm_area_struct *vma,
-			unsigned long start_addr, unsigned long end_addr)
+			unsigned long start_addr, unsigned long end_addr,
+			long *nr_pages)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_gather tlb;
@@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
-	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
 	tlb_finish_mmu(&tlb, start_addr, end_addr);
 
 	return 0;
@@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
 static long
 madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
 		struct vm_area_struct **prev, unsigned long start,
-		unsigned long end, int behavior)
+		unsigned long end, int behavior, long *nr_pages)
 {
 	switch (behavior) {
 	case MADV_REMOVE:
@@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_COOL:
-		return madvise_cool(vma, start, end);
+		return madvise_cool(vma, start, end, nr_pages);
 	case MADV_COLD:
-		return madvise_cold(vma, start, end);
+		return madvise_cold(vma, start, end, nr_pages);
 	case MADV_FREE:
 	case MADV_DONTNEED:
 		return madvise_dontneed_free(tsk, vma, prev, start,
@@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
 }
 
 static int madvise_core(struct task_struct *tsk, unsigned long start,
-			size_t len_in, int behavior)
+			size_t len_in, int behavior, long *nr_pages)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct *vma, *prev;
@@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
 
 	if (start & ~PAGE_MASK)
 		return error;
+
 	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
 
 	/* Check to see whether len was rounded up from small -ve to zero */
@@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
 	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
+		long pages = 0;
+
 		error = -ENOMEM;
 		if (!vma)
 			goto out;
@@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
 			tmp = end;
 
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
+		error = madvise_vma(tsk, vma, &prev, start, tmp,
+					behavior, &pages);
 		if (error)
 			goto out;
+		*nr_pages += pages;
 		start = tmp;
 		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
@@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
  */
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-	return madvise_core(current, start, len_in, behavior);
+	unsigned long dummy;
+
+	return madvise_core(current, start, len_in, behavior, &dummy);
 }
 
-SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
-		size_t, len_in, int, behavior)
+static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
+		struct pr_madvise_param *param)
+{
+	u32 size;
+	int ret;
+
+	memset(param, 0, sizeof(*param));
+
+	ret = get_user(size, &u_param->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+
+	if (!size || size > sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	ret = copy_from_user(param, u_param, size);
+	if (ret)
+		return -EFAULT;
+
+	return ret;
+}
+
+static int process_madvise_core(struct task_struct *tsk, int *behaviors,
+				struct iov_iter *iter,
+				const struct iovec *range_vec,
+				unsigned long riovcnt,
+				unsigned long flags)
+{
+	int i;
+	long err;
+
+	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
+		long ret = 0;
+
+		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
+				range_vec[i].iov_len, behaviors[i],
+				&ret);
+		if (err)
+			ret = err;
+
+		if (copy_to_iter(&ret, sizeof(long), iter) !=
+				sizeof(long)) {
+			err = -EFAULT;
+			break;
+		}
+
+		err = 0;
+	}
+
+	return err;
+}
+
+SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
+			const int __user *, hints,
+			struct pr_madvise_param __user *, results,
+			struct pr_madvise_param __user *, ranges,
+			unsigned long, flags)
 {
 	int ret;
 	struct fd f;
 	struct pid *pid;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
+	struct pr_madvise_param result_p, range_p;
+	const struct iovec __user *result_vec, __user *range_vec;
+	int *behaviors;
+	struct iovec iovstack_result[UIO_FASTIOV];
+	struct iovec iovstack_r[UIO_FASTIOV];
+	struct iovec *iov_l = iovstack_result;
+	struct iovec *iov_r = iovstack_r;
+	struct iov_iter iter;
+
+	if (flags != 0)
+		return -EINVAL;
+
+	ret = pr_madvise_copy_param(results, &result_p);
+	if (ret)
+		return ret;
+
+	ret = pr_madvise_copy_param(ranges, &range_p);
+	if (ret)
+		return ret;
+
+	result_vec = result_p.vec;
+	range_vec = range_p.vec;
+
+	if (result_p.size != sizeof(struct pr_madvise_param) ||
+			range_p.size != sizeof(struct pr_madvise_param))
+		return -EINVAL;
+
+	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
+	if (!behaviors)
+		return -ENOMEM;
+
+	ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
+	if (ret < 0)
+		goto free_behavior_vec;
+
+	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
+				&iov_l, &iter);
+	if (ret < 0)
+		goto free_behavior_vec;
+
+	if (!iov_iter_count(&iter)) {
+		ret = -EINVAL;
+		goto free_iovecs;
+	}
+
+	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
+				UIO_FASTIOV, iovstack_r, &iov_r);
+	if (ret <= 0)
+		goto free_iovecs;
 
 	f = fdget(pidfd);
-	if (!f.file)
-		return -EBADF;
+	if (!f.file) {
+		ret = -EBADF;
+		goto free_iovecs;
+	}
 
 	pid = pidfd_to_pid(f.file);
 	if (IS_ERR(pid)) {
 		ret = PTR_ERR(pid);
-		goto err;
+		goto put_fd;
 	}
 
 	ret = -EINVAL;
@@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 	tsk = pid_task(pid, PIDTYPE_PID);
 	if (!tsk) {
 		rcu_read_unlock();
-		goto err;
+		goto put_fd;
 	}
 	get_task_struct(tsk);
 	rcu_read_unlock();
@@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
 		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
 		if (ret == -EACCES)
 			ret = -EPERM;
-		goto err;
+		goto put_task;
 	}
-	ret = madvise_core(tsk, start, len_in, behavior);
+
+	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
+					nr_elem, flags);
 	mmput(mm);
+put_task:
 	put_task_struct(tsk);
-err:
+put_fd:
 	fdput(f);
+free_iovecs:
+	if (iov_r != iovstack_r)
+		kfree(iov_r);
+	kfree(iov_l);
+free_behavior_vec:
+	kfree(behaviors);
+
 	return ret;
 }
-- 
2.21.0.1020.gf2820cf01a-goog



