[PATCH 1/2] mm/madvise: introduce PR_MADV_SELF flag to process_madvise()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



process_madvise() was conceived as a useful means for performing a vector
of madvise() operations on a remote process's address space.

However it's useful to be able to do so on the current process also. It is
currently rather clunky to do this (requiring a pidfd to be opened for the
current process) and introduces unnecessary overhead in incrementing
reference counts for the task and mm.

Avoid all of this by providing a PR_MADV_SELF flag, which causes
process_madvise() to simply ignore the pidfd parameter and instead apply
the operation to the current process.

Since we are operating on our own process, no restrictions need be applied
on behaviors we can perform, so do not limit these in that case.

Also extend the case of a user specifying the current process via pidfd to
not be restricted on behaviors which can be performed.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx>
---
 include/uapi/asm-generic/mman-common.h |  2 +
 mm/madvise.c                           | 58 +++++++++++++++++++-------
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
index 6ce1f1ceb432..8f59f23dee09 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -87,4 +87,6 @@
 #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
 				 PKEY_DISABLE_WRITE)
 
+#define PR_MADV_SELF	(1<<0)		/* process_madvise() flag - apply to self */
+
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */
diff --git a/mm/madvise.c b/mm/madvise.c
index ff139e57cca2..549b36d1463c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior)
 	}
 }
 
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
 {
 	switch (behavior) {
 	case MADV_COLD:
@@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	return do_madvise(current->mm, start, len_in, behavior);
 }
 
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+			      int behavior)
+{
+	ssize_t ret = 0;
+	size_t total_len;
+
+	total_len = iov_iter_count(iter);
+
+	while (iov_iter_count(iter)) {
+		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+				 iter_iov_len(iter), behavior);
+		if (ret < 0)
+			break;
+		iov_iter_advance(iter, iter_iov_len(iter));
+	}
+
+	ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+	return ret;
+}
+
 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 		size_t, vlen, int, behavior, unsigned int, flags)
 {
@@ -1486,10 +1509,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	struct iov_iter iter;
 	struct task_struct *task;
 	struct mm_struct *mm;
-	size_t total_len;
 	unsigned int f_flags;
 
-	if (flags != 0) {
+	if (flags & ~PR_MADV_SELF) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1498,13 +1520,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Perform an madvise operation on the current process. No restrictions
+	 * need be applied, nor do we need to pin the task or mm_struct.
+	 */
+	if (flags & PR_MADV_SELF) {
+		ret = vector_madvise(current->mm, &iter, behavior);
+		goto free_iov;
+	}
+
 	task = pidfd_get_task(pidfd, &f_flags);
 	if (IS_ERR(task)) {
 		ret = PTR_ERR(task);
 		goto free_iov;
 	}
 
-	if (!process_madvise_behavior_valid(behavior)) {
+	/*
+	 * We need only perform this check if we are attempting to manipulate a
+	 * remote process's address space.
+	 */
+	if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
 		ret = -EINVAL;
 		goto release_task;
 	}
@@ -1518,24 +1553,15 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
 
 	/*
 	 * Require CAP_SYS_NICE for influencing process performance. Note that
-	 * only non-destructive hints are currently supported.
+	 * only non-destructive hints are currently supported for remote
+	 * processes.
 	 */
 	if (mm != current->mm && !capable(CAP_SYS_NICE)) {
 		ret = -EPERM;
 		goto release_mm;
 	}
 
-	total_len = iov_iter_count(&iter);
-
-	while (iov_iter_count(&iter)) {
-		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
-					iter_iov_len(&iter), behavior);
-		if (ret < 0)
-			break;
-		iov_iter_advance(&iter, iter_iov_len(&iter));
-	}
-
-	ret = (total_len - iov_iter_count(&iter)) ? : ret;
+	ret = vector_madvise(mm, &iter, behavior);
 
 release_mm:
 	mmput(mm);
-- 
2.46.0





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux