process_madvise() was conceived as a useful means for performing a vector of madvise() operations on a remote process's address space. However it's useful to be able to do so on the current process also. It is currently rather clunky to do this (requiring a pidfd to be opened for the current process) and introduces unnecessary overhead in incrementing reference counts for the task and mm. Avoid all of this by providing a PR_MADV_SELF flag, which causes process_madvise() to simply ignore the pidfd parameter and instead apply the operation to the current process. Since we are operating on our own process, no restrictions need be applied on behaviors we can perform, so do not limit these in that case. Also extend the case of a user specifying the current process via pidfd to not be restricted on behaviors which can be performed. Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/uapi/asm-generic/mman-common.h | 2 + mm/madvise.c | 66 ++++++++++++++++++-------- 6 files changed, 56 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 763929e814e9..0148e6de35ab 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -86,4 +86,6 @@ #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ PKEY_DISABLE_WRITE) +#define PR_MADV_SELF (1<<0) /* process_madvise() flag - apply to self */ + #endif /* __ALPHA_MMAN_H__ */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 9c48d9a21aa0..acb4c3bc92b2 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -113,4 +113,6 @@ #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ PKEY_DISABLE_WRITE) +#define PR_MADV_SELF (1<<0) /* process_madvise() flag - apply to self */ + #endif /* _ASM_MMAN_H */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 68c44f99bc93..0f839b2cad13 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -83,4 +83,6 @@ #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ PKEY_DISABLE_WRITE) +#define PR_MADV_SELF (1<<0) /* process_madvise() flag - apply to self */ + #endif /* __PARISC_MMAN_H__ */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 1ff0c858544f..37dd27d09251 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -121,4 +121,6 @@ #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ PKEY_DISABLE_WRITE) +#define PR_MADV_SELF (1<<0) /* process_madvise() flag - apply to self */ + #endif /* _XTENSA_MMAN_H */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..8f59f23dee09 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -87,4 +87,6 @@ #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ PKEY_DISABLE_WRITE) +#define PR_MADV_SELF (1<<0) /* process_madvise() flag - apply to self */ + #endif /* __ASM_GENERIC_MMAN_COMMON_H */ diff --git a/mm/madvise.c b/mm/madvise.c index ff139e57cca2..49d12f98b677 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return do_madvise(current->mm, start, len_in, behavior); } +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + + total_len = iov_iter_count(iter); + + while (iov_iter_count(iter)) { + ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), + iter_iov_len(iter), behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + + ret = (total_len - iov_iter_count(iter)) ? : ret; + + return ret; +} + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { @@ -1486,10 +1509,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; - if (flags != 0) { + if (flags & ~PR_MADV_SELF) { ret = -EINVAL; goto out; } @@ -1498,17 +1520,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, if (ret < 0) goto out; + /* + * Perform an madvise operation on the current process. No restrictions + * need be applied, nor do we need to pin the task or mm_struct. + */ + if (flags & PR_MADV_SELF) { + ret = vector_madvise(current->mm, &iter, behavior); + goto free_iov; + } + task = pidfd_get_task(pidfd, &f_flags); if (IS_ERR(task)) { ret = PTR_ERR(task); goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) { @@ -1516,26 +1542,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + /* + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. + */ + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * Require CAP_SYS_NICE for influencing process performance. Note that - * only non-destructive hints are currently supported. + * only non-destructive hints are currently supported for remote + * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); - } - - ret = (total_len - iov_iter_count(&iter)) ? : ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); -- 2.46.0