The process_madvise() call was introduced in commit ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") as a means of performing madvise() operations on another process. However, as it provides the means by which to perform multiple madvise() operations in a batch via an iovec, it is useful to utilise the same interface for performing operations on the current process rather than a remote one. Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check if same mm") removed the need for a caller invoking process_madvise() on its own pidfd to possess the CAP_SYS_NICE capability, however this leaves the restrictions on operation in place. Resolve this by only applying the restriction on operations when accessing a remote process. Moving forward we plan to implement a simpler means of specifying this condition other than needing to establish a self pidfd, perhaps in the form of a sentinel pidfd. Also take the opportunity to refactor the system call implementation abstracting the vectorised operation. Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> --- v3: * Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later. v2: * Fix silly mistake referencing unassigned mm variable. * Add PR_MADV_SELF to architecture-specific mman headers. https://lore.kernel.org/linux-mm/9cf0e96d-287f-4749-ba85-8414c06aa54c@lucifer.local/ v1: https://lore.kernel.org/all/cover.1727106751.git.lorenzo.stoakes@xxxxxxxxxx/ mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 50d223ab3894..e871a72a6c32 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return do_madvise(current->mm, start, len_in, behavior); } +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + + total_len = iov_iter_count(iter); + + while (iov_iter_count(iter)) { + ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), + iter_iov_len(iter), behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + + ret = (total_len - iov_iter_count(iter)) ? : ret; + + return ret; +} + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { @@ -1486,7 +1509,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; if (flags != 0) { @@ -1504,11 +1526,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { @@ -1516,26 +1533,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + /* + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. + */ + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * Require CAP_SYS_NICE for influencing process performance. Note that - * only non-destructive hints are currently supported. + * only non-destructive hints are currently supported for remote + * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); - } - - ret = (total_len - iov_iter_count(&iter)) ? : ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); -- 2.46.0