On Mon, 22 Jun 2020, Minchan Kim wrote: > diff --git a/mm/madvise.c b/mm/madvise.c > index 551ed816eefe..23abca3f93fa 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -17,6 +17,7 @@ > #include <linux/falloc.h> > #include <linux/fadvise.h> > #include <linux/sched.h> > +#include <linux/sched/mm.h> > #include <linux/ksm.h> > #include <linux/fs.h> > #include <linux/file.h> > @@ -995,6 +996,18 @@ madvise_behavior_valid(int behavior) > } > } > > +static bool > +process_madvise_behavior_valid(int behavior) > +{ > + switch (behavior) { > + case MADV_COLD: > + case MADV_PAGEOUT: > + return true; > + default: > + return false; > + } > +} > + > /* > * The madvise(2) system call. > * > @@ -1042,6 +1055,11 @@ madvise_behavior_valid(int behavior) > * MADV_DONTDUMP - the application wants to prevent pages in the given range > * from being included in its core dump. > * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. > + * MADV_COLD - the application is not expected to use this memory soon, > + * deactivate pages in this range so that they can be reclaimed > + * easily if memory pressure hanppens. > + * MADV_PAGEOUT - the application is not expected to use this memory soon, > + * page out the pages in this range immediately. > * > * return values: > * zero - success > @@ -1176,3 +1194,106 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > { > return do_madvise(current, current->mm, start, len_in, behavior); > } > + > +static int process_madvise_vec(struct task_struct *target_task, > + struct mm_struct *mm, struct iov_iter *iter, int behavior) > +{ > + struct iovec iovec; > + int ret = 0; > + > + while (iov_iter_count(iter)) { > + iovec = iov_iter_iovec(iter); > + ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base, > + iovec.iov_len, behavior); > + if (ret < 0) > + break; > + iov_iter_advance(iter, iovec.iov_len); > + } > + > + return ret; > +} > + > +static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, > + int behavior, unsigned int flags) > +{ > + ssize_t ret; > + struct pid *pid; > + struct task_struct *task; > + struct mm_struct *mm; > + size_t total_len = iov_iter_count(iter); > + > + if (flags != 0) > + return -EINVAL; > + > + pid = pidfd_get_pid(pidfd); > + if (IS_ERR(pid)) > + return PTR_ERR(pid); > + > + task = get_pid_task(pid, PIDTYPE_PID); > + if (!task) { > + ret = -ESRCH; > + goto put_pid; > + } > + > + if (task->mm != current->mm && > + !process_madvise_behavior_valid(behavior)) { > + ret = -EINVAL; > + goto release_task; > + } > + > + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > + if (IS_ERR_OR_NULL(mm)) { > + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > + goto release_task; > + } > mm is always task->mm right? I'm wondering if it would be better to find the mm directly in process_madvise_vec() rather than passing it into the function. I'm not sure why we'd pass both task and mm here. + > + ret = process_madvise_vec(task, mm, iter, behavior); > + if (ret >= 0) > + ret = total_len - iov_iter_count(iter); > + > + mmput(mm); > +release_task: > + put_task_struct(task); > +put_pid: > + put_pid(pid); > + return ret; > +} > + > +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, > + unsigned long, vlen, int, behavior, unsigned int, flags) I love the idea of adding the flags parameter here and I can think of an immediate use case for MADV_HUGEPAGE, which is overloaded. Today, MADV_HUGEPAGE controls enablement depending on system config and controls defrag behavior based on system config. It also cannot be opted out of without setting MADV_NOHUGEPAGE :) I was thinking of a flag that users could use to trigger an immediate collapse in process context regardless of the system config. So I'm a big advocate of this flags parameter and consider it an absolute must for the API. Acked-by: David Rientjes <rientjes@xxxxxxxxxx>