On Fri, Feb 28, 2020 at 02:14:56PM -0800, Suren Baghdasaryan wrote: < snip > > > > diff --git a/mm/madvise.c b/mm/madvise.c > > > index f75c86b6c463..f29155b8185d 100644 > > > --- a/mm/madvise.c > > > +++ b/mm/madvise.c > > > @@ -17,6 +17,7 @@ > > > #include <linux/falloc.h> > > > #include <linux/fadvise.h> > > > #include <linux/sched.h> > > > +#include <linux/sched/mm.h> > > > #include <linux/ksm.h> > > > #include <linux/fs.h> > > > #include <linux/file.h> > > > @@ -986,6 +987,18 @@ madvise_behavior_valid(int behavior) > > > } > > > } > > > > > > +static bool > > > +process_madvise_behavior_valid(int behavior) > > > +{ > > > + switch (behavior) { > > > + case MADV_COLD: > > > + case MADV_PAGEOUT: > > > + return true; > > > + default: > > > + return false; > > > + } > > > +} > > > + > > > /*/ > > > * The madvise(2) system call. > > > * > > > @@ -1033,6 +1046,11 @@ madvise_behavior_valid(int behavior) > > > * MADV_DONTDUMP - the application wants to prevent pages in the given range > > > * from being included in its core dump. > > > * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. > > > + * MADV_COLD - the application uses the memory less so the kernel can > > "kernel can" implies that kernel might not deactivate the pages, which > IIUC is not the case. Maybe rephrase as "MADV_COLD - the application > is not expected to use this memory soon, deactivate pages in this > range so that they can be reclaimed easily if memory pressure > happens." That is much better. > > > > + * deactivate the memory to evict them quickly when the memory > > > + * pressure happen. > > > + * MADV_PAGEOUT - the application uses the memroy very rarely so kernel can > > s/memroy/memory Fixed. > > > > + * page out the memory instantly. > > same nit about the usage of "kernel can". Maybe rephrase as > "MADV_PAGEOUT - the application is not expected to use this memory > soon, page out the pages in this range immediately." Yup. 
> > > > * > > > * return values: > > > * zero - success > > > @@ -1150,3 +1168,49 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > > > { > > > return do_madvise(current, current->mm, start, len_in, behavior); > > > } > > > + > > > +SYSCALL_DEFINE5(process_madvise, int, pidfd, unsigned long, start, > > > + size_t, len_in, int, behavior, unsigned long, flags) > > > +{ > > > + int ret; > > > + struct fd f; > > > + struct pid *pid; > > > + struct task_struct *task; > > > + struct mm_struct *mm; > > > + > > > + if (flags != 0) > > > + return -EINVAL; > > > + > > > + if (!process_madvise_behavior_valid(behavior)) > > > + return -EINVAL; > > > + > > > + f = fdget(pidfd); > > > + if (!f.file) > > > + return -EBADF; > > > + > > > + pid = pidfd_pid(f.file); > > > + if (IS_ERR(pid)) { > > > + ret = PTR_ERR(pid); > > > + goto fdput; > > > + } > > > + > > > + task = get_pid_task(pid, PIDTYPE_PID); > > > + if (!task) { > > > + ret = -ESRCH; > > > + goto fdput; > > > + } > > > + > > > + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > > > + if (IS_ERR_OR_NULL(mm)) { > > > + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > > > + goto release_task; > > > + } > > > + > > > + ret = do_madvise(task, mm, start, len_in, behavior); > > > + mmput(mm); > > > +release_task: > > > + put_task_struct(task); > > > +fdput: > > > + fdput(f); > > > + return ret; > > > +} > > > -- > > > 2.25.0.265.gbab2e86ba0-goog > > > > > > > Reviewed-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> Thanks, Suren!