On 9/21/20 7:55 PM, Minchan Kim wrote: > On Mon, Sep 21, 2020 at 07:56:33AM +0100, Christoph Hellwig wrote: >> On Mon, Aug 31, 2020 at 05:06:33PM -0700, Minchan Kim wrote: >>> There is usecase that System Management Software(SMS) want to give a >>> memory hint like MADV_[COLD|PAGEEOUT] to other processes and in the >>> case of Android, it is the ActivityManagerService. >>> >>> The information required to make the reclaim decision is not known to >>> the app. Instead, it is known to the centralized userspace >>> daemon(ActivityManagerService), and that daemon must be able to >>> initiate reclaim on its own without any app involvement. >>> >>> To solve the issue, this patch introduces a new syscall process_madvise(2). >>> It uses pidfd of an external process to give the hint. It also supports >>> vector address range because Android app has thousands of vmas due to >>> zygote so it's totally waste of CPU and power if we should call the >>> syscall one by one for each vma.(With testing 2000-vma syscall vs >>> 1-vector syscall, it showed 15% performance improvement. I think it >>> would be bigger in real practice because the testing ran very cache >>> friendly environment). >> >> I'm really not sure this syscall is a good idea. If you want central >> control you should implement an IPC mechanisms that allows your >> supervisor daemon to tell the application to perform the madvice >> instead of forcing the behavior on it. > > There was dicussion about the approach. There were several issues. > One of them was the target app was already freezed and we wanted > to run the syscall in caller's context, not callee. > >> >>> /* >>> * The madvise(2) system call. >>> * >>> @@ -1036,6 +1049,11 @@ madvise_behavior_valid(int behavior) >>> * MADV_DONTDUMP - the application wants to prevent pages in the given range >>> * from being included in its core dump. >>> * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. >>> + * MADV_COLD - the application is not expected to use this memory soon, >>> + * deactivate pages in this range so that they can be reclaimed >>> + * easily if memory pressure hanppens. >>> + * MADV_PAGEOUT - the application is not expected to use this memory soon, >>> + * page out the pages in this range immediately. >> >> This should really go into a separate patch, as it has nothing to do >> with the new syscall. > > Technically, right but I expected it's not worth to have separate patch. > >> >>> +static int process_madvise_vec(struct mm_struct *mm, struct iov_iter *iter, int behavior) >>> +{ >>> + struct iovec iovec; >>> + int ret = 0; >>> + >>> + while (iov_iter_count(iter)) { >>> + iovec = iov_iter_iovec(iter); >>> + ret = do_madvise(mm, (unsigned long)iovec.iov_base, iovec.iov_len, behavior); >>> + if (ret < 0) >>> + break; >>> + iov_iter_advance(iter, iovec.iov_len); >>> + } >>> + >>> + return ret; >> >> Please avoid the entirely pointless overly long line. >> >>> +static inline int madv_import_iovec(int type, const struct iovec __user *uvec, unsigned int nr_segs, >>> + unsigned int fast_segs, struct iovec **iov, struct iov_iter *i) >>> +{ >>> +#ifdef CONFIG_COMPAT >>> + if (in_compat_syscall()) >>> + return compat_import_iovec(type, (struct compat_iovec __user *)uvec, nr_segs, >>> + fast_segs, iov, i); >>> +#endif >> >> More of the same. >> >>> +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, >>> + unsigned long, vlen, int, behavior, unsigned int, flags) >>> +{ >>> + ssize_t ret; >>> + struct iovec iovstack[UIO_FASTIOV]; >>> + struct iovec *iov = iovstack; >>> + struct iov_iter iter; >>> + >>> + ret = madv_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); >>> + if (ret < 0) >>> + return ret; >>> + >>> + ret = do_process_madvise(pidfd, &iter, behavior, flags); >>> + kfree(iov); >>> + return ret; >> >> Even more here. But more importantly there seems to be absolutely >> no reason for the madv_import_iovec and do_process_madvise helpers >> that both are tiny and have this even smaller function as the only >> caller. > > Fair enough. > > > Andrew, could you fold this patch? > Thank you. > > From 02d63c6b3f61a1085f4eab80f5171bd2627b5ab0 Mon Sep 17 00:00:00 2001 > From: Minchan Kim <minchan@xxxxxxxxxx> > Date: Mon, 21 Sep 2020 09:31:25 -0700 > Subject: [PATCH] mm: do not use helper functions for process_madvise > > This patch removes helper functions process_madvise_vec, > do_process_madvise and madv_import_iovec and use them inline. > > Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx> > --- > mm/madvise.c | 97 +++++++++++++++++++++++----------------------------- > 1 file changed, 43 insertions(+), 54 deletions(-) > > diff --git a/mm/madvise.c b/mm/madvise.c > index ae266dfede8a..aa8bc65dbdb6 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -1166,37 +1166,40 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > return do_madvise(current->mm, start, len_in, behavior); > } > > -static int process_madvise_vec(struct mm_struct *mm, struct iov_iter *iter, int behavior) > -{ > - struct iovec iovec; > - int ret = 0; > - > - while (iov_iter_count(iter)) { > - iovec = iov_iter_iovec(iter); > - ret = do_madvise(mm, (unsigned long)iovec.iov_base, iovec.iov_len, behavior); > - if (ret < 0) > - break; > - iov_iter_advance(iter, iovec.iov_len); > - } > - > - return ret; > -} > - > -static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, > - int behavior, unsigned int flags) > +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, > + size_t, vlen, int, behavior, unsigned int, flags) > { > ssize_t ret; > + struct iovec iovstack[UIO_FASTIOV], iovec; > + struct iovec *iov = iovstack; > + struct iov_iter iter; > struct pid *pid; > struct task_struct *task; > struct mm_struct *mm; > - size_t total_len = iov_iter_count(iter); > + size_t total_len; > > - if (flags != 0) > - return -EINVAL; > + if (flags != 0) { > + ret = -EINVAL; > + goto out; > + } > + > +#ifdef CONFIG_COMPAT > + if (in_compat_syscall()) > + ret = compat_import_iovec(READ, > + (struct compat_iovec __user *)vec, vlen, > + ARRAY_SIZE(iovstack), &iov, &iter); > + else > +#endif > + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), > + &iov, &iter); > + if (ret < 0) > + goto out; > > pid = pidfd_get_pid(pidfd); > - if (IS_ERR(pid)) > - return PTR_ERR(pid); > + if (IS_ERR(pid)) { > + ret = PTR_ERR(pid); > + goto free_iov; > + } > > task = get_pid_task(pid, PIDTYPE_PID); > if (!task) { > @@ -1216,43 +1219,29 @@ static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, > goto release_task; > } > > - ret = process_madvise_vec(mm, iter, behavior); > - if (ret >= 0) > - ret = total_len - iov_iter_count(iter); > + total_len = iov_iter_count(&iter); > + > + while (iov_iter_count(&iter)) { > + iovec = iov_iter_iovec(&iter); > + ret = do_madvise(mm, (unsigned long)iovec.iov_base, > + iovec.iov_len, behavior); > + if (ret < 0) > + break; > + iov_iter_advance(&iter, iovec.iov_len); > + } > + > + if (ret == 0) > + ret = total_len - iov_iter_count(&iter); > > mmput(mm); > + return ret; This "return ret;" seems quite wrong... I will send the following : diff --git a/mm/madvise.c b/mm/madvise.c index 416a56b8e757bf3465ab13cea51e0751ade2c745..cc9224a59e9fa07e41f9b4ad2e58b9c97889299b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1231,7 +1231,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, ret = total_len - iov_iter_count(&iter); mmput(mm); - return ret; release_task: put_task_struct(task); > + > release_task: > put_task_struct(task); > put_pid: > put_pid(pid); > - return ret; > -} > - > -static inline int madv_import_iovec(int type, const struct iovec __user *uvec, size_t nr_segs, > - unsigned int fast_segs, struct iovec **iov, struct iov_iter *i) > -{ > -#ifdef CONFIG_COMPAT > - if (in_compat_syscall()) > - return compat_import_iovec(type, (struct compat_iovec __user *)uvec, nr_segs, > - fast_segs, iov, i); > -#endif > - > - return import_iovec(type, uvec, nr_segs, fast_segs, iov, i); > -} > - > -SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, > - size_t, vlen, int, behavior, unsigned int, flags) > -{ > - ssize_t ret; > - struct iovec iovstack[UIO_FASTIOV]; > - struct iovec *iov = iovstack; > - struct iov_iter iter; > - > - ret = madv_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); > - if (ret < 0) > - return ret; > - > - ret = do_process_madvise(pidfd, &iter, behavior, flags); > +free_iov: > kfree(iov); > +out: > return ret; > } >