MADV_COLLAPSE is the first advice that makes use of process_madvise() flags. Add the necessary plumbing to make the flags available from do_madvise() handlers. For MADV_COLLAPSE, the added flags are: * MADV_F_COLLAPSE_LIMITS - controls whether collapse respects the khugepaged/max_ptes_* limits (omitting this flag, i.e. ignoring the limits, requires CAP_SYS_ADMIN if not acting on self) * MADV_F_COLLAPSE_DEFRAG - force enable defrag, despite vma or system settings. These two flags together provide userspace flexibility in defining separate policies for synchronous userspace-directed collapse, and asynchronous kernel (khugepaged) collapse. Signed-off-by: Zach O'Keefe <zokeefe@xxxxxxxxxx> --- fs/io_uring.c | 3 +- include/linux/huge_mm.h | 3 +- include/linux/mm.h | 3 +- include/uapi/asm-generic/mman-common.h | 8 +++++ mm/khugepaged.c | 7 +++-- mm/madvise.c | 42 ++++++++++++++------------ 6 files changed, 41 insertions(+), 25 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 23e7f93d3956..8558b7549431 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4720,7 +4720,8 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice, + MADV_F_NONE); if (ret < 0) req_set_fail(req); io_req_complete(req, ret); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 407b63ab4185..31f514ff36be 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -228,7 +228,8 @@ int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + unsigned int flags); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct 
*vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index dc69d2a69912..f4776f4cda48 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2690,7 +2690,8 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf, bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); -extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); +extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, + int behavior, unsigned int flags); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432..b81f4b1b18ba 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,14 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +/* process_madvise() flags */ +#define MADV_F_NONE 0x0 + +/* process_madvise(MADV_COLLAPSE) flags */ +#define MADV_F_COLLAPSE_LIMITS 0x1 /* respect system khugepaged/max_ptes_* sysfs limits */ +#define MADV_F_COLLAPSE_DEFRAG 0x2 /* force enable sync collapse + reclaim */ +#define MADV_F_COLLAPSE_MASK (MADV_F_COLLAPSE_LIMITS | MADV_F_COLLAPSE_DEFRAG) + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e8156f15a3da..993de0c6eaa9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2942,7 +2942,7 @@ static int _madvise_collapse(struct mm_struct *mm, int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end) + unsigned long end, unsigned int flags) { struct collapse_control cc; gfp_t gfp; @@ -2953,8 +2953,9 @@ int madvise_collapse(struct vm_area_struct *vma, mmap_assert_locked(mm); mmgrab(mm); - collapse_control_init(&cc, /* enforce_pte_scan_limits= */ false); - gfp = vma_thp_gfp_mask(vma); + 
collapse_control_init(&cc, flags & MADV_F_COLLAPSE_LIMITS); + gfp = vma_thp_gfp_mask(vma) | (flags & MADV_F_COLLAPSE_DEFRAG + ? __GFP_DIRECT_RECLAIM : 0); lru_add_drain(); /* lru_add_drain_all() too heavy here */ error = _madvise_collapse(mm, vma, prev, start, end, gfp, &cc); mmap_assert_locked(mm); diff --git a/mm/madvise.c b/mm/madvise.c index 292aa017c150..7d094d86d2f1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -979,7 +979,7 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + unsigned long behavior, unsigned int flags) { int error; struct anon_vma_name *anon_name; @@ -1048,7 +1048,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, goto out; break; case MADV_COLLAPSE: - return madvise_collapse(vma, prev, start, end); + return madvise_collapse(vma, prev, start, end, flags); } anon_name = anon_vma_name(vma); @@ -1160,13 +1160,19 @@ madvise_behavior_valid(int behavior) } static bool -process_madvise_behavior_valid(int behavior) +process_madvise_behavior_valid(int behavior, struct task_struct *task, + unsigned int flags) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: - return true; + return flags == 0; + case MADV_COLLAPSE: + return (flags & ~MADV_F_COLLAPSE_MASK) == 0 && + (capable(CAP_SYS_ADMIN) || + (task == current) || + (flags & MADV_F_COLLAPSE_LIMITS)); default: return false; } @@ -1182,10 +1188,11 @@ process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, unsigned long arg, unsigned int flags, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, unsigned long arg, + unsigned int flags)) { struct vm_area_struct *vma; struct 
vm_area_struct *prev; @@ -1222,7 +1229,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = visit(vma, &prev, start, tmp, arg); + error = visit(vma, &prev, start, tmp, arg, flags); if (error) return error; start = tmp; @@ -1285,7 +1292,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, return 0; return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, - madvise_vma_anon_name); + madvise_vma_anon_name, MADV_F_NONE); } #endif /* CONFIG_ANON_VMA_NAME */ /* @@ -1359,7 +1366,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, + int behavior, unsigned int flags) { unsigned long end; int error; @@ -1401,8 +1409,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh } blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); + error = madvise_walk_vmas(mm, start, end, behavior, flags, + madvise_vma_behavior); blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); @@ -1414,7 +1422,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { - return do_madvise(current->mm, start, len_in, behavior); + return do_madvise(current->mm, start, len_in, behavior, + MADV_F_NONE); } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, @@ -1429,11 +1438,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t total_len; unsigned int f_flags; - if (flags != 0) { - ret = -EINVAL; - goto out; - } - ret = 
import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; @@ -1444,7 +1448,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { + if (!process_madvise_behavior_valid(behavior, task, flags)) { ret = -EINVAL; goto release_task; } @@ -1470,7 +1474,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, while (iov_iter_count(&iter)) { iovec = iov_iter_iovec(&iter); ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + iovec.iov_len, behavior, flags); if (ret < 0) break; iov_iter_advance(&iter, iovec.iov_len); -- 2.35.1.616.g0bdcbb4464-goog