From: Nadav Amit <namit@xxxxxxxxxx> There are certain operations that can be performed only once on process_madvise() instead of performing them for each IO vector. Acquiring the mmap-lock, and initializing blk_plug are specifically such operations. Collect the aforementioned operations into madvise_start() and madvise_finish(). The next patches will add additional operations into these functions. Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Minchan Kim <minchan@xxxxxxxxxx> Cc: Colin Cross <ccross@xxxxxxxxxx> Cc: Suren Baghdasarya <surenb@xxxxxxxxxx> Cc: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> --- mm/madvise.c | 139 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 53 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 127507c71ba9..84b86ae85671 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -43,6 +43,13 @@ struct madvise_walk_private { struct madvise_info { u8 behavior_valid: 1; u8 process_behavior_valid: 1; + u8 no_mmap_lock: 1; + + /* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ u8 need_mmap_read_only: 1; }; @@ -120,9 +127,11 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = { #ifdef CONFIG_MEMORY_FAILURE [MADV_HWPOISON] = { .behavior_valid = 1, + .no_mmap_lock = 1, }, [MADV_SOFT_OFFLINE] = { .behavior_valid = 1, + .no_mmap_lock = 1, }, #endif [MADV_POPULATE_READ] = { @@ -135,16 +144,6 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = { }, }; -/* - * Any behaviour which results in changes to the vma->vm_flags needs to - * take mmap_lock for writing. Others, which simply traverse vmas, need - * to only take it for reading. - */ -static int madvise_need_mmap_write(int behavior) -{ - return !madvise_info[behavior].need_mmap_read_only; -} - /* * We can potentially split a vm area into separate * areas, each area with its own behavior. @@ -1081,26 +1080,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static bool -madvise_behavior_valid(int *behavior) -{ - if (*behavior >= ARRAY_SIZE(madvise_info)) - return false; - - *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info)); - return madvise_info[*behavior].behavior_valid; -} - -static bool -process_madvise_behavior_valid(int *behavior) -{ - if (*behavior >= ARRAY_SIZE(madvise_info)) - return false; - - *behavior = array_index_nospec(*behavior, ARRAY_SIZE(madvise_info)); - return madvise_info[*behavior].process_behavior_valid; -} - /* * The madvise(2) system call. * @@ -1171,21 +1150,17 @@ process_madvise_behavior_valid(int *behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) +int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in, + int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; int unmapped_error = 0; int error = -EINVAL; - int write; size_t len; - struct blk_plug plug; start = untagged_addr(start); - if (!madvise_behavior_valid(&behavior)) - return error; - if (!PAGE_ALIGNED(start)) return error; len = PAGE_ALIGN(len_in); @@ -1207,14 +1182,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh return madvise_inject_error(behavior, start, start + len_in); #endif - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - /* * If the interval [start,end) covers some unmapped address * ranges, just ignore them, but return -ENOMEM at the end. @@ -1224,7 +1191,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (vma && start > vma->vm_start) prev = vma; - blk_start_plug(&plug); for (;;) { /* Still start < end. */ error = -ENOMEM; @@ -1260,15 +1226,72 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh vma = find_vma(mm, start); } out: - blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); return error; } +static int +madvise_start(struct mm_struct *mm, struct madvise_info behavior_info, + struct blk_plug *plug) +{ + if (!behavior_info.no_mmap_lock) { + if (behavior_info.need_mmap_read_only) + mmap_read_lock(mm); + else if (mmap_write_lock_killable(mm)) + return -EINTR; + } + + blk_start_plug(plug); + return 0; +} + +static void +madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info, + struct blk_plug *plug) +{ + blk_finish_plug(plug); + + if (!behavior_info.no_mmap_lock) { + if (behavior_info.need_mmap_read_only) + mmap_read_unlock(mm); + else + mmap_write_unlock(mm); + } +} + +static struct madvise_info madvise_behavior_info(int behavior) +{ + if (behavior >= ARRAY_SIZE(madvise_info) || behavior < 0) { + const struct madvise_info invalid = {0}; + return invalid; + } + + behavior = array_index_nospec(behavior, ARRAY_SIZE(madvise_info)); + return madvise_info[behavior]; +} + +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, + int behavior) +{ + struct madvise_info behavior_info; + struct blk_plug plug; + int ret = -EINVAL; + + behavior_info = madvise_behavior_info(behavior); + + if (!behavior_info.behavior_valid) + return ret; + + ret = madvise_start(mm, behavior_info, &plug); + if (ret != 0) + return ret; + + ret = madvise_one_range(mm, start, len_in, behavior); + + madvise_finish(mm, behavior_info, &plug); + return ret; +} + SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); @@ -1286,6 +1309,8 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct mm_struct *mm; size_t total_len; unsigned int f_flags; + struct madvise_info behavior_info; + struct blk_plug plug; if (flags != 0) { ret = -EINVAL; @@ -1308,7 +1333,9 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto put_pid; } - if (!process_madvise_behavior_valid(&behavior)) { + behavior_info = madvise_behavior_info(behavior); + + if (!behavior_info.process_behavior_valid) { ret = -EINVAL; goto release_task; } @@ -1331,15 +1358,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); + ret = madvise_start(mm, behavior_info, &plug); + if (ret != 0) + goto release_mm; + while (iov_iter_count(&iter)) { iovec = iov_iter_iovec(&iter); - ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + ret = madvise_one_range(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); if (ret < 0) break; iov_iter_advance(&iter, iovec.iov_len); } + madvise_finish(mm, behavior_info, &plug); + if (ret == 0) ret = total_len - iov_iter_count(&iter); -- 2.25.1