From: Nadav Amit <namit@xxxxxxxxxx> process_madvise(MADV_COLD) and future variants lose the opportunity to perform TLB batching across IO vectors. As a result, a TLB flush (and potentially a shootdown) is performed for each IO vector, instead of one for the whole operation. Batch TLB operations across IO-vectors. Open code zap_page_range() to do so. A future patch will eliminate redundancies between madvise_free_single_vma() and madvise_dontneed_single_vma(). Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Minchan Kim <minchan@xxxxxxxxxx> Cc: Colin Cross <ccross@xxxxxxxxxx> Cc: Suren Baghdasarya <surenb@xxxxxxxxxx> Cc: Mike Rapoport <rppt@xxxxxxxxxxxxxxxxxx> Signed-off-by: Nadav Amit <namit@xxxxxxxxxx> --- mm/internal.h | 5 +++ mm/madvise.c | 104 +++++++++++++++++++++++++++++--------------------- mm/memory.c | 2 +- 3 files changed, 67 insertions(+), 44 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 18256e32a14c..2313a6739ce7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -59,6 +59,11 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); +void unmap_single_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, + struct zap_details *details); + void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_size); void force_page_cache_ra(struct readahead_control *, unsigned long nr); diff --git a/mm/madvise.c b/mm/madvise.c index 84b86ae85671..e679cfa94655 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -51,6 +51,7 @@ struct madvise_info { * to only take it for reading. */ u8 need_mmap_read_only: 1; + u8 tlb_batching: 1; }; static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = { @@ -81,20 +82,24 @@ static const struct madvise_info madvise_info[MADV_SOFT_OFFLINE+1] = { [MADV_DONTNEED] = { .behavior_valid = 1, .need_mmap_read_only = 1, + .tlb_batching = 1, }, [MADV_FREE] = { .behavior_valid = 1, .need_mmap_read_only = 1, + .tlb_batching = 1, }, [MADV_COLD] = { .behavior_valid = 1, .process_behavior_valid = 1, .need_mmap_read_only = 1, + .tlb_batching = 1, }, [MADV_PAGEOUT] = { .behavior_valid = 1, .process_behavior_valid = 1, .need_mmap_read_only = 1, + .tlb_batching = 1, }, #ifdef CONFIG_KSM [MADV_MERGEABLE] = { @@ -578,21 +583,16 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static long madvise_cold(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start_addr, unsigned long end_addr) +static long madvise_cold(struct mmu_gather *tlb, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start_addr, + unsigned long end_addr) { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather tlb; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_cold_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb); + madvise_cold_page_range(tlb, vma, start_addr, end_addr); return 0; } @@ -628,13 +628,10 @@ static inline bool can_do_pageout(struct vm_area_struct *vma) file_permission(vma->vm_file, MAY_WRITE) == 0; } -static long madvise_pageout(struct vm_area_struct *vma, +static long madvise_pageout(struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) { - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather tlb; - *prev = vma; if (!can_madv_lru_vma(vma)) return -EINVAL; @@ -643,9 +640,7 @@ static long madvise_pageout(struct vm_area_struct *vma, return 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm); - madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb); + madvise_pageout_page_range(tlb, vma, start_addr, end_addr); return 0; } @@ -787,12 +782,12 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .pmd_entry = madvise_free_pte_range, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -802,16 +797,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, start_addr, end_addr); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); return 0; } @@ -820,7 +813,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for shrink_active_list to actually free + * unmap_single_vma call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. @@ -835,14 +828,28 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, +static long madvise_dontneed_single_vma(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range(vma, start, end - start); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, + end); + + lru_add_drain(); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(&range); + unmap_single_vma(tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(&range); + return 0; } -static long madvise_dontneed_free(struct vm_area_struct *vma, +static long madvise_dontneed_free(struct mmu_gather *tlb, + struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long *pend, int behavior) @@ -895,9 +902,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, } if (behavior == MADV_DONTNEED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma(tlb, vma, start, end); else /* behavior == MADV_FREE */ - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(tlb, vma, start, end); } static long madvise_populate(struct vm_area_struct *vma, @@ -1055,8 +1062,9 @@ static int madvise_inject_error(int behavior, #endif static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long *pend, int behavior) +madvise_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long *pend, int behavior) { unsigned long end = *pend; @@ -1066,12 +1074,13 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_COLD: - return madvise_cold(vma, prev, start, end); + return madvise_cold(tlb, vma, prev, start, end); case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); + return madvise_pageout(tlb, vma, prev, start, end); case MADV_FREE: case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, pend, behavior); + return madvise_dontneed_free(tlb, vma, prev, start, pend, + behavior); case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: return madvise_populate(vma, prev, start, end, behavior); @@ -1151,7 +1160,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EAGAIN - a kernel resource was temporarily unavailable. */ int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in, - int behavior) + int behavior, struct mmu_gather *tlb) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1211,7 +1220,7 @@ int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in, tmp = end; /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, &tmp, behavior); + error = madvise_vma(tlb, vma, &prev, start, &tmp, behavior); if (error) goto out; start = tmp; @@ -1225,6 +1234,7 @@ int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in, else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } + out: return error; @@ -1232,7 +1242,7 @@ int madvise_one_range(struct mm_struct *mm, unsigned long start, size_t len_in, static int madvise_start(struct mm_struct *mm, struct madvise_info behavior_info, - struct blk_plug *plug) + struct mmu_gather *tlb, struct blk_plug *plug) { if (!behavior_info.no_mmap_lock) { if (behavior_info.need_mmap_read_only) @@ -1241,16 +1251,22 @@ madvise_start(struct mm_struct *mm, struct madvise_info behavior_info, return -EINTR; } + if (behavior_info.tlb_batching) + tlb_gather_mmu(tlb, mm); + blk_start_plug(plug); return 0; } static void madvise_finish(struct mm_struct *mm, struct madvise_info behavior_info, - struct blk_plug *plug) + struct mmu_gather *tlb, struct blk_plug *plug) { blk_finish_plug(plug); + if (behavior_info.tlb_batching) + tlb_finish_mmu(tlb); + if (!behavior_info.no_mmap_lock) { if (behavior_info.need_mmap_read_only) mmap_read_unlock(mm); @@ -1274,6 +1290,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { struct madvise_info behavior_info; + struct mmu_gather tlb; struct blk_plug plug; int ret = -EINVAL; @@ -1282,13 +1299,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, if (!behavior_info.behavior_valid) return ret; - ret = madvise_start(mm, behavior_info, &plug); + ret = madvise_start(mm, behavior_info, &tlb, &plug); if (ret != 0) return ret; - ret = madvise_one_range(mm, start, len_in, behavior); + ret = madvise_one_range(mm, start, len_in, behavior, &tlb); - madvise_finish(mm, behavior_info, &plug); + madvise_finish(mm, behavior_info, &tlb, &plug); return ret; } @@ -1310,6 +1327,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t total_len; unsigned int f_flags; struct madvise_info behavior_info; + struct mmu_gather tlb; struct blk_plug plug; if (flags != 0) { @@ -1358,20 +1376,20 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); - ret = madvise_start(mm, behavior_info, &plug); + ret = madvise_start(mm, behavior_info, &tlb, &plug); if (ret != 0) goto release_mm; while (iov_iter_count(&iter)) { iovec = iov_iter_iovec(&iter); ret = madvise_one_range(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + iovec.iov_len, behavior, &tlb); if (ret < 0) break; iov_iter_advance(&iter, iovec.iov_len); } - madvise_finish(mm, behavior_info, &plug); + madvise_finish(mm, behavior_info, &tlb, &plug); if (ret == 0) ret = total_len - iov_iter_count(&iter); diff --git a/mm/memory.c b/mm/memory.c index 12a7b2094434..a00fb705a77f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1541,7 +1541,7 @@ void unmap_page_range(struct mmu_gather *tlb, } -static void unmap_single_vma(struct mmu_gather *tlb, +void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) -- 2.25.1