madvise(MADV_POPULATE_READ) tries to populate all page tables in the
specified range, so the resulting IO is usually sequential when the VMA
is backed by a file. Set ra_pages to the device's max request size for
the readahead involved in MADV_POPULATE_READ; this reduces the latency
of madvise(MADV_POPULATE_READ) to 1/10 when running it over one 1GB
file with the usual (default) read_ahead_kb of 128KB.

Cc: David Hildenbrand <david@xxxxxxxxxx>
Cc: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: Christian Brauner <brauner@xxxxxxxxxx>
Cc: Don Dutile <ddutile@xxxxxxxxxx>
Cc: Rafael Aquini <raquini@xxxxxxxxxx>
Cc: Dave Chinner <david@xxxxxxxxxxxxx>
Cc: Mike Snitzer <snitzer@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx>
---
 mm/madvise.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 912155a94ed5..db5452c8abdd 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -900,6 +900,37 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 	return -EINVAL;
 }
 
+static void madvise_restore_ra_win(struct file **file, unsigned int ra_pages)
+{
+	if (*file) {
+		struct file *f = *file;
+
+		f->f_ra.ra_pages = ra_pages;
+		fput(f);
+		*file = NULL;
+	}
+}
+
+static struct file *madvise_override_ra_win(struct file *f,
+		unsigned long start, unsigned long end,
+		unsigned int *old_ra_pages)
+{
+	unsigned int io_pages;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host)
+		return NULL;
+
+	io_pages = inode_to_bdi(f->f_mapping->host)->io_pages;
+	if (((end - start) >> PAGE_SHIFT) < io_pages)
+		return NULL;
+
+	f = get_file(f);
+	*old_ra_pages = f->f_ra.ra_pages;
+	f->f_ra.ra_pages = io_pages;
+
+	return f;
+}
+
 static long madvise_populate(struct vm_area_struct *vma,
 			     struct vm_area_struct **prev,
 			     unsigned long start, unsigned long end,
@@ -908,9 +939,21 @@ static long madvise_populate(struct vm_area_struct *vma,
 	const bool write = behavior == MADV_POPULATE_WRITE;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long tmp_end;
+	unsigned int ra_pages;
+	struct file *file;
 	int locked = 1;
 	long pages;
 
+	/*
+	 * In case of a file-backed mapping, increase the readahead
+	 * window to reduce the overall populate latency, and restore
+	 * it once populating is done.
+	 */
+	if (behavior == MADV_POPULATE_READ)
+		file = madvise_override_ra_win(vma->vm_file, start, end,
+				&ra_pages);
+	else
+		file = NULL;
 	*prev = vma;
 
 	while (start < end) {
@@ -920,8 +963,10 @@ static long madvise_populate(struct vm_area_struct *vma,
 		 */
 		if (!vma || start >= vma->vm_end) {
 			vma = vma_lookup(mm, start);
-			if (!vma)
+			if (!vma) {
+				madvise_restore_ra_win(&file, ra_pages);
 				return -ENOMEM;
+			}
 		}
 
 		tmp_end = min_t(unsigned long, end, vma->vm_end);
@@ -935,6 +980,9 @@ static long madvise_populate(struct vm_area_struct *vma,
 			vma = NULL;
 		}
 		if (pages < 0) {
+			/* restore ra_pages in case of any failure */
+			madvise_restore_ra_win(&file, ra_pages);
+
 			switch (pages) {
 			case -EINTR:
 				return -EINTR;
@@ -954,6 +1002,8 @@ static long madvise_populate(struct vm_area_struct *vma,
 		}
 		start += pages * PAGE_SIZE;
 	}
+
+	madvise_restore_ra_win(&file, ra_pages);
 	return 0;
 }
 
-- 
2.41.0
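
For reference, a minimal userspace sketch (not part of the patch) of the
kind of workload the commit message benchmarks: mapping a large file and
prefaulting it with MADV_POPULATE_READ, which exercises the readahead path
the patch widens. The file path and the 1GB size are illustrative
assumptions, not taken from the benchmark setup.

/*
 * Sketch only: prefault a file-backed mapping with MADV_POPULATE_READ.
 * "/mnt/test/file-1g" is a hypothetical test file.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ 22	/* from the uapi headers (Linux >= 5.14) */
#endif

int main(void)
{
	const char *path = "/mnt/test/file-1g";	/* hypothetical path */
	size_t len = 1UL << 30;				/* 1GB, as in the benchmark */
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	void *addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Populate all page tables in the range up front; for a
	 * file-backed VMA this issues mostly sequential readahead,
	 * whose window the patch raises to the device max request size.
	 */
	if (madvise(addr, len, MADV_POPULATE_READ))
		perror("madvise(MADV_POPULATE_READ)");

	munmap(addr, len);
	close(fd);
	return 0;
}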