Since commit 6d2be915e589 ("mm/readahead.c: fix readahead failure for memoryless NUMA nodes and limit readahead max_pages"), ADV_WILLNEED only tries to read ahead 512 pages, and the remaining part of the advised range falls back to normal readahead. If bdi->ra_pages is set to a small value, readahead will not be efficient enough. Increasing the readahead size may not be an option, since the workload may mix random and sequential I/O. Improve this situation by maintaining a maple tree of willneed ranges: if a read falls within any willneed range, read ahead aggressively, just as we did before commit 6d2be915e589. Cc: Mike Snitzer <snitzer@xxxxxxxxxx> Cc: Don Dutile <ddutile@xxxxxxxxxx> Cc: Raghavendra K T <raghavendra.kt@xxxxxxxxxxxxxxxxxx> Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxx> --- fs/file_table.c | 10 ++++++++++ include/linux/fs.h | 15 +++++++++++++++ mm/filemap.c | 5 ++++- mm/internal.h | 7 ++++++- mm/readahead.c | 32 +++++++++++++++++++++++++++++++- 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index b991f90571b4..bb0303683305 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -61,8 +61,18 @@ struct path *backing_file_user_path(struct file *f) } EXPORT_SYMBOL_GPL(backing_file_user_path); +static inline void file_ra_free(struct file_ra_state *ra) +{ + if (ra->need_mt) { + mtree_destroy(ra->need_mt); + kfree(ra->need_mt); + ra->need_mt = NULL; + } +} + static inline void file_free(struct file *f) { + file_ra_free(&f->f_ra); security_file_free(f); if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..bdbd16990072 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> +#include <linux/maple_tree.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -961,6 +962,7 @@ struct fown_struct { * @ra_pages: Maximum size of a
readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. + * @need_mt: maple tree for tracking WILL_NEED ranges * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. @@ -972,6 +974,7 @@ struct file_ra_state { unsigned int ra_pages; unsigned int mmap_miss; loff_t prev_pos; + struct maple_tree *need_mt; }; /* @@ -983,6 +986,18 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } +/* + * Check if @index falls in the madvise/fadvise WILLNEED window. + */ +static inline bool ra_index_in_need_range(struct file_ra_state *ra, + pgoff_t index) +{ + if (ra->need_mt) + return mtree_load(ra->need_mt, index) != NULL; + + return false; +} + /* * f_{lock,count,pos_lock} members can be highly contended and share * the same cacheline. f_{lock,mode} are very frequently used together diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db..0ffe63d58421 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3147,7 +3147,10 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); - ra->size = ra->ra_pages; + if (ra_index_in_need_range(ra, vmf->pgoff)) + ra->size = inode_to_bdi(mapping->host)->io_pages; + else + ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; page_cache_ra_order(&ractl, ra, 0); diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..17bd970ff23c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -120,13 +120,18 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); +void file_ra_add_need_range(struct file_ra_state *ra, pgoff_t start, + pgoff_t end); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int 
order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { - DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); + struct file_ra_state *ra = &file->f_ra; + DEFINE_READAHEAD(ractl, file, ra, mapping, index); + + file_ra_add_need_range(ra, index, index + nr_to_read); force_page_cache_ra(&ractl, nr_to_read); } diff --git a/mm/readahead.c b/mm/readahead.c index 23620c57c122..0882ceecf9ff 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -140,9 +140,38 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; + ra->need_mt = NULL; } EXPORT_SYMBOL_GPL(file_ra_state_init); +static void file_ra_setup_need_mt(struct file_ra_state *ra) +{ + struct maple_tree *mt = kzalloc(sizeof(*mt), GFP_KERNEL); + + if (!mt) + return; + + mt_init(mt); + if (cmpxchg(&ra->need_mt, NULL, mt) != NULL) + kfree(mt); +} + +/* Maintain one willneed range hint for speedup readahead */ +void file_ra_add_need_range(struct file_ra_state *ra, pgoff_t start, + pgoff_t end) +{ + /* ignore small willneed range */ + if (end - start < 4 * ra->ra_pages) + return; + + if (!ra->need_mt) + file_ra_setup_need_mt(ra); + + if (ra->need_mt) + mtree_insert_range(ra->need_mt, start, end, (void *)1, + GFP_KERNEL); +} + static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; @@ -552,9 +581,10 @@ static void ondemand_readahead(struct readahead_control *ractl, { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); struct file_ra_state *ra = ractl->ra; - unsigned long max_pages = ra->ra_pages; unsigned long add_pages; pgoff_t index = readahead_index(ractl); + unsigned long max_pages = ra_index_in_need_range(ra, index) ? 
+ bdi->io_pages : ra->ra_pages; pgoff_t expected, prev_index; unsigned int order = folio ? folio_order(folio) : 0; -- 2.41.0