> On 20 Nov 2019, at 21:13, Theodore Y. Ts'o <tytso@xxxxxxx> wrote: > > Hi Alex, > > A couple of comments. First, please separate this patch so that these > two separate pieces of functionality can be reviewed and tested > separately: > And this is the second one Thanks, Alex >From d2ff76d76320ade5f53002aa522b6eccfa058d47 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> Date: Thu, 21 Nov 2019 10:00:07 +0300 Subject: [PATCH 2/2] ext4: prefetch bitmaps during block allocation when the cache is cold, reading bitmaps one by one can slow down the process significantly, especially on legacy rotating drives. --- fs/ext4/balloc.c | 12 ++++++++++-- fs/ext4/ext4.h | 4 +++- fs/ext4/mballoc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/mballoc.h | 1 + fs/ext4/sysfs.c | 2 ++ 5 files changed, 65 insertions(+), 4 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b202e00d93f..76547601384b 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -404,7 +404,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, * Return buffer_head on success or NULL in case of failure. 
*/ struct buffer_head * -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + int ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -435,6 +436,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) if (bitmap_uptodate(bh)) goto verify; + if (ignore_locked && buffer_locked(bh)) { + /* buffer under IO already, do not wait + * if called for prefetching */ + err = 0; + goto out; + } + lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); @@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) struct buffer_head *bh; int err; - bh = ext4_read_block_bitmap_nowait(sb, block_group); + bh = ext4_read_block_bitmap_nowait(sb, block_group, 1); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d4e47fdad87c..2320d7e2f8d6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1482,6 +1482,7 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_toscan0; unsigned int s_mb_toscan1; + unsigned int s_mb_prefetch; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -2335,7 +2336,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, - ext4_group_t block_group); + ext4_group_t block_group, + int ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cebd7d8df0b8..eac4ee225527 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] 
= NULL; continue; } - bh[i] = ext4_read_block_bitmap_nowait(sb, group); + bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; @@ -2095,6 +2095,48 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } +/* + * each allocation context (i.e. a thread doing allocation) has own + * sliding prefetch window of @s_mb_prefetch size which starts at the + * very first goal and moves ahead of scaning. + * a side effect is that subsequent allocations will likely find + * the bitmaps in cache or at least in-flight. + */ +static void +ext4_mb_prefetch(struct ext4_allocation_context *ac, + ext4_group_t start) +{ + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp; + ext4_group_t group = start; + struct buffer_head *bh; + int nr; + + /* batch prefetching to get few READs in flight */ + if (group + (sbi->s_mb_prefetch >> 1) < ac->ac_prefetch) + return; + + nr = sbi->s_mb_prefetch; + while (nr > 0) { + if (++group >= ngroups) + group = 0; + if (unlikely(group == start)) + break; + grp = ext4_get_group_info(ac->ac_sb, group); + /* ignore empty groups - those will be skipped + * during the scanning as well */ + if (grp->bb_free == 0) + continue; + nr--; + if (!EXT4_MB_GRP_NEED_INIT(grp)) + continue; + bh = ext4_read_block_bitmap_nowait(ac->ac_sb, group, 1); + brelse(bh); + } + ac->ac_prefetch = group; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -2160,6 +2202,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * cr == 0 try to get exact allocation, * cr == 3 try to get anything */ + + ac->ac_prefetch = ac->ac_g_ex.fe_group; + repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; @@ -2187,6 +2232,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) if (group >= ngroups) group = 0; + 
ext4_mb_prefetch(ac, group); + /* This now checks without needing the buddy page */ ret = ext4_mb_good_group(ac, group, cr); if (ret <= 0) { @@ -2882,6 +2929,7 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) } sbi->s_mb_toscan0 = 1024; sbi->s_mb_toscan1 = 4096; + sbi->s_mb_prefetch = 32; list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 88c98f17e3d9..9ba5c75e6490 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -175,6 +175,7 @@ struct ext4_allocation_context { struct page *ac_buddy_page; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; + ext4_group_t ac_prefetch; }; #define AC_STATUS_CONTINUE 1 diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index c96ee20f5487..4476d828439b 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -200,6 +200,7 @@ EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0); EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1); +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -232,6 +233,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(journal_task), ATTR_LIST(mb_toscan0), ATTR_LIST(mb_toscan1), + ATTR_LIST(mb_prefetch), NULL, }; ATTRIBUTE_GROUPS(ext4); -- 2.20.1