Hello Alex,

Thank you for raising this important topic. I am going to test your patch and give feedback, but that will take some time. For now I want to share my thoughts on the subject.

Here are the slides from LAD2019, "Ldiskfs block allocator and aged file system":
https://www.eofs.eu/_media/events/lad19/14_artem_blagodarenko-ldiskfs_block_allocator.pdf

There is also a patch that was sent here earlier, but it still requires improvement:
https://lists.openwall.net/linux-ext4/2019/03/11/5

Best regards,
Artem Blagodarenko

> On 20 Nov 2019, at 13:35, Alex Zhuravlev <azhuravlev@xxxxxxxxxxxxx> wrote:
>
> Hi,
>
> We’ve seen a few reports where a huge, fragmented filesystem spends a lot of time looking for good chunks of free space.
> Two issues have been identified so far:
> 1) mballoc tries too hard to find the best chunk, which is counterproductive - it makes sense to limit this process
> 2) during scanning, the bitmaps are loaded one by one, synchronously - it makes sense to prefetch a few groups at once
> Here is a patch for comments - it has not really been tested at scale, but it would be great to hear your feedback.
>
> Thanks in advance, Alex
>
> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
> index 0b202e00d93f..76547601384b 100644
> --- a/fs/ext4/balloc.c
> +++ b/fs/ext4/balloc.c
> @@ -404,7 +404,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
>   * Return buffer_head on success or NULL in case of failure.
>   */
>  struct buffer_head *
> -ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
> +ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
> +                              int ignore_locked)
>  {
>          struct ext4_group_desc *desc;
>          struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -435,6 +436,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
>          if (bitmap_uptodate(bh))
>                  goto verify;
>
> +        if (ignore_locked && buffer_locked(bh)) {
> +                /* buffer under IO already, do not wait
> +                 * if called for prefetching */
> +                err = 0;
> +                goto out;
> +        }
> +
>          lock_buffer(bh);
>          if (bitmap_uptodate(bh)) {
>                  unlock_buffer(bh);
> @@ -524,7 +532,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
>          struct buffer_head *bh;
>          int err;
>
> -        bh = ext4_read_block_bitmap_nowait(sb, block_group);
> +        bh = ext4_read_block_bitmap_nowait(sb, block_group, 1);
>          if (IS_ERR(bh))
>                  return bh;
>          err = ext4_wait_block_bitmap(sb, block_group, bh);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 03db3e71676c..2320d7e2f8d6 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1480,6 +1480,9 @@ struct ext4_sb_info {
>          /* where last allocation was done - for stream allocation */
>          unsigned long s_mb_last_group;
>          unsigned long s_mb_last_start;
> +        unsigned int s_mb_toscan0;
> +        unsigned int s_mb_toscan1;
> +        unsigned int s_mb_prefetch;
>
>          /* stats for buddy allocator */
>          atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
> @@ -2333,7 +2336,8 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
>  extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
>
>  extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
> -                                                ext4_group_t block_group);
> +                                                ext4_group_t block_group,
> +                                                int ignore_locked);
>  extern int ext4_wait_block_bitmap(struct super_block *sb,
>                                    ext4_group_t block_group,
>                                    struct buffer_head *bh);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index a3e2767bdf2f..eac4ee225527 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -861,7 +861,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
>                          bh[i] = NULL;
>                          continue;
>                  }
> -                bh[i] = ext4_read_block_bitmap_nowait(sb, group);
> +                bh[i] = ext4_read_block_bitmap_nowait(sb, group, 0);
>                  if (IS_ERR(bh[i])) {
>                          err = PTR_ERR(bh[i]);
>                          bh[i] = NULL;
> @@ -2095,10 +2095,52 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
>          return 0;
>  }
>
> +/*
> + * each allocation context (i.e. a thread doing allocation) has its own
> + * sliding prefetch window of @s_mb_prefetch size which starts at the
> + * very first goal and moves ahead of the scanning.
> + * a side effect is that subsequent allocations will likely find
> + * the bitmaps in cache or at least in-flight.
> + */
> +static void
> +ext4_mb_prefetch(struct ext4_allocation_context *ac,
> +                 ext4_group_t start)
> +{
> +        ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
> +        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
> +        struct ext4_group_info *grp;
> +        ext4_group_t group = start;
> +        struct buffer_head *bh;
> +        int nr;
> +
> +        /* batch prefetching to get a few READs in flight */
> +        if (group + (sbi->s_mb_prefetch >> 1) < ac->ac_prefetch)
> +                return;
> +
> +        nr = sbi->s_mb_prefetch;
> +        while (nr > 0) {
> +                if (++group >= ngroups)
> +                        group = 0;
> +                if (unlikely(group == start))
> +                        break;
> +                grp = ext4_get_group_info(ac->ac_sb, group);
> +                /* ignore empty groups - those will be skipped
> +                 * during the scanning as well */
> +                if (grp->bb_free == 0)
> +                        continue;
> +                nr--;
> +                if (!EXT4_MB_GRP_NEED_INIT(grp))
> +                        continue;
> +                bh = ext4_read_block_bitmap_nowait(ac->ac_sb, group, 1);
> +                brelse(bh);
> +        }
> +        ac->ac_prefetch = group;
> +}
> +
>  static noinline_for_stack int
>  ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>  {
> -        ext4_group_t ngroups, group, i;
> +        ext4_group_t ngroups, toscan, group, i;
>          int cr;
>          int err = 0, first_err = 0;
>          struct ext4_sb_info *sbi;
> @@ -2160,6 +2202,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>           * cr == 0 try to get exact allocation,
>           * cr == 3 try to get anything
>           */
> +
> +        ac->ac_prefetch = ac->ac_g_ex.fe_group;
> +
>  repeat:
>          for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
>                  ac->ac_criteria = cr;
> @@ -2169,7 +2214,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>                   */
>                  group = ac->ac_g_ex.fe_group;
>
> -                for (i = 0; i < ngroups; group++, i++) {
> +                /* limit the number of groups to scan during the first two
> +                 * rounds when we hope to find something really good */
> +                toscan = ngroups;
> +                if (cr == 0)
> +                        toscan = sbi->s_mb_toscan0;
> +                else if (cr == 1)
> +                        toscan = sbi->s_mb_toscan1;
> +
> +                for (i = 0; i < toscan; group++, i++) {
>                          int ret = 0;
>                          cond_resched();
>                          /*
> @@ -2179,6 +2232,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
>                          if (group >= ngroups)
>                                  group = 0;
>
> +                        ext4_mb_prefetch(ac, group);
> +
>                          /* This now checks without needing the buddy page */
>                          ret = ext4_mb_good_group(ac, group, cr);
>                          if (ret <= 0) {
> @@ -2872,6 +2927,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
>                          bio_put(discard_bio);
>                  }
>          }
> +        sbi->s_mb_toscan0 = 1024;
> +        sbi->s_mb_toscan1 = 4096;
> +        sbi->s_mb_prefetch = 32;
>
>          list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
>                  ext4_free_data_in_buddy(sb, entry);
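The sliding window took me a couple of reads, so let me restate my
understanding as a userspace sketch (hypothetical, simplified: the
bb_free and EXT4_MB_GRP_NEED_INIT checks are dropped, and wrap-around
of the window end is ignored). With s_mb_prefetch = 32, a new batch of
32 bitmap reads is started whenever the scan comes within half a window
of the previous window end, so consecutive windows overlap by half:

#include <stdio.h>

#define MB_PREFETCH 32U                 /* sbi->s_mb_prefetch */

static unsigned int prefetch_end;       /* ac->ac_prefetch */

static void mb_prefetch(unsigned int group, unsigned int ngroups)
{
        /* the scan is still more than half a window behind the
         * window end - the previous batch is still in flight */
        if (group + (MB_PREFETCH >> 1) < prefetch_end)
                return;

        /* issue a batch of MB_PREFETCH bitmap reads ahead of the scan */
        printf("scan at group %4u: prefetch groups %u..%u\n",
               group, (group + 1) % ngroups, (group + MB_PREFETCH) % ngroups);
        prefetch_end = (group + MB_PREFETCH) % ngroups;
}

int main(void)
{
        unsigned int group;

        prefetch_end = 0;               /* the goal group, as in the patch */
        for (group = 0; group < 64; group++)
                mb_prefetch(group, 256);
        return 0;
}

If that reading is correct, each context re-issues a window every
s_mb_prefetch/2 groups, which keeps a steady number of READs in flight
without flooding the queue. Please correct me if I misread it.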
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 88c98f17e3d9..9ba5c75e6490 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -175,6 +175,7 @@ struct ext4_allocation_context {
>          struct page *ac_buddy_page;
>          struct ext4_prealloc_space *ac_pa;
>          struct ext4_locality_group *ac_lg;
> +        ext4_group_t ac_prefetch;
>  };
>
>  #define AC_STATUS_CONTINUE 1
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index eb1efad0e20a..4476d828439b 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -198,6 +198,9 @@ EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
>  EXT4_ATTR(first_error_time, 0444, first_error_time);
>  EXT4_ATTR(last_error_time, 0444, last_error_time);
>  EXT4_ATTR(journal_task, 0444, journal_task);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan0, s_mb_toscan0);
> +EXT4_RW_ATTR_SBI_UI(mb_toscan1, s_mb_toscan1);
> +EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
>
>  static unsigned int old_bump_val = 128;
>  EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -228,6 +231,9 @@ static struct attribute *ext4_attrs[] = {
>          ATTR_LIST(first_error_time),
>          ATTR_LIST(last_error_time),
>          ATTR_LIST(journal_task),
> +        ATTR_LIST(mb_toscan0),
> +        ATTR_LIST(mb_toscan1),
> +        ATTR_LIST(mb_prefetch),
>          NULL,
>  };
>  ATTRIBUTE_GROUPS(ext4);
>
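One more note for testing: assuming the new attributes appear under
/sys/fs/ext4/<disk>/ next to the existing mballoc knobs (which is what
the sysfs.c hunk suggests), a trivial helper like the sketch below can
drive them while benchmarking an aged filesystem. The helper itself,
the device name and the values are only examples:

/* mb_tune.c - set the proposed mballoc tunables through sysfs.
 * Hypothetical test helper; assumes the patch is applied and the
 * attributes show up under /sys/fs/ext4/<disk>/.
 *
 * Usage: ./mb_tune sda1 1024 4096 32
 */
#include <stdio.h>

static int write_attr(const char *disk, const char *attr, const char *val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/fs/ext4/%s/%s", disk, attr);
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(int argc, char **argv)
{
        if (argc != 5) {
                fprintf(stderr, "usage: %s <disk> <toscan0> <toscan1> <prefetch>\n",
                        argv[0]);
                return 1;
        }
        if (write_attr(argv[1], "mb_toscan0", argv[2]) ||
            write_attr(argv[1], "mb_toscan1", argv[3]) ||
            write_attr(argv[1], "mb_prefetch", argv[4]))
                return 1;
        return 0;
}

Being able to lower mb_toscan0/mb_toscan1 and mb_prefetch at runtime
should also make it easy to separate how much of the improvement comes
from the scan limit versus the prefetching.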