On Jul 19, 2023, at 12:49 PM, Alex Zhuravlev <azhuravlev@xxxxxxxxxxxxx> wrote: > > In some cases when a lot of extents are created initially by sparse file > writes, they get merged over time, but there is no way to merge blocks in > different indexes. > > For example, if a file is written synchronously, all even blocks first, > then odd blocks. The resulting extents tree looks like the following in > "debugfs stat" output, often with only a single block in each index/leaf: > > EXTENTS: > (ETB0):33796 > (ETB1):33795 > (0-677):2588672-2589349 > (ETB1):2590753 > (678):2589350 > (ETB1):2590720 > (679-1357):2589351-2590029 > (ETB1):2590752 > (1358):2590030 > (ETB1):2590721 > (1359-2037):2590031-2590709 > (ETB1):2590751 > (2038):2590710 > (ETB1):2590722 > : > : > > With the patch applied the index and lead blocks are properly merged > (0.6% slower under this random sync write workload, but later read > IOPS are greatly reduced): > > EXTENTS: > (ETB0):33796 > (ETB1):2590736 > (0-2047):2588672-2590719 > (2048-11999):2592768-2602719 > > Originally the problem was hit with a real application operating on > huge datasets and with just 27371 extents > > inode has invalid extent depth: 6 > > problem occurred. With the patch applied the application succeeded > having finally 73637 extents in 3-level tree. > > Signed-off-by: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> Reviewed-by: Andreas Dilger <adilger@xxxxxxxxx> > --- > fs/ext4/extents.c | 185 ++++++++++++++++++++++++++++++++++++++++-- > fs/jbd2/transaction.c | 1 + > 2 files changed, 180 insertions(+), 6 deletions(-) > > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index e4115d338f10..5b0e05cd5595 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, > * This function tries to merge the @ex extent to neighbours in the tree, then > * tries to collapse the extent tree into the inode. > */ > -static void ext4_ext_try_to_merge(handle_t *handle, > +static int ext4_ext_try_to_merge(handle_t *handle, > struct inode *inode, > struct ext4_ext_path *path, > struct ext4_extent *ex) > @@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle, > merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); > > if (!merge_done) > - (void) ext4_ext_try_to_merge_right(inode, path, ex); > + merge_done = ext4_ext_try_to_merge_right(inode, path, ex); > > ext4_ext_try_to_merge_up(handle, inode, path); > + > + return merge_done; > +} > + > +/* > + * This function tries to merge blocks from @path into @npath > + */ > +static int ext4_ext_merge_blocks(handle_t *handle, > + struct inode *inode, > + struct ext4_ext_path *path, > + struct ext4_ext_path *npath) > +{ > + unsigned int depth = ext_depth(inode); > + int used, nused, free, i, k, err; > + ext4_lblk_t next; > + > + if (path[depth].p_hdr == npath[depth].p_hdr) > + return 0; > + > + used = le16_to_cpu(path[depth].p_hdr->eh_entries); > + free = le16_to_cpu(npath[depth].p_hdr->eh_max) - > + le16_to_cpu(npath[depth].p_hdr->eh_entries); > + if (free < used) > + return 0; > + > + err = ext4_ext_get_access(handle, inode, path + depth); > + if (err) > + return err; > + err = ext4_ext_get_access(handle, inode, npath + depth); > + if (err) > + return err; > + > + /* move entries from the current leave to the next one */ > + nused = le16_to_cpu(npath[depth].p_hdr->eh_entries); > + memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used, > + EXT_FIRST_EXTENT(npath[depth].p_hdr), > + nused * sizeof(struct ext4_extent)); > + memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr), > + EXT_FIRST_EXTENT(path[depth].p_hdr), > + used * sizeof(struct ext4_extent)); > + le16_add_cpu(&npath[depth].p_hdr->eh_entries, used); > + le16_add_cpu(&path[depth].p_hdr->eh_entries, -used); > + ext4_ext_try_to_merge_right(inode, npath, > + EXT_FIRST_EXTENT(npath[depth].p_hdr)); > + > + err = ext4_ext_dirty(handle, inode, path + depth); > + if (err) > + return err; > + err = ext4_ext_dirty(handle, inode, npath + depth); > + if (err) > + return err; > + > + /* otherwise the index won't get corrected */ > + npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr); > + err = ext4_ext_correct_indexes(handle, inode, npath); > + if (err) > + return err; > + > + for (i = depth - 1; i >= 0; i--) { > + > + next = ext4_idx_pblock(path[i].p_idx); > + ext4_free_blocks(handle, inode, NULL, next, 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > + err = ext4_ext_get_access(handle, inode, path + i); > + if (err) > + return err; > + le16_add_cpu(&path[i].p_hdr->eh_entries, -1); > + if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) { > + /* whole index block collapsed, go up */ > + continue; > + } > + /* remove index pointer */ > + used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1; > + memmove(path[i].p_idx, path[i].p_idx + 1, > + used * sizeof(struct ext4_extent_idx)); > + > + err = ext4_ext_dirty(handle, inode, path + i); > + if (err) > + return err; > + > + if (path[i].p_hdr == npath[i].p_hdr) > + break; > + > + /* try to move index pointers */ > + used = le16_to_cpu(path[i].p_hdr->eh_entries); > + free = le16_to_cpu(npath[i].p_hdr->eh_max) - > + le16_to_cpu(npath[i].p_hdr->eh_entries); > + if (used > free) > + break; > + err = ext4_ext_get_access(handle, inode, npath + i); > + if (err) > + return err; > + memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used, > + EXT_FIRST_INDEX(npath[i].p_hdr), > + npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx)); > + memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr), > + used * sizeof(struct ext4_extent_idx)); > + le16_add_cpu(&path[i].p_hdr->eh_entries, -used); > + le16_add_cpu(&npath[i].p_hdr->eh_entries, used); > + err = ext4_ext_dirty(handle, inode, path + i); > + if (err) > + return err; > + err = ext4_ext_dirty(handle, inode, npath + i); > + if (err) > + return err; > + > + /* correct index above */ > + for (k = i; k > 0; k--) { > + err = ext4_ext_get_access(handle, inode, npath + k - 1); > + if (err) > + return err; > + npath[k-1].p_idx->ei_block = > + EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block; > + err = ext4_ext_dirty(handle, inode, npath + k - 1); > + if (err) > + return err; > + } > + } > + > + /* > + * TODO: given we've got two paths, it should be possible to > + * collapse those two blocks into the root one in some cases > + */ > + return 1; > +} > + > +static int ext4_ext_try_to_merge_blocks(handle_t *handle, > + struct inode *inode, > + struct ext4_ext_path *path) > +{ > + struct ext4_ext_path *npath = NULL; > + unsigned int depth = ext_depth(inode); > + ext4_lblk_t next; > + int used, rc = 0; > + > + if (depth == 0) > + return 0; > + > + used = le16_to_cpu(path[depth].p_hdr->eh_entries); > + /* don't be too agressive as checking space in > + * the next block is not free */ > + if (used > ext4_ext_space_block(inode, 0) / 4) > + return 0; > + > + /* try to merge to the next block */ > + next = ext4_ext_next_leaf_block(path); > + if (next == EXT_MAX_BLOCKS) > + return 0; > + npath = ext4_find_extent(inode, next, NULL, 0); > + if (IS_ERR(npath)) > + return 0; > + rc = ext4_ext_merge_blocks(handle, inode, path, npath); > + ext4_ext_drop_refs(npath); > + kfree(npath); > + if (rc) > + return rc > 0 ? 0 : rc; > + > + /* try to merge with the previous block */ > + if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0) > + return 0; > + next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1; > + npath = ext4_find_extent(inode, next, NULL, 0); > + if (IS_ERR(npath)) > + return 0; > + rc = ext4_ext_merge_blocks(handle, inode, npath, path); > + ext4_ext_drop_refs(npath); > + kfree(npath); > + return rc > 0 ? 0 : rc; > } > > /* > @@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > int depth, len, err; > ext4_lblk_t next; > int mb_flags = 0, unwritten; > + int merged = 0; > > if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) > mb_flags |= EXT4_MB_DELALLOC_RESERVED; > @@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > merge: > /* try to merge extents */ > if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) > - ext4_ext_try_to_merge(handle, inode, path, nearex); > - > + merged = ext4_ext_try_to_merge(handle, inode, path, nearex); > > /* time to correct all indexes above */ > err = ext4_ext_correct_indexes(handle, inode, path); > @@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > goto cleanup; > > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > + if (!err && merged) > + err = ext4_ext_try_to_merge_blocks(handle, inode, path); > > cleanup: > ext4_free_ext_path(npath); > @@ -3765,7 +3936,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, > /* note: ext4_ext_correct_indexes() isn't needed here because > * borders are not changed > */ > - ext4_ext_try_to_merge(handle, inode, path, ex); > + if (ext4_ext_try_to_merge(handle, inode, path, ex)) > + ext4_ext_try_to_merge_blocks(handle, inode, path); > > /* Mark modified extent as dirty */ > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > @@ -3828,7 +4000,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, > /* note: ext4_ext_correct_indexes() isn't needed here because > * borders are not changed > */ > - ext4_ext_try_to_merge(handle, inode, path, ex); > + if (ext4_ext_try_to_merge(handle, inode, path, ex)) > + ext4_ext_try_to_merge_blocks(handle, inode, path); > > /* Mark modified extent as dirty */ > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c > index 18611241f451..7421f2af9cf2 100644 > --- a/fs/jbd2/transaction.c > +++ b/fs/jbd2/transaction.c > @@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, > } > rsv_handle->h_reserved = 1; > rsv_handle->h_journal = journal; > + rsv_handle->h_revoke_credits = revoke_records; > handle->h_rsv_handle = rsv_handle; > } > handle->h_revoke_credits = revoke_records; > -- > 2.40.1 > Cheers, Andreas
Attachment:
signature.asc
Description: Message signed with OpenPGP