Hi, Please, consider for inclusion the patch attempting to handle the problem with deep extent tree. Thanks, Alex >From 0dd791c266e2d00c3217d0f1b836abdbfd4f146f Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> Date: Wed, 19 Jul 2023 21:22:20 +0300 Subject: [PATCH 1/2] In some cases when a lot of extents are created initially by sparse file writes, they get merged over time, but there is no way to merge blocks in different indexes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For example, if a file is written synchronously, all even blocks first, then odd blocks. The resulting extents tree looks like the following in "debugfs stat" output, often with only a single block in each index/leaf: EXTENTS: (ETB0):33796 (ETB1):33795 (0-677):2588672-2589349 (ETB1):2590753 (678):2589350 (ETB1):2590720 (679-1357):2589351-2590029 (ETB1):2590752 (1358):2590030 (ETB1):2590721 (1359-2037):2590031-2590709 (ETB1):2590751 (2038):2590710 (ETB1):2590722 : : With the patch applied the index and lead blocks are properly merged (0.6% slower under this random sync write workload, but later read IOPS are greatly reduced): EXTENTS: (ETB0):33796 (ETB1):2590736 (0-2047):2588672-2590719 (2048-11999):2592768-2602719 Originally the problem was hit with a real application operating on huge datasets and with just 27371 extents "inode has invalid extent depth: 6” problem occurred. With the patch applied the application succeeded having finally 73637 in 3-level tree. Signed-off-by: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> --- fs/ext4/extents.c | 185 ++++++++++++++++++++++++++++++++++++++++-- fs/jbd2/transaction.c | 1 + 2 files changed, 180 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e57054bdc5fd..a2de6b863df1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, * This function tries to merge the @ex extent to neighbours in the tree, then * tries to collapse the extent tree into the inode. */ -static void ext4_ext_try_to_merge(handle_t *handle, +static int ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *ex) @@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle, merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); if (!merge_done) - (void) ext4_ext_try_to_merge_right(inode, path, ex); + merge_done = ext4_ext_try_to_merge_right(inode, path, ex); ext4_ext_try_to_merge_up(handle, inode, path); + + return merge_done; +} + +/* + * This function tries to merge blocks from @path into @npath + */ +static int ext4_ext_merge_blocks(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path, + struct ext4_ext_path *npath) +{ + unsigned int depth = ext_depth(inode); + int used, nused, free, i, k, err; + ext4_fsblk_t next; + + if (path[depth].p_hdr == npath[depth].p_hdr) + return 0; + + used = le16_to_cpu(path[depth].p_hdr->eh_entries); + free = le16_to_cpu(npath[depth].p_hdr->eh_max) - + le16_to_cpu(npath[depth].p_hdr->eh_entries); + if (free < used) + return 0; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + err = ext4_ext_get_access(handle, inode, npath + depth); + if (err) + return err; + + /* move entries from the current leave to the next one */ + nused = le16_to_cpu(npath[depth].p_hdr->eh_entries); + memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used, + EXT_FIRST_EXTENT(npath[depth].p_hdr), + nused * sizeof(struct ext4_extent)); + memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr), + EXT_FIRST_EXTENT(path[depth].p_hdr), + used * sizeof(struct ext4_extent)); + le16_add_cpu(&npath[depth].p_hdr->eh_entries, used); + le16_add_cpu(&path[depth].p_hdr->eh_entries, -used); + ext4_ext_try_to_merge_right(inode, npath, + EXT_FIRST_EXTENT(npath[depth].p_hdr)); + + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + return err; + err = ext4_ext_dirty(handle, inode, npath + depth); + if (err) + return err; + + /* otherwise the index won't get corrected */ + npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr); + err = ext4_ext_correct_indexes(handle, inode, npath); + if (err) + return err; + + for (i = depth - 1; i >= 0; i--) { + + next = ext4_idx_pblock(path[i].p_idx); + ext4_free_blocks(handle, inode, NULL, next, 1, + EXT4_FREE_BLOCKS_METADATA | + EXT4_FREE_BLOCKS_FORGET); + err = ext4_ext_get_access(handle, inode, path + i); + if (err) + return err; + le16_add_cpu(&path[i].p_hdr->eh_entries, -1); + if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) { + /* whole index block collapsed, go up */ + continue; + } + /* remove index pointer */ + used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1; + memmove(path[i].p_idx, path[i].p_idx + 1, + used * sizeof(struct ext4_extent_idx)); + + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + return err; + + if (path[i].p_hdr == npath[i].p_hdr) + break; + + /* try to move index pointers */ + used = le16_to_cpu(path[i].p_hdr->eh_entries); + free = le16_to_cpu(npath[i].p_hdr->eh_max) - + le16_to_cpu(npath[i].p_hdr->eh_entries); + if (used > free) + break; + err = ext4_ext_get_access(handle, inode, npath + i); + if (err) + return err; + memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used, + EXT_FIRST_INDEX(npath[i].p_hdr), + npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx)); + memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr), + used * sizeof(struct ext4_extent_idx)); + le16_add_cpu(&path[i].p_hdr->eh_entries, -used); + le16_add_cpu(&npath[i].p_hdr->eh_entries, used); + err = ext4_ext_dirty(handle, inode, path + i); + if (err) + return err; + err = ext4_ext_dirty(handle, inode, npath + i); + if (err) + return err; + + /* correct index above */ + for (k = i; k > 0; k--) { + err = ext4_ext_get_access(handle, inode, npath + k - 1); + if (err) + return err; + npath[k-1].p_idx->ei_block = + EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block; + err = ext4_ext_dirty(handle, inode, npath + k - 1); + if (err) + return err; + } + } + + /* + * TODO: given we've got two paths, it should be possible to + * collapse those two blocks into the root one in some cases + */ + return 1; +} + +static int ext4_ext_try_to_merge_blocks(handle_t *handle, + struct inode *inode, + struct ext4_ext_path *path) +{ + struct ext4_ext_path *npath = NULL; + unsigned int depth = ext_depth(inode); + ext4_lblk_t next; + int used, rc = 0; + + if (depth == 0) + return 0; + + used = le16_to_cpu(path[depth].p_hdr->eh_entries); + /* don't be too agressive as checking space in + * the next block is not free */ + if (used > ext4_ext_space_block(inode, 0) / 4) + return 0; + + /* try to merge to the next block */ + next = ext4_ext_next_leaf_block(path); + if (next == EXT_MAX_BLOCKS) + return 0; + npath = ext4_find_extent(inode, next, NULL, 0); + if (IS_ERR(npath)) + return 0; + rc = ext4_ext_merge_blocks(handle, inode, path, npath); + ext4_ext_drop_refs(npath); + kfree(npath); + if (rc) + return rc > 0 ? 0 : rc; + + /* try to merge with the previous block */ + if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0) + return 0; + next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1; + npath = ext4_find_extent(inode, next, NULL, 0); + if (IS_ERR(npath)) + return 0; + rc = ext4_ext_merge_blocks(handle, inode, npath, path); + ext4_ext_drop_refs(npath); + kfree(npath); + return rc > 0 ? 0 : rc; } /* @@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, int depth, len, err; ext4_lblk_t next; int mb_flags = 0, unwritten; + int merged = 0; if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) mb_flags |= EXT4_MB_DELALLOC_RESERVED; @@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, merge: /* try to merge extents */ if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) - ext4_ext_try_to_merge(handle, inode, path, nearex); - + merged = ext4_ext_try_to_merge(handle, inode, path, nearex); /* time to correct all indexes above */ err = ext4_ext_correct_indexes(handle, inode, path); @@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, goto cleanup; err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (!err && merged) + err = ext4_ext_try_to_merge_blocks(handle, inode, path); cleanup: ext4_free_ext_path(npath); @@ -3741,7 +3912,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(handle, inode, path, ex); + if (ext4_ext_try_to_merge(handle, inode, path, ex)) + ext4_ext_try_to_merge_blocks(handle, inode, path); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); @@ -3804,7 +3976,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, /* note: ext4_ext_correct_indexes() isn't needed here because * borders are not changed */ - ext4_ext_try_to_merge(handle, inode, path, ex); + if (ext4_ext_try_to_merge(handle, inode, path, ex)) + ext4_ext_try_to_merge_blocks(handle, inode, path); /* Mark modified extent as dirty */ err = ext4_ext_dirty(handle, inode, path + path->p_depth); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cb0b8d6fc0c6..4cd738fa408e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, } rsv_handle->h_reserved = 1; rsv_handle->h_journal = journal; + rsv_handle->h_revoke_credits = revoke_records; handle->h_rsv_handle = rsv_handle; } handle->h_revoke_credits = revoke_records; -- 2.44.0