On Jun 19, 2023, at 07:20, Alex Zhuravlev <azhuravlev@xxxxxxxxxxxxx> wrote: > > Hi, > > Please have a look at the patch attempting to handle the problem with deep extent tree. > There are cases (rather corner, but still) when a lot of extents are created initially, then > they get merged over time, but there is no way to merge blocks. Here is a simple example: > a file is written synchronously, all even blocks first, then odd blocks. Finally you may find > extents tree like this (data from debugs): > EXTENTS: > (ETB0):33796 > (ETB1):33795 > (0-677):2588672-2589349 > (ETB1):2590753 > (678):2589350 > (ETB1):2590720 > (679-1357):2589351-2590029 > (ETB1):2590752 > (1358):2590030 > (ETB1):2590721 > (1359-2037):2590031-2590709 > (ETB1):2590751 > (2038):2590710 > (ETB1):2590722 > (2039-2047):2590711-2590719 > (2048-2717):2592768-2593437 > (ETB1):2590750 > (2718):2593438 > (ETB1):2590723 > (2719-3397):2593439-2594117 > (ETB1):2590749 > (3398):2594118 > (ETB1):2590724 > (3399-4077):2594119-2594797 > (ETB1):2590748 > (4078):2594798 > (ETB1):2590725 > (4079-4757):2594799-2595477 > (ETB1):2590747 > (4758):2595478 > (ETB1):2590726 > … > Notice the most of the leave blocks have just a single extent, which doesn’t look very optimal. > With the patch applied (0.6% slower): > EXTENTS: > (ETB0):33796 > (ETB1):2590736 > (0-2047):2588672-2590719 > (2048-11999):2592768-2602719 > > Originally the problem was hit with a real application operating on huge datasets and with just > 27371 extents "inode has invalid extent depth: 6” problem occurred. > With the patch applied the application succeeded having finally 73637 in 3-level tree. > > Thanks, Alex We hit some issues with this patch under some heavy usage that have been hard to reproduce in testing, so please hold off any action on this patch. Cheers, Andreas > diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c > index 35703dce23a3..acb64e1f8018 100644 > --- a/fs/ext4/extents.c > +++ b/fs/ext4/extents.c > @@ -1885,7 +1885,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, > * This function tries to merge the @ex extent to neighbours in the tree, then > * tries to collapse the extent tree into the inode. > */ > -static void ext4_ext_try_to_merge(handle_t *handle, > +static int ext4_ext_try_to_merge(handle_t *handle, > struct inode *inode, > struct ext4_ext_path *path, > struct ext4_extent *ex) > @@ -1902,9 +1902,178 @@ static void ext4_ext_try_to_merge(handle_t *handle, > merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1); > > if (!merge_done) > - (void) ext4_ext_try_to_merge_right(inode, path, ex); > + merge_done = ext4_ext_try_to_merge_right(inode, path, ex); > > ext4_ext_try_to_merge_up(handle, inode, path); > + > + return merge_done; > +} > + > +/* > + * This function tries to merge blocks from @path into @npath > + */ > +static int ext4_ext_merge_blocks(handle_t *handle, > + struct inode *inode, > + struct ext4_ext_path *path, > + struct ext4_ext_path *npath) > +{ > + unsigned int depth = ext_depth(inode); > + int used, nused, free, i, k, err; > + ext4_lblk_t next; > + > + if (path[depth].p_hdr == npath[depth].p_hdr) > + return 0; > + > + used = le16_to_cpu(path[depth].p_hdr->eh_entries); > + free = le16_to_cpu(npath[depth].p_hdr->eh_max) - > + le16_to_cpu(npath[depth].p_hdr->eh_entries); > + if (free < used) > + return 0; > + > + err = ext4_ext_get_access(handle, inode, path + depth); > + if (err) > + return err; > + err = ext4_ext_get_access(handle, inode, npath + depth); > + if (err) > + return err; > + > + /* move entries from the current leave to the next one */ > + nused = le16_to_cpu(npath[depth].p_hdr->eh_entries); > + memmove(EXT_FIRST_EXTENT(npath[depth].p_hdr) + used, > + EXT_FIRST_EXTENT(npath[depth].p_hdr), > + nused * sizeof(struct ext4_extent)); > + memcpy(EXT_FIRST_EXTENT(npath[depth].p_hdr), > + EXT_FIRST_EXTENT(path[depth].p_hdr), > + used * sizeof(struct ext4_extent)); > + le16_add_cpu(&npath[depth].p_hdr->eh_entries, used); > + le16_add_cpu(&path[depth].p_hdr->eh_entries, -used); > + ext4_ext_try_to_merge_right(inode, npath, > + EXT_FIRST_EXTENT(npath[depth].p_hdr)); > + > + err = ext4_ext_dirty(handle, inode, path + depth); > + if (err) > + return err; > + err = ext4_ext_dirty(handle, inode, npath + depth); > + if (err) > + return err; > + > + /* otherwise the index won't get corrected */ > + npath[depth].p_ext = EXT_FIRST_EXTENT(npath[depth].p_hdr); > + err = ext4_ext_correct_indexes(handle, inode, npath); > + if (err) > + return err; > + > + for (i = depth - 1; i >= 0; i--) { > + > + next = ext4_idx_pblock(path[i].p_idx); > + ext4_free_blocks(handle, inode, NULL, next, 1, > + EXT4_FREE_BLOCKS_METADATA | > + EXT4_FREE_BLOCKS_FORGET); > + err = ext4_ext_get_access(handle, inode, path + i); > + if (err) > + return err; > + le16_add_cpu(&path[i].p_hdr->eh_entries, -1); > + if (le16_to_cpu(path[i].p_hdr->eh_entries) == 0) { > + /* whole index block collapsed, go up */ > + continue; > + } > + /* remove index pointer */ > + used = EXT_LAST_INDEX(path[i].p_hdr) - path[i].p_idx + 1; > + memmove(path[i].p_idx, path[i].p_idx + 1, > + used * sizeof(struct ext4_extent_idx)); > + > + err = ext4_ext_dirty(handle, inode, path + i); > + if (err) > + return err; > + > + if (path[i].p_hdr == npath[i].p_hdr) > + break; > + > + /* try to move index pointers */ > + used = le16_to_cpu(path[i].p_hdr->eh_entries); > + free = le16_to_cpu(npath[i].p_hdr->eh_max) - > + le16_to_cpu(npath[i].p_hdr->eh_entries); > + if (used > free) > + break; > + err = ext4_ext_get_access(handle, inode, npath + i); > + if (err) > + return err; > + memmove(EXT_FIRST_INDEX(npath[i].p_hdr) + used, > + EXT_FIRST_INDEX(npath[i].p_hdr), > + npath[i].p_hdr->eh_entries * sizeof(struct ext4_extent_idx)); > + memcpy(EXT_FIRST_INDEX(npath[i].p_hdr), EXT_FIRST_INDEX(path[i].p_hdr), > + used * sizeof(struct ext4_extent_idx)); > + le16_add_cpu(&path[i].p_hdr->eh_entries, -used); > + le16_add_cpu(&npath[i].p_hdr->eh_entries, used); > + err = ext4_ext_dirty(handle, inode, path + i); > + if (err) > + return err; > + err = ext4_ext_dirty(handle, inode, npath + i); > + if (err) > + return err; > + > + /* correct index above */ > + for (k = i; k > 0; k--) { > + err = ext4_ext_get_access(handle, inode, npath + k - 1); > + if (err) > + return err; > + npath[k-1].p_idx->ei_block = > + EXT_FIRST_INDEX(npath[k].p_hdr)->ei_block; > + err = ext4_ext_dirty(handle, inode, npath + k - 1); > + if (err) > + return err; > + } > + } > + > + /* > + * TODO: given we've got two paths, it should be possible to > + * collapse those two blocks into the root one in some cases > + */ > + return 1; > +} > + > +static int ext4_ext_try_to_merge_blocks(handle_t *handle, > + struct inode *inode, > + struct ext4_ext_path *path) > +{ > + struct ext4_ext_path *npath = NULL; > + unsigned int depth = ext_depth(inode); > + ext4_lblk_t next; > + int used, rc = 0; > + > + if (depth == 0) > + return 0; > + > + used = le16_to_cpu(path[depth].p_hdr->eh_entries); > + /* don't be too agressive as checking space in > + * the next block is not free */ > + if (used > ext4_ext_space_block(inode, 0) / 4) > + return 0; > + > + /* try to merge to the next block */ > + next = ext4_ext_next_leaf_block(path); > + if (next == EXT_MAX_BLOCKS) > + return 0; > + npath = ext4_find_extent(inode, next, NULL, 0); > + if (IS_ERR(npath)) > + return 0; > + rc = ext4_ext_merge_blocks(handle, inode, path, npath); > + ext4_ext_drop_refs(npath); > + kfree(npath); > + if (rc) > + return rc > 0 ? 0 : rc; > + > + /* try to merge with the previous block */ > + if (EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block == 0) > + return 0; > + next = EXT_FIRST_EXTENT(path[depth].p_hdr)->ee_block - 1; > + npath = ext4_find_extent(inode, next, NULL, 0); > + if (IS_ERR(npath)) > + return 0; > + rc = ext4_ext_merge_blocks(handle, inode, npath, path); > + ext4_ext_drop_refs(npath); > + kfree(npath); > + return rc > 0 ? 0 : rc; > } > > /* > @@ -1976,6 +2145,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > int depth, len, err; > ext4_lblk_t next; > int mb_flags = 0, unwritten; > + int merged = 0; > > if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) > mb_flags |= EXT4_MB_DELALLOC_RESERVED; > @@ -2167,8 +2337,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > merge: > /* try to merge extents */ > if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) > - ext4_ext_try_to_merge(handle, inode, path, nearex); > - > + merged = ext4_ext_try_to_merge(handle, inode, path, nearex); > > /* time to correct all indexes above */ > err = ext4_ext_correct_indexes(handle, inode, path); > @@ -2176,6 +2345,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, > goto cleanup; > > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > + if (!err && merged) > + err = ext4_ext_try_to_merge_blocks(handle, inode, path); > > cleanup: > ext4_free_ext_path(npath); > @@ -3766,7 +3937,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, > /* note: ext4_ext_correct_indexes() isn't needed here because > * borders are not changed > */ > - ext4_ext_try_to_merge(handle, inode, path, ex); > + if (ext4_ext_try_to_merge(handle, inode, path, ex)) > + ext4_ext_try_to_merge_blocks(handle, inode, path); > > /* Mark modified extent as dirty */ > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > @@ -3829,7 +4001,8 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, > /* note: ext4_ext_correct_indexes() isn't needed here because > * borders are not changed > */ > - ext4_ext_try_to_merge(handle, inode, path, ex); > + if (ext4_ext_try_to_merge(handle, inode, path, ex)) > + ext4_ext_try_to_merge_blocks(handle, inode, path); > > /* Mark modified extent as dirty */ > err = ext4_ext_dirty(handle, inode, path + path->p_depth); > diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c > index 18611241f451..7421f2af9cf2 100644 > --- a/fs/jbd2/transaction.c > +++ b/fs/jbd2/transaction.c > @@ -513,6 +513,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, > } > rsv_handle->h_reserved = 1; > rsv_handle->h_journal = journal; > + rsv_handle->h_revoke_credits = revoke_records; > handle->h_rsv_handle = rsv_handle; > } > handle->h_revoke_credits = revoke_records; >