On Mon, Jan 04, 2021 at 03:39:50PM -0500, Josef Bacik wrote: > Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in > shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing > some infrastructure we have in place to flush inodes that we use for > device replace and snapshot. However this introduced a pretty serious > performance regression. To reproduce the user untarred the source > tarball of Firefox, and would see it take anywhere from 5 to 20 times as > long to untar in 5.10 compared to 5.9. > > The root cause is because before we would generally use the normal > writeback path to reclaim delalloc space, and for this we would provide > it with the number of pages we wanted to flush. The referenced commit > changed this to flush that many inodes, which drastically increased the > amount of space we were flushing in certain cases, which severely > affected performance. > > We cannot revert this patch unfortunately, because Filipe has another > fix that requires the ability to skip flushing inodes that are being Which fix is that? Commit id or at last the subject. > cloned in certain scenarios, which means we need to keep using our > flushing infrastructure or risk re-introducing the deadlock. > > Instead to fix this problem we can go back to providing > btrfs_start_delalloc_roots with a number of pages to flush, and then set > up a writeback_control and utilize sync_inode() to handle the flushing > for us. This gives us the same behavior we had prior to the fix, while > still allowing us to avoid the deadlock that was fixed by Filipe. I > redid the users original test and got the following results > > 5.9 0m54.258s > 5.10 1m26.212s > Patched 0m38.800s Patched is on which base? Also the hardware type seems to be a significant factor. I've been testing that on some old SSD, the time difference was about 5 minutes vs 1, both on 5.11-rc2 and 5.10.5. I'll add my results to the final version of the patch. > We are significantly faster because of the work I did around improving > ENOSPC flushing in 5.10 and 5.11, so reverting to the previous write out > behavior gave us a pretty big boost. The reference to the enospc needs to be less vague if you mention 2 versions. > CC: stable@xxxxxxxxxxxxxxx # 5.10 > Reported-by: René Rebe <rene@xxxxxxxxxxxx> > Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc") > Signed-off-by: Josef Bacik <josef@xxxxxxxxxxxxxx> > --- > v1->v2: > - Explicitly state what the regression was in the commit message. > > fs/btrfs/inode.c | 60 +++++++++++++++++++++++++++++++------------ > fs/btrfs/space-info.c | 4 ++- > 2 files changed, 46 insertions(+), 18 deletions(-) > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 070716650df8..a8e0a6b038d3 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode > * some fairly slow code that needs optimization. This walks the list > * of all the inodes with pending delalloc and forces them to disk. > */ > -static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot, > +static int start_delalloc_inodes(struct btrfs_root *root, > + struct writeback_control *wbc, bool snapshot, > bool in_reclaim_context) > { > struct btrfs_inode *binode; > @@ -9399,6 +9400,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot > struct list_head works; > struct list_head splice; > int ret = 0; > + bool full_flush = wbc->nr_to_write == LONG_MAX; > > INIT_LIST_HEAD(&works); > INIT_LIST_HEAD(&splice); > @@ -9427,18 +9429,24 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot > if (snapshot) > set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, > &binode->runtime_flags); > - work = btrfs_alloc_delalloc_work(inode); > - if (!work) { > - iput(inode); > - ret = -ENOMEM; > - goto out; > - } > - list_add_tail(&work->list, &works); > - btrfs_queue_work(root->fs_info->flush_workers, > - &work->work); > - if (*nr != U64_MAX) { > - (*nr)--; > - if (*nr == 0) > + if (full_flush) { > + work = btrfs_alloc_delalloc_work(inode); > + if (!work) { > + iput(inode); > + ret = -ENOMEM; > + goto out; > + } > + list_add_tail(&work->list, &works); > + btrfs_queue_work(root->fs_info->flush_workers, > + &work->work); > + } else { > + ret = sync_inode(inode, wbc); > + if (!ret && > + test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, > + &BTRFS_I(inode)->runtime_flags)) > + ret = sync_inode(inode, wbc); > + btrfs_add_delayed_iput(inode); > + if (ret || wbc->nr_to_write <= 0) > goto out; > } > cond_resched(); > @@ -9464,18 +9472,29 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot > > int btrfs_start_delalloc_snapshot(struct btrfs_root *root) > { > + struct writeback_control wbc = { > + .nr_to_write = LONG_MAX, > + .sync_mode = WB_SYNC_NONE, > + .range_start = 0, > + .range_end = LLONG_MAX, > + }; > struct btrfs_fs_info *fs_info = root->fs_info; > - u64 nr = U64_MAX; > > if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) > return -EROFS; > > - return start_delalloc_inodes(root, &nr, true, false); > + return start_delalloc_inodes(root, &wbc, true, false); > } > > int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, > bool in_reclaim_context) > { > + struct writeback_control wbc = { > + .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, > + .sync_mode = WB_SYNC_NONE, > + .range_start = 0, > + .range_end = LLONG_MAX, > + }; > struct btrfs_root *root; > struct list_head splice; > int ret; > @@ -9489,6 +9508,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, > spin_lock(&fs_info->delalloc_root_lock); > list_splice_init(&fs_info->delalloc_roots, &splice); > while (!list_empty(&splice) && nr) { > + /* > + * Reset nr_to_write here so we know that we're doing a full > + * flush. > + */ > + if (nr == U64_MAX) > + wbc.nr_to_write = LONG_MAX; > + > root = list_first_entry(&splice, struct btrfs_root, > delalloc_root); > root = btrfs_grab_root(root); > @@ -9497,9 +9523,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, > &fs_info->delalloc_roots); > spin_unlock(&fs_info->delalloc_root_lock); > > - ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context); > + ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); > btrfs_put_root(root); > - if (ret < 0) > + if (ret < 0 || wbc.nr_to_write <= 0) > goto out; > spin_lock(&fs_info->delalloc_root_lock); > } > diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c > index 67e55c5479b8..e8347461c8dd 100644 > --- a/fs/btrfs/space-info.c > +++ b/fs/btrfs/space-info.c > @@ -532,7 +532,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, > > loops = 0; > while ((delalloc_bytes || dio_bytes) && loops < 3) { > - btrfs_start_delalloc_roots(fs_info, items, true); > + u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; > + > + btrfs_start_delalloc_roots(fs_info, nr_pages, true); > > loops++; > if (wait_ordered && !trans) { > -- > 2.26.2