To preserve the sequential write pattern on the drives, we must serialize allocation and submit_bio. This commit adds a per-block-group mutex "zone_io_lock", and find_free_extent_seq() holds the lock. The lock is kept even after returning from find_free_extent(). It is released only once submitting the IOs corresponding to the allocation is completed. Implementing such behavior under __extent_writepage_io is almost impossible because once pages are unlocked we cannot tell whether submitting IOs for an allocated region has finished. Instead, this commit adds run_delalloc_hmzoned() to write out non-compressed data IOs at once using extent_write_locked_range(). After the write, we can call btrfs_hmzoned_data_io_unlock_logical() to unlock the block group for new allocation. Signed-off-by: Naohiro Aota <naohiro.aota@xxxxxxx> --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 5 +++++ fs/btrfs/hmzoned.h | 34 +++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3d31a1960c4d..1e924c0d1210 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -619,6 +619,7 @@ struct btrfs_block_group_cache { * zone. 
*/ u64 alloc_offset; + struct mutex zone_io_lock; }; /* delayed seq elem */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bc95a73a762d..5b1a9e607555 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5532,6 +5532,7 @@ static int find_free_extent_seq(struct btrfs_block_group_cache *cache, if (cache->alloc_type != BTRFS_ALLOC_SEQ) return 1; + btrfs_hmzoned_data_io_lock(cache); spin_lock(&space_info->lock); spin_lock(&cache->lock); @@ -5563,6 +5564,9 @@ static int find_free_extent_seq(struct btrfs_block_group_cache *cache, out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); + /* if succeeds, unlock after submit_bio */ + if (ret) + btrfs_hmzoned_data_io_unlock(cache); return ret; } @@ -8104,6 +8108,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + mutex_init(&cache->zone_io_lock); btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); cache->alloc_type = BTRFS_ALLOC_FIT; diff --git a/fs/btrfs/hmzoned.h b/fs/btrfs/hmzoned.h index 3a73c3c5e1da..a8e7286708d4 100644 --- a/fs/btrfs/hmzoned.h +++ b/fs/btrfs/hmzoned.h @@ -39,6 +39,7 @@ int btrfs_hmzoned_check_metadata_space(struct btrfs_fs_info *fs_info); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); +void btrfs_hmzoned_data_io_unlock_at(struct inode *inode, u64 start, u64 len); static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) { @@ -140,4 +141,37 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, !btrfs_dev_is_sequential(device, pos); } + +static inline void btrfs_hmzoned_data_io_lock( + struct btrfs_block_group_cache *cache) +{ + /* No need to lock metadata BGs or non-sequential BGs */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_DATA) || + cache->alloc_type != BTRFS_ALLOC_SEQ) + 
return; + mutex_lock(&cache->zone_io_lock); +} + +static inline void btrfs_hmzoned_data_io_unlock( + struct btrfs_block_group_cache *cache) +{ + if (!(cache->flags & BTRFS_BLOCK_GROUP_DATA) || + cache->alloc_type != BTRFS_ALLOC_SEQ) + return; + mutex_unlock(&cache->zone_io_lock); +} + +static inline void btrfs_hmzoned_data_io_unlock_logical( + struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group_cache *cache; + + if (!btrfs_fs_incompat(fs_info, HMZONED)) + return; + + cache = btrfs_lookup_block_group(fs_info, logical); + btrfs_hmzoned_data_io_unlock(cache); + btrfs_put_block_group(cache); +} + #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee582a36653d..d504200c9767 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -48,6 +48,7 @@ #include "qgroup.h" #include "dedupe.h" #include "delalloc-space.h" +#include "hmzoned.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -1279,6 +1280,39 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, return 0; } +static noinline int run_delalloc_hmzoned(struct inode *inode, + struct page *locked_page, u64 start, + u64 end, int *page_started, + unsigned long *nr_written) +{ + struct extent_map *em; + u64 logical; + int ret; + + ret = cow_file_range(inode, locked_page, start, end, + end, page_started, nr_written, 0, NULL); + if (ret) + return ret; + + if (*page_started) + return 0; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, end - start + 1, + 0); + ASSERT(em != NULL && em->block_start < EXTENT_MAP_LAST_BYTE); + logical = em->block_start; + free_extent_map(em); + + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + extent_write_locked_range(inode, start, end, WB_SYNC_ALL); + *page_started = 1; + + btrfs_hmzoned_data_io_unlock_logical(btrfs_sb(inode->i_sb), logical); + + return 0; +} + static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { @@ -1645,17 +1679,24 @@ int 
btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; int force_cow = need_force_cow(inode, start, end); unsigned int write_flags = wbc_to_write_flags(wbc); + int do_compress = inode_can_compress(inode) && + inode_need_compress(inode, start, end); + int hmzoned = btrfs_fs_incompat(btrfs_sb(inode->i_sb), HMZONED); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ASSERT(!hmzoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ASSERT(!hmzoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!inode_can_compress(inode) || - !inode_need_compress(inode, start, end)) { + } else if (!do_compress && !hmzoned) { ret = cow_file_range(inode, locked_page, start, end, end, page_started, nr_written, 1, NULL); + } else if (!do_compress && hmzoned) { + ret = run_delalloc_hmzoned(inode, locked_page, start, end, + page_started, nr_written); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); -- 2.22.0