On 2021/02/03 15:58, Anand Jain wrote: > > > On 2/3/2021 2:10 PM, Damien Le Moal wrote: >> On 2021/02/03 14:22, Anand Jain wrote: >>> On 1/26/2021 10:24 AM, Naohiro Aota wrote: >>>> Conventional zones do not have a write pointer, so we cannot use it to >>>> determine the allocation offset if a block group contains a conventional >>>> zone. >>>> >>>> But instead, we can consider the end of the last allocated extent in the >>>> block group as an allocation offset. >>>> >>>> For new block group, we cannot calculate the allocation offset by >>>> consulting the extent tree, because it can cause deadlock by taking extent >>>> buffer lock after chunk mutex (which is already taken in >>>> btrfs_make_block_group()). Since it is a new block group, we can simply set >>>> the allocation offset to 0, anyway. >>>> >>> >>> Information about how are the WP of conventional zones used is missing here. >> >> Conventional zones do not have valid write pointers because they can be written >> randomly. This is per ZBC/ZAC specifications. So the wp info is not used, as >> stated at the beginning of the commit message. > > I was looking for the information why still "end of the last allocated > extent in the block group" is assigned to it? We wanted to keep sequential allocation even for conventional zones, to have a coherent allocation policy for all groups instead of different policies for different zone types. Hence the "last allocated extent" is used as a replacement for the non-existent wp of conventional zones. We could revisit this, but I do like the single allocation policy approach as that isolates, somewhat, zone types from the block group mapping to zones. There is probably room for improvements in this area though. > > Thanks. > >>> Reviewed-by: Anand Jain <anand.jain@xxxxxxxxxx> >>> Thanks. 
>>> >>>> Signed-off-by: Naohiro Aota <naohiro.aota@xxxxxxx> >>>> --- >>>> fs/btrfs/block-group.c | 4 +- >>>> fs/btrfs/zoned.c | 99 +++++++++++++++++++++++++++++++++++++++--- >>>> fs/btrfs/zoned.h | 4 +- >>>> 3 files changed, 98 insertions(+), 9 deletions(-) >>>> >>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c >>>> index 0140fafedb6a..349b2a09bdf1 100644 >>>> --- a/fs/btrfs/block-group.c >>>> +++ b/fs/btrfs/block-group.c >>>> @@ -1851,7 +1851,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, >>>> goto error; >>>> } >>>> >>>> - ret = btrfs_load_block_group_zone_info(cache); >>>> + ret = btrfs_load_block_group_zone_info(cache, false); >>>> if (ret) { >>>> btrfs_err(info, "zoned: failed to load zone info of bg %llu", >>>> cache->start); >>>> @@ -2146,7 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, >>>> if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) >>>> cache->needs_free_space = 1; >>>> >>>> - ret = btrfs_load_block_group_zone_info(cache); >>>> + ret = btrfs_load_block_group_zone_info(cache, true); >>>> if (ret) { >>>> btrfs_put_block_group(cache); >>>> return ret; >>>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c >>>> index 22c0665ee816..ca7aef252d33 100644 >>>> --- a/fs/btrfs/zoned.c >>>> +++ b/fs/btrfs/zoned.c >>>> @@ -930,7 +930,68 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) >>>> return 0; >>>> } >>>> >>>> -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) >>>> +/* >>>> + * Calculate an allocation pointer from the extent allocation information >>>> + * for a block group consist of conventional zones. It is pointed to the >>>> + * end of the last allocated extent in the block group as an allocation >>>> + * offset. 
>>>> + */ >>>> +static int calculate_alloc_pointer(struct btrfs_block_group *cache, >>>> + u64 *offset_ret) >>>> +{ >>>> + struct btrfs_fs_info *fs_info = cache->fs_info; >>>> + struct btrfs_root *root = fs_info->extent_root; >>>> + struct btrfs_path *path; >>>> + struct btrfs_key key; >>>> + struct btrfs_key found_key; >>>> + int ret; >>>> + u64 length; >>>> + >>>> + path = btrfs_alloc_path(); >>>> + if (!path) >>>> + return -ENOMEM; >>>> + >>>> + key.objectid = cache->start + cache->length; >>>> + key.type = 0; >>>> + key.offset = 0; >>>> + >>>> + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); >>>> + /* We should not find the exact match */ >>>> + if (!ret) >>>> + ret = -EUCLEAN; >>>> + if (ret < 0) >>>> + goto out; >>>> + >>>> + ret = btrfs_previous_extent_item(root, path, cache->start); >>>> + if (ret) { >>>> + if (ret == 1) { >>>> + ret = 0; >>>> + *offset_ret = 0; >>>> + } >>>> + goto out; >>>> + } >>>> + >>>> + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); >>>> + >>>> + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) >>>> + length = found_key.offset; >>>> + else >>>> + length = fs_info->nodesize; >>>> + >>>> + if (!(found_key.objectid >= cache->start && >>>> + found_key.objectid + length <= cache->start + cache->length)) { >>>> + ret = -EUCLEAN; >>>> + goto out; >>>> + } >>>> + *offset_ret = found_key.objectid + length - cache->start; >>>> + ret = 0; >>>> + >>>> +out: >>>> + btrfs_free_path(path); >>>> + return ret; >>>> +} >>>> + >>>> +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) >>>> { >>>> struct btrfs_fs_info *fs_info = cache->fs_info; >>>> struct extent_map_tree *em_tree = &fs_info->mapping_tree; >>>> @@ -944,6 +1005,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) >>>> int i; >>>> unsigned int nofs_flag; >>>> u64 *alloc_offsets = NULL; >>>> + u64 last_alloc = 0; >>>> u32 num_sequential = 0, num_conventional = 0; >>>> >>>> if (!btrfs_is_zoned(fs_info)) >>>> @@ 
-1042,11 +1104,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) >>>> >>>> if (num_conventional > 0) { >>>> /* >>>> - * Since conventional zones do not have a write pointer, we >>>> - * cannot determine alloc_offset from the pointer >>>> + * Avoid calling calculate_alloc_pointer() for new BG. It >>>> + * is no use for new BG. It must be always 0. >>>> + * >>>> + * Also, we have a lock chain of extent buffer lock -> >>>> + * chunk mutex. For new BG, this function is called from >>>> + * btrfs_make_block_group() which is already taking the >>>> + * chunk mutex. Thus, we cannot call >>>> + * calculate_alloc_pointer() which takes extent buffer >>>> + * locks to avoid deadlock. >>>> */ >>>> - ret = -EINVAL; >>>> - goto out; >>>> + if (new) { >>>> + cache->alloc_offset = 0; >>>> + goto out; >>>> + } >>>> + ret = calculate_alloc_pointer(cache, &last_alloc); >>>> + if (ret || map->num_stripes == num_conventional) { >>>> + if (!ret) >>>> + cache->alloc_offset = last_alloc; >>>> + else >>>> + btrfs_err(fs_info, >>>> + "zoned: failed to determine allocation offset of bg %llu", >>>> + cache->start); >>>> + goto out; >>>> + } >>>> } >>>> >>>> switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { >>>> @@ -1068,6 +1149,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) >>>> } >>>> >>>> out: >>>> + /* An extent is allocated after the write pointer */ >>>> + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { >>>> + btrfs_err(fs_info, >>>> + "zoned: got wrong write pointer in BG %llu: %llu > %llu", >>>> + logical, last_alloc, cache->alloc_offset); >>>> + ret = -EIO; >>>> + } >>>> + >>>> kfree(alloc_offsets); >>>> free_extent_map(em); >>>> >>>> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h >>>> index 491b98c97f48..b53403ba0b10 100644 >>>> --- a/fs/btrfs/zoned.h >>>> +++ b/fs/btrfs/zoned.h >>>> @@ -41,7 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, >>>> int 
btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, >>>> u64 length, u64 *bytes); >>>> int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); >>>> -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache); >>>> +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); >>>> #else /* CONFIG_BLK_DEV_ZONED */ >>>> static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, >>>> struct blk_zone *zone) >>>> @@ -119,7 +119,7 @@ static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, >>>> } >>>> >>>> static inline int btrfs_load_block_group_zone_info( >>>> - struct btrfs_block_group *cache) >>>> + struct btrfs_block_group *cache, bool new) >>>> { >>>> return 0; >>>> } >>>> >>> >>> >> >> > -- Damien Le Moal Western Digital Research