Re: [PATCH v14 12/42] btrfs: calculate allocation offset for conventional zones


On 2021/02/03 15:58, Anand Jain wrote:
> 
> 
> On 2/3/2021 2:10 PM, Damien Le Moal wrote:
>> On 2021/02/03 14:22, Anand Jain wrote:
>>> On 1/26/2021 10:24 AM, Naohiro Aota wrote:
>>>> Conventional zones do not have a write pointer, so we cannot use it to
>>>> determine the allocation offset if a block group contains a conventional
>>>> zone.
>>>>
>>>> Instead, we can use the end of the last allocated extent in the
>>>> block group as the allocation offset.
>>>>
>>>> For a new block group, we cannot calculate the allocation offset by
>>>> consulting the extent tree, because doing so can deadlock by taking an
>>>> extent buffer lock after the chunk mutex (which is already taken in
>>>> btrfs_make_block_group()). Since it is a new block group, we can simply
>>>> set the allocation offset to 0 anyway.
>>>>
>>>
>>> Information about how the WP of conventional zones is used is missing here.
>>
>> Conventional zones do not have valid write pointers because they can be written
>> randomly. This is per ZBC/ZAC specifications. So the wp info is not used, as
>> stated at the beginning of the commit message.
> 
> I was looking for the reason why the "end of the last allocated
> extent in the block group" is still assigned to it.

We wanted to keep sequential allocation even for conventional zones, to have a
coherent allocation policy for all block groups instead of different policies for
different zone types. Hence the "last allocated extent" is used as a replacement
for the non-existent wp of conventional zones. We could revisit this, but I do like
the single allocation policy approach as it somewhat isolates the zone types from
the block group mapping to zones. There is probably room for improvement in
this area though.
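
To make that concrete, here is a minimal stand-alone sketch of the policy
(user-space C; every name and value in it is illustrative, not the kernel
implementation): a sequential zone reports a hardware write pointer, while
for a conventional zone we emulate one as the end of the last allocated
extent, relative to the block group start.

/*
 * Stand-alone sketch, not kernel code: emulate a write pointer for a
 * conventional zone from the last allocated extent. All names and
 * values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

struct extent {
	uint64_t start;   /* logical start of the extent */
	uint64_t length;  /* extent length in bytes */
};

/*
 * Emulated allocation offset: end of the last allocated extent,
 * relative to the block group start. A new/empty block group has
 * no last extent, so its offset is simply 0.
 */
static uint64_t emulated_alloc_offset(uint64_t bg_start,
				      const struct extent *last)
{
	if (!last)
		return 0;
	return last->start + last->length - bg_start;
}

int main(void)
{
	uint64_t bg_start = 1 << 20;  /* block group at 1 MiB */
	/* last extent at bg_start + 64 KiB, 16 KiB long */
	struct extent last = { bg_start + (64 << 10), 16 << 10 };

	printf("alloc_offset = %llu\n",
	       (unsigned long long)emulated_alloc_offset(bg_start, &last));
	return 0;
}

Run, this prints "alloc_offset = 81920", i.e. the 64 KiB offset of the
last extent plus its 16 KiB length, which is exactly where a write
pointer would sit under sequential allocation.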

> 
> Thanks.
> 
>>> Reviewed-by: Anand Jain <anand.jain@xxxxxxxxxx>
>>> Thanks.
>>>
>>>> Signed-off-by: Naohiro Aota <naohiro.aota@xxxxxxx>
>>>> ---
>>>>    fs/btrfs/block-group.c |  4 +-
>>>>    fs/btrfs/zoned.c       | 99 +++++++++++++++++++++++++++++++++++++++---
>>>>    fs/btrfs/zoned.h       |  4 +-
>>>>    3 files changed, 98 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>>>> index 0140fafedb6a..349b2a09bdf1 100644
>>>> --- a/fs/btrfs/block-group.c
>>>> +++ b/fs/btrfs/block-group.c
>>>> @@ -1851,7 +1851,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
>>>>    			goto error;
>>>>    	}
>>>>    
>>>> -	ret = btrfs_load_block_group_zone_info(cache);
>>>> +	ret = btrfs_load_block_group_zone_info(cache, false);
>>>>    	if (ret) {
>>>>    		btrfs_err(info, "zoned: failed to load zone info of bg %llu",
>>>>    			  cache->start);
>>>> @@ -2146,7 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
>>>>    	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
>>>>    		cache->needs_free_space = 1;
>>>>    
>>>> -	ret = btrfs_load_block_group_zone_info(cache);
>>>> +	ret = btrfs_load_block_group_zone_info(cache, true);
>>>>    	if (ret) {
>>>>    		btrfs_put_block_group(cache);
>>>>    		return ret;
>>>> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
>>>> index 22c0665ee816..ca7aef252d33 100644
>>>> --- a/fs/btrfs/zoned.c
>>>> +++ b/fs/btrfs/zoned.c
>>>> @@ -930,7 +930,68 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
>>>>    	return 0;
>>>>    }
>>>>    
>>>> -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>>>> +/*
>>>> + * Calculate an allocation pointer from the extent allocation information
>>>> + * for a block group consisting of conventional zones. It points to the
>>>> + * end of the last allocated extent in the block group, used as the
>>>> + * allocation offset.
>>>> + */
>>>> +static int calculate_alloc_pointer(struct btrfs_block_group *cache,
>>>> +				   u64 *offset_ret)
>>>> +{
>>>> +	struct btrfs_fs_info *fs_info = cache->fs_info;
>>>> +	struct btrfs_root *root = fs_info->extent_root;
>>>> +	struct btrfs_path *path;
>>>> +	struct btrfs_key key;
>>>> +	struct btrfs_key found_key;
>>>> +	int ret;
>>>> +	u64 length;
>>>> +
>>>> +	path = btrfs_alloc_path();
>>>> +	if (!path)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	key.objectid = cache->start + cache->length;
>>>> +	key.type = 0;
>>>> +	key.offset = 0;
>>>> +
>>>> +	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
>>>> +	/* We should not find the exact match */
>>>> +	if (!ret)
>>>> +		ret = -EUCLEAN;
>>>> +	if (ret < 0)
>>>> +		goto out;
>>>> +
>>>> +	ret = btrfs_previous_extent_item(root, path, cache->start);
>>>> +	if (ret) {
>>>> +		if (ret == 1) {
>>>> +			ret = 0;
>>>> +			*offset_ret = 0;
>>>> +		}
>>>> +		goto out;
>>>> +	}
>>>> +
>>>> +	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
>>>> +
>>>> +	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
>>>> +		length = found_key.offset;
>>>> +	else
>>>> +		length = fs_info->nodesize;
>>>> +
>>>> +	if (!(found_key.objectid >= cache->start &&
>>>> +	       found_key.objectid + length <= cache->start + cache->length)) {
>>>> +		ret = -EUCLEAN;
>>>> +		goto out;
>>>> +	}
>>>> +	*offset_ret = found_key.objectid + length - cache->start;
>>>> +	ret = 0;
>>>> +
>>>> +out:
>>>> +	btrfs_free_path(path);
>>>> +	return ret;
>>>> +}
>>>> +
>>>> +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
>>>>    {
>>>>    	struct btrfs_fs_info *fs_info = cache->fs_info;
>>>>    	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
>>>> @@ -944,6 +1005,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>>>>    	int i;
>>>>    	unsigned int nofs_flag;
>>>>    	u64 *alloc_offsets = NULL;
>>>> +	u64 last_alloc = 0;
>>>>    	u32 num_sequential = 0, num_conventional = 0;
>>>>    
>>>>    	if (!btrfs_is_zoned(fs_info))
>>>> @@ -1042,11 +1104,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>>>>    
>>>>    	if (num_conventional > 0) {
>>>>    		/*
>>>> -		 * Since conventional zones do not have a write pointer, we
>>>> -		 * cannot determine alloc_offset from the pointer
>>>> +		 * Avoid calling calculate_alloc_pointer() for a new BG. It
>>>> +		 * is of no use for a new BG, whose offset must always be 0.
>>>> +		 *
>>>> +		 * Also, we have a lock chain of extent buffer lock ->
>>>> +		 * chunk mutex.  For a new BG, this function is called from
>>>> +		 * btrfs_make_block_group() which is already taking the
>>>> +		 * chunk mutex. Thus, to avoid deadlock, we cannot call
>>>> +		 * calculate_alloc_pointer() which takes extent buffer
>>>> +		 * locks.
>>>>    		 */
>>>> -		ret = -EINVAL;
>>>> -		goto out;
>>>> +		if (new) {
>>>> +			cache->alloc_offset = 0;
>>>> +			goto out;
>>>> +		}
>>>> +		ret = calculate_alloc_pointer(cache, &last_alloc);
>>>> +		if (ret || map->num_stripes == num_conventional) {
>>>> +			if (!ret)
>>>> +				cache->alloc_offset = last_alloc;
>>>> +			else
>>>> +				btrfs_err(fs_info,
>>>> +			"zoned: failed to determine allocation offset of bg %llu",
>>>> +					  cache->start);
>>>> +			goto out;
>>>> +		}
>>>>    	}
>>>>    
>>>>    	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
>>>> @@ -1068,6 +1149,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache)
>>>>    	}
>>>>    
>>>>    out:
>>>> +	/* An extent is allocated after the write pointer */
>>>> +	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
>>>> +		btrfs_err(fs_info,
>>>> +			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
>>>> +			  logical, last_alloc, cache->alloc_offset);
>>>> +		ret = -EIO;
>>>> +	}
>>>> +
>>>>    	kfree(alloc_offsets);
>>>>    	free_extent_map(em);
>>>>    
>>>> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
>>>> index 491b98c97f48..b53403ba0b10 100644
>>>> --- a/fs/btrfs/zoned.h
>>>> +++ b/fs/btrfs/zoned.h
>>>> @@ -41,7 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
>>>>    int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
>>>>    			    u64 length, u64 *bytes);
>>>>    int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
>>>> -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache);
>>>> +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
>>>>    #else /* CONFIG_BLK_DEV_ZONED */
>>>>    static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
>>>>    				     struct blk_zone *zone)
>>>> @@ -119,7 +119,7 @@ static inline int btrfs_ensure_empty_zones(struct btrfs_device *device,
>>>>    }
>>>>    
>>>>    static inline int btrfs_load_block_group_zone_info(
>>>> -	struct btrfs_block_group *cache)
>>>> +	struct btrfs_block_group *cache, bool new)
>>>>    {
>>>>    	return 0;
>>>>    }
>>>>
>>>
>>>
>>
>>
> 


-- 
Damien Le Moal
Western Digital Research