In HMZONED mode, align device extents to zone boundaries so that a zone
reset affects only its own device extent and does not change the state
of blocks in neighboring device extents. Also check that a region
allocation always covers empty zones and never overlaps the location of
any super block zone.

Signed-off-by: Naohiro Aota <naohiro.aota@xxxxxxx>
---
 common/hmzoned.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
 common/hmzoned.h | 23 +++++++++++++++
 kerncompat.h     |  2 ++
 volumes.c        | 76 +++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 163 insertions(+), 8 deletions(-)
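Note for reviewers: the super block overlap rule enforced by the new
btrfs_check_allocatable_zones() helper can be exercised with a small
standalone program. This is a sketch, not btrfs-progs code:
region_allocatable(), SB_MIRROR_MAX and the fixed sb_zone_number()
layout (mirror i at zone i * 16) are assumptions made for illustration;
only the overlap predicate is taken from the patch. Rules 1 and 2 (zone
emptiness and uniform zone type) are left out since they require real
zone reports.

/*
 * Minimal sketch (not btrfs-progs code): the zone arithmetic behind
 * btrfs_check_allocatable_zones(). sb_zone_number() below is a
 * hypothetical fixed layout placing super block mirror i at zone
 * i * 16; the real layout differs.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SB_MIRROR_MAX	3	/* stand-in for BTRFS_SUPER_MIRROR_MAX */

static uint64_t sb_zone_number(int mirror)
{
	return (uint64_t)mirror * 16;	/* assumption, for illustration */
}

static bool region_allocatable(uint64_t pos, uint64_t num_bytes,
			       uint64_t zone_size, uint64_t nr_zones)
{
	uint64_t begin = pos / zone_size;
	uint64_t end = begin + num_bytes / zone_size;

	/* The region must be zone aligned and must fit on the device. */
	if (pos % zone_size || num_bytes % zone_size || end > nr_zones)
		return false;

	/*
	 * No overlap with any super block zone pair (each mirror
	 * occupies two consecutive zones); same predicate as the patch.
	 */
	for (int i = 0; i < SB_MIRROR_MAX; i++) {
		uint64_t sb = sb_zone_number(i);

		if (!(end < sb || sb + 1 < begin))
			return false;
	}
	return true;
}

int main(void)
{
	uint64_t zsz = 256ULL << 20;	/* 256 MiB zones */

	/* Zone 0 holds super block mirror 0: rejected. */
	printf("%d\n", region_allocatable(0, zsz, zsz, 1024));
	/* Zones 2-3 avoid all mirrors (0, 16, 32): accepted. */
	printf("%d\n", region_allocatable(2 * zsz, 2 * zsz, zsz, 1024));
	return 0;
}

Compiled with -std=c99, it prints 0 for a region touching the primary
super block zone and 1 for one clear of all mirrors.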
diff --git a/common/hmzoned.c b/common/hmzoned.c
index 5080bd7dea5b..2cbf2fc88cb0 100644
--- a/common/hmzoned.c
+++ b/common/hmzoned.c
@@ -24,6 +24,8 @@
 #include "common/messages.h"
 #include "mkfs/common.h"
 #include "common/hmzoned.h"
+#include "volumes.h"
+#include "disk-io.h"
 
 #define BTRFS_REPORT_NR_ZONES 8192
 
@@ -435,6 +437,74 @@ size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
 	return ret_sz;
 }
 
+static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	unsigned int zno;
+
+	if (!zone_is_sequential(zinfo, pos))
+		return true;
+
+	zno = pos / zinfo->zone_size;
+	return zinfo->zones[zno].cond == BLK_ZONE_COND_EMPTY;
+}
+
+/*
+ * btrfs_check_allocatable_zones - check if the specified region is
+ * suitable for allocation
+ * @device:	the device to allocate a region on
+ * @pos:	the position of the region
+ * @num_bytes:	the size of the region
+ *
+ * On a non-ZONED device, anywhere is suitable for allocation. On a ZONED
+ * device, check that
+ * 1) the region is not on non-empty sequential zones,
+ * 2) all zones in the region have the same zone type,
+ * 3) it does not contain a super block location.
+ */
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+				   u64 num_bytes)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+	u64 nzones, begin, end;
+	u64 sb_pos;
+	bool is_sequential;
+	int i;
+
+	if (!zinfo || zinfo->model == ZONED_NONE)
+		return true;
+
+	nzones = num_bytes / zinfo->zone_size;
+	begin = pos / zinfo->zone_size;
+	end = begin + nzones;
+
+	ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
+	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
+
+	if (end > zinfo->nr_zones)
+		return false;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		sb_pos = sb_zone_number(zinfo->zone_size, i);
+		if (!(end < sb_pos || sb_pos + 1 < begin))
+			return false;
+	}
+
+	is_sequential = btrfs_dev_is_sequential(device, pos);
+
+	while (num_bytes) {
+		if (is_sequential && !btrfs_dev_is_empty_zone(device, pos))
+			return false;
+		if (is_sequential != btrfs_dev_is_sequential(device, pos))
+			return false;
+
+		pos += zinfo->zone_size;
+		num_bytes -= zinfo->zone_size;
+	}
+
+	return true;
+}
+
 #endif
 
 int btrfs_get_zone_info(int fd, const char *file, bool hmzoned,
diff --git a/common/hmzoned.h b/common/hmzoned.h
index 920f992dbb93..3444e2c1b0f5 100644
--- a/common/hmzoned.h
+++ b/common/hmzoned.h
@@ -19,6 +19,7 @@
 #define __BTRFS_HMZONED_H__
 
 #include <stdbool.h>
+#include "volumes.h"
 
 #ifdef BTRFS_ZONED
 #include <linux/blkzoned.h>
@@ -67,6 +68,8 @@ static inline size_t sbwrite(int fd, void *buf, off_t offset)
 	return btrfs_sb_io(fd, buf, offset, WRITE);
 }
 int btrfs_wipe_sb_zones(int fd, struct btrfs_zoned_device_info *zinfo);
+bool btrfs_check_allocatable_zones(struct btrfs_device *device, u64 pos,
+				   u64 num_bytes);
 #else
 static inline bool zone_is_sequential(struct btrfs_zoned_device_info *zinfo,
 				      u64 bytenr)
@@ -97,6 +100,26 @@ static inline int btrfs_wipe_sb_zones(int fd,
 {
 	return 0;
 }
+static inline bool btrfs_check_allocatable_zones(struct btrfs_device *device,
+						 u64 pos, u64 num_bytes)
+{
+	return true;
+}
+
 #endif /* BTRFS_ZONED */
 
+static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+{
+	return zone_is_sequential(device->zone_info, pos);
+}
+static inline u64 btrfs_zone_align(struct btrfs_device *device, u64 pos)
+{
+	struct btrfs_zoned_device_info *zinfo = device->zone_info;
+
+	if (!zinfo || zinfo->model == ZONED_NONE)
+		return pos;
+
+	return ALIGN(pos, zinfo->zone_size);
+}
+
 #endif /* __BTRFS_HMZONED_H__ */
diff --git a/kerncompat.h b/kerncompat.h
index c38643437747..58cdcf921c5e 100644
--- a/kerncompat.h
+++ b/kerncompat.h
@@ -28,6 +28,7 @@
 #include <assert.h>
 #include <stddef.h>
 #include <linux/types.h>
+#include <linux/kernel.h>
 #include <stdint.h>
 #include <features.h>
 
@@ -354,6 +355,7 @@ static inline void assert_trace(const char *assertion, const char *filename,
 
 /* Alignment check */
 #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
+#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
 
 static inline int is_power_of_2(unsigned long n)
 {
diff --git a/volumes.c b/volumes.c
index d92052e19330..148169d5b2a2 100644
--- a/volumes.c
+++ b/volumes.c
@@ -496,6 +496,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 	int slot;
 	struct extent_buffer *l;
 	u64 min_search_start;
+	u64 zone_size = 0;
 
 	/*
 	 * We don't want to overwrite the superblock on the drive nor any area
@@ -504,6 +505,14 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 	 */
 	min_search_start = max(root->fs_info->alloc_start, (u64)SZ_1M);
 	search_start = max(search_start, min_search_start);
+	/*
+	 * For a zoned block device, skip the first zone of the device
+	 * entirely.
+	 */
+	if (device->zone_info)
+		zone_size = device->zone_info->zone_size;
+	search_start = max_t(u64, search_start, zone_size);
+	search_start = btrfs_zone_align(device, search_start);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -512,6 +521,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 	max_hole_start = search_start;
 	max_hole_size = 0;
 
+again:
 	if (search_start >= search_end) {
 		ret = -ENOSPC;
 		goto out;
@@ -556,6 +566,13 @@ static int find_free_dev_extent_start(struct btrfs_device *device,
 			goto next;
 
 		if (key.offset > search_start) {
+			if (!btrfs_check_allocatable_zones(device, search_start,
+							   num_bytes)) {
+				search_start += zone_size;
+				btrfs_release_path(path);
+				goto again;
+			}
+
 			hole_size = key.offset - search_start;
 
 			/*
@@ -598,6 +615,13 @@ next:
 	 * search_end may be smaller than search_start.
 	 */
 	if (search_end > search_start) {
+		if (!btrfs_check_allocatable_zones(device, search_start,
+						   num_bytes)) {
+			search_start += zone_size;
+			btrfs_release_path(path);
+			goto again;
+		}
+
 		hole_size = search_end - search_start;
 
 		if (hole_size > max_hole_size) {
@@ -613,6 +637,7 @@ next:
 
 	ret = 0;
 out:
+	ASSERT(zone_size == 0 || IS_ALIGNED(max_hole_start, zone_size));
 	btrfs_free_path(path);
 	*start = max_hole_start;
 	if (len)
@@ -641,6 +666,11 @@ int btrfs_insert_dev_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
+	/* Check alignment to zone for a zoned block device */
+	ASSERT(!device->zone_info ||
+	       device->zone_info->model != ZONED_HOST_MANAGED ||
+	       IS_ALIGNED(start, device->zone_info->zone_size));
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1045,17 +1075,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int max_stripes = 0;
 	int min_stripes = 1;
 	int sub_stripes = 1;
-	int dev_stripes __attribute__((unused));
-			/* stripes per dev */
+	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
-	int devs_min __attribute__((unused));
-			/* min devs needed */
+	int devs_min;		/* min devs needed */
 	int devs_increment __attribute__((unused));
 			/* ndevs has to be a multiple of this */
-	int ncopies __attribute__((unused));
-			/* how many copies to data has */
-	int nparity __attribute__((unused));
-			/* number of stripes worth of bytes to
+	int ncopies;		/* how many copies the data has */
+	int nparity;		/* number of stripes worth of bytes to
 				   store parity information */
 	int looped = 0;
 	int ret;
@@ -1063,6 +1089,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = BTRFS_STRIPE_LEN;
 	struct btrfs_key key;
 	u64 offset;
+	bool hmzoned = info->fs_devices->hmzoned;
+	u64 zone_size = info->fs_devices->zone_size;
 
 	if (list_empty(dev_list)) {
 		return -ENOSPC;
 	}
@@ -1163,13 +1191,40 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			btrfs_super_stripesize(info->super_copy));
 	}
 
+	if (hmzoned) {
+		calc_size = zone_size;
+		max_chunk_size = max(max_chunk_size, zone_size);
+		max_chunk_size = round_down(max_chunk_size, zone_size);
+	}
+
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
 
+	if (hmzoned) {
+		int min_num_stripes = devs_min * dev_stripes;
+		int min_data_stripes = (min_num_stripes - nparity) / ncopies;
+		u64 min_chunk_size = min_data_stripes * zone_size;
+
+		max_chunk_size = max(round_down(max_chunk_size,
+						zone_size),
+				     min_chunk_size);
+	}
+
again:
 	if (chunk_bytes_by_type(type, calc_size, num_stripes, sub_stripes) >
 	    max_chunk_size) {
+		if (hmzoned) {
+			/*
+			 * calc_size is fixed in HMZONED. Reduce
+			 * num_stripes instead.
+			 */
+			num_stripes = max_chunk_size * ncopies / calc_size;
+			if (num_stripes < min_stripes)
+				return -ENOSPC;
+			goto again;
+		}
+
 		calc_size = max_chunk_size;
 		calc_size /= num_stripes;
 		calc_size /= stripe_len;
@@ -1180,6 +1235,9 @@ again:
 	calc_size /= stripe_len;
 	calc_size *= stripe_len;
+
+	ASSERT(!hmzoned || calc_size == zone_size);
+
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
@@ -1261,6 +1319,8 @@ again:
 	if (ret < 0)
 		goto out_chunk_map;
 
+	ASSERT(!zone_size || IS_ALIGNED(dev_offset, zone_size));
+
 	device->bytes_used += calc_size;
 	ret = btrfs_update_device(trans, device);
 	if (ret < 0)
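Note on the chunk sizing above: with HMZONED, calc_size stays pinned to
the zone size, so when chunk_bytes_by_type() exceeds max_chunk_size the
code drops stripes instead of shrinking the per-device stripe. Below is
a hypothetical sketch of that sizing loop; fit_num_stripes() and the
single-copy, no-parity parameters are assumptions for illustration, not
btrfs-progs code.

/*
 * Sketch (not btrfs-progs code): HMZONED chunk sizing. calc_size is
 * fixed at zone_size, so an oversized chunk is shrunk by dropping
 * stripes, mirroring the "goto again" path in btrfs_alloc_chunk().
 */
#include <stdint.h>
#include <stdio.h>

static int fit_num_stripes(uint64_t zone_size, uint64_t max_chunk_size,
			   int num_stripes, int min_stripes, int ncopies)
{
	/* Chunk size scales with num_stripes once calc_size is fixed. */
	while ((uint64_t)num_stripes * zone_size / ncopies > max_chunk_size) {
		num_stripes = max_chunk_size * ncopies / zone_size;
		if (num_stripes < min_stripes)
			return -1;	/* -ENOSPC in the real code */
	}
	return num_stripes;
}

int main(void)
{
	uint64_t zsz = 256ULL << 20;	/* 256 MiB zones */

	/* Four one-zone stripes exceed a 512 MiB cap: reduced to two. */
	printf("%d\n", fit_num_stripes(zsz, 512ULL << 20, 4, 1, 1));
	/* A 128 MiB cap cannot hold even one zone: -1 (no space). */
	printf("%d\n", fit_num_stripes(zsz, 128ULL << 20, 4, 1, 1));
	return 0;
}

With 256 MiB zones and a 512 MiB cap, four one-zone stripes are reduced
to two; a cap smaller than one zone cannot fit even min_stripes, which
corresponds to the -ENOSPC return in the patch.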
-- 
2.24.0