On 4/13/20 6:55 AM, Damien Le Moal wrote:
On 2020/04/09 15:45, Hannes Reinecke wrote:Implement handling for metadata version 2. The new metadata adds a label and UUID for the device mapper device, and additional UUID for the underlying block devices. It also allows for an additional regular drive to be used for emulating random access zones. The emulated zones will be placed logically in front of the zones from the zoned block device, causing the superblocks and metadata to be stored on that device. The first zone of the original zoned device will be used to hold another, tertiary copy of the metadata; this copy carries a generation number of 0 and is never updated; it's just used for identification. Signed-off-by: Hannes Reinecke <hare@xxxxxxx> --- drivers/md/dm-zoned-metadata.c | 277 +++++++++++++++++++++++++++------ drivers/md/dm-zoned-target.c | 126 ++++++++------- drivers/md/dm-zoned.h | 7 +- 3 files changed, 306 insertions(+), 104 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 554ff32abada..36a71f50165d 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -16,7 +16,7 @@ /* * Metadata version. */ -#define DMZ_META_VER 1 +#define DMZ_META_VER 2/** On-disk super block magic. @@ -69,8 +69,17 @@ struct dmz_super { /* Checksum */ __le32 crc; /* 48 */+ /* DM-Zoned label */+ u8 dmz_label[32]; /* 80 */ + + /* DM-Zoned UUID */ + u8 dmz_uuid[16]; /* 96 */ + + /* Device UUID */ + u8 dev_uuid[16]; /* 112 */ + /* Padding to full 512B sector */ - u8 reserved[464]; /* 512 */ + u8 reserved[400]; /* 512 */ };/*@@ -135,6 +144,8 @@ struct dmz_metadata { struct dmz_dev *dev;char devname[BDEVNAME_SIZE];+ char label[BDEVNAME_SIZE]; + uuid_t uuid;sector_t zone_bitmap_size;unsigned int zone_nr_bitmap_blocks; @@ -161,8 +172,9 @@ struct dmz_metadata { /* Zone information array */ struct dm_zone *zones;- struct dmz_sb sb[2];+ struct dmz_sb sb[3]; unsigned int mblk_primary; + unsigned int sb_version; u64 sb_gen; unsigned int min_nr_mblks; unsigned int max_nr_mblks; @@ -195,31 +207,56 @@ struct dmz_metadata { };#define dmz_zmd_info(zmd, format, args...) \- DMINFO("(%s): " format, (zmd)->devname, ## args) + DMINFO("(%s): " format, (zmd)->label, ## args)#define dmz_zmd_err(zmd, format, args...) \- DMERR("(%s): " format, (zmd)->devname, ## args) + DMERR("(%s): " format, (zmd)->label, ## args)#define dmz_zmd_warn(zmd, format, args...) \- DMWARN("(%s): " format, (zmd)->devname, ## args) + DMWARN("(%s): " format, (zmd)->label, ## args)#define dmz_zmd_debug(zmd, format, args...) \- DMDEBUG("(%s): " format, (zmd)->devname, ## args) + DMDEBUG("(%s): " format, (zmd)->label, ## args) /* * Various accessors */ +unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone) +{ + unsigned int zone_id; + + if (WARN_ON(!zone)) + return 0; + + zone_id = zone->id; + if (zmd->dev[0].zone_offset &&The name zone_offset is maybe a little confusing as one could think it is a sector offset. May be call that zone_id_offset ?
Yeah, right. I'm not utterly happy with the naming myself.
+ (zone_id >= zmd->dev[0].zone_offset)) + zone_id -= zmd->dev[0].zone_offset; + return zone_id; +} + sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone) { - return (sector_t)zone->id << zmd->zone_nr_sectors_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_sectors_shift; }sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone){ - return (sector_t)zone->id << zmd->zone_nr_blocks_shift; + unsigned int zone_id = dmz_dev_zone_id(zmd, zone); + + return (sector_t)zone_id << zmd->zone_nr_blocks_shift; }struct dmz_dev *dmz_zone_to_dev(struct dmz_metadata *zmd, struct dm_zone *zone){ + if (WARN_ON(!zone)) + return &zmd->dev[0];return NULL here so that we get EIO (or other error) from caller ? Otherwise, the bug here could go unnoticed and bad IOs go to dev[0].
Ok.
Sure, we can reverse the order of devices by using the regular disk device as first and the SMR drive as second argument when creating the device-mapper device:+ + if (zmd->dev[0].zone_offset && + zone->id < zmd->dev[0].zone_offset) + return &zmd->dev[1]; + return &zmd->dev[0]; }OK. This one still confuses me. I think we need to have a comment here reminding that when there are 2 devices, the second one holds the low ID zones and the first one (the SMR drive) the high ID zones. While I think it is OK as is (with the comment), I still think we should reverse that as the reverse would be more natural...
0 <size> zoned <regular device> <zoned device>That would make it more natural, and we don't have to rearrange disks within the code.
We'd lose compability with v1, but one could argue it's not a bad thing. [ .. ]
@@ -693,60 +697,47 @@ static int dmz_map(struct dm_target *ti, struct bio *bio) /* * Get zoned device information. */ -static int dmz_get_zoned_device(struct dm_target *ti, char *path) +static int dmz_get_zoned_device(struct dm_target *ti, char *path, + struct dmz_dev *dev, int num) { struct dmz_target *dmz = ti->private; struct request_queue *q; - struct dmz_dev *dev; - sector_t aligned_capacity; int ret;/* Get the target device */- ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev); + ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), + &dmz->ddev[num]); if (ret) { ti->error = "Get target device failed"; - dmz->ddev = NULL; + dmz->ddev[num] = NULL; return ret; }- dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);- if (!dev) { - ret = -ENOMEM; - goto err; - } - - dev->bdev = dmz->ddev->bdev; + dev->bdev = dmz->ddev[num]->bdev; (void)bdevname(dev->bdev, dev->name);- if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {- ti->error = "Not a zoned block device"; - ret = -EINVAL; - goto err; - } + if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) + dev->flags = DMZ_BDEV_REGULAR;q = bdev_get_queue(dev->bdev);dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; - aligned_capacity = dev->capacity & - ~((sector_t)blk_queue_zone_sectors(q) - 1); - if (ti->begin || - ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) { - ti->error = "Partial mapping not supported"; - ret = -EINVAL; - goto err; + if (ti->begin) { + ti->error = "Partial mapping is not supported"; + dm_put_device(ti, dmz->ddev[num]); + dmz->ddev[num] = NULL; + return -EINVAL; }- dev->zone_nr_sectors = blk_queue_zone_sectors(q);- - dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk); - - dmz->dev = dev; - + if (num == 1) { + dev->zone_nr_sectors = dmz->dev[0].zone_nr_sectors; + dev->nr_zones = dev->capacity / dev->zone_nr_sectors; + if (dev->capacity % dev->nr_zones) + dev->nr_zones++;dev->nr_zones = (dev->capacity + dev->zone_nr_sectors - 1) / dev->zone_nr_sectors; or use DIV_ROUND_UP() ?
OK.
Or, better still, check if it's a regular device and drop the magic 'num == 1' comparison.And may be add a comment to remind (again) that znd->dev[1] is the normal disk so we cannot use blk_queue_zone_sectors() and blkdev_nr_zones().
Thanks for the review, will be sending a v4. Cheers, Hannes -- Dr. Hannes Reinecke Teamlead Storage & Networking hare@xxxxxxx +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel