On 2020/05/31 22:06, Hannes Reinecke wrote: > On 5/31/20 11:10 AM, Damien Le Moal wrote: >> On Fri, 2020-05-29 at 19:39 +0200, Hannes Reinecke wrote: >>> Remove the hard-coded limit of two devices and support an unlimited >>> number of additional zoned devices. >>> With that we need to increase the device-mapper version number to >>> 3.0.0 as we've modified the interface. >>> >>> Signed-off-by: Hannes Reinecke <hare@xxxxxxx> >>> --- >>> drivers/md/dm-zoned-metadata.c | 15 +++++- >>> drivers/md/dm-zoned-target.c | 106 ++++++++++++++++++++++++----------------- >>> 2 files changed, 75 insertions(+), 46 deletions(-) >>> >>> diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c >>> index 044c152eb756..221163ae5f68 100644 >>> --- a/drivers/md/dm-zoned-metadata.c >>> +++ b/drivers/md/dm-zoned-metadata.c >>> @@ -1523,7 +1523,20 @@ static int dmz_init_zones(struct dmz_metadata *zmd) >>> */ >>> zmd->sb[0].zone = dmz_get(zmd, 0); >>> >>> - zoned_dev = &zmd->dev[1]; >>> + for (i = 1; i < zmd->nr_devs; i++) { >>> + zoned_dev = &zmd->dev[i]; >>> + >>> + ret = blkdev_report_zones(zoned_dev->bdev, 0, >>> + BLK_ALL_ZONES, >>> + dmz_init_zone, zoned_dev); >>> + if (ret < 0) { >>> + DMDEBUG("(%s): Failed to report zones, error %d", >>> + zmd->devname, ret); >>> + dmz_drop_zones(zmd); >>> + return ret; >>> + } >>> + } >>> + return 0; >>> } >>> >>> /* >>> diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c >>> index aa3d26d16441..4a51738d4b0d 100644 >>> --- a/drivers/md/dm-zoned-target.c >>> +++ b/drivers/md/dm-zoned-target.c >>> @@ -13,8 +13,6 @@ >>> >>> #define DMZ_MIN_BIOS 8192 >>> >>> -#define DMZ_MAX_DEVS 2 >>> - >>> /* >>> * Zone BIO context. >>> */ >>> @@ -40,10 +38,10 @@ struct dm_chunk_work { >>> * Target descriptor. 
>>> */ >>> struct dmz_target { >>> - struct dm_dev *ddev[DMZ_MAX_DEVS]; >>> + struct dm_dev **ddev; >>> unsigned int nr_ddevs; >>> >>> - unsigned long flags; >>> + unsigned int flags; >>> >>> /* Zoned block device information */ >>> struct dmz_dev *dev; >>> @@ -764,7 +762,7 @@ static void dmz_put_zoned_device(struct dm_target *ti) >>> struct dmz_target *dmz = ti->private; >>> int i; >>> >>> - for (i = 0; i < DMZ_MAX_DEVS; i++) { >>> + for (i = 0; i < dmz->nr_ddevs; i++) { >>> if (dmz->ddev[i]) { >>> dm_put_device(ti, dmz->ddev[i]); >>> dmz->ddev[i] = NULL; >>> @@ -777,21 +775,35 @@ static int dmz_fixup_devices(struct dm_target *ti) >>> struct dmz_target *dmz = ti->private; >>> struct dmz_dev *reg_dev, *zoned_dev; >>> struct request_queue *q; >>> + sector_t zone_nr_sectors = 0; >>> + int i; >>> >>> /* >>> - * When we have two devices, the first one must be a regular block >>> - * device and the second a zoned block device. >>> + * When we have more than on devices, the first one must be a >>> + * regular block device and the others zoned block devices. 
>>> */ >>> - if (dmz->ddev[0] && dmz->ddev[1]) { >>> + if (dmz->nr_ddevs > 1) { >>> reg_dev = &dmz->dev[0]; >>> if (!(reg_dev->flags & DMZ_BDEV_REGULAR)) { >>> ti->error = "Primary disk is not a regular device"; >>> return -EINVAL; >>> } >>> - zoned_dev = &dmz->dev[1]; >>> - if (zoned_dev->flags & DMZ_BDEV_REGULAR) { >>> - ti->error = "Secondary disk is not a zoned device"; >>> - return -EINVAL; >>> + for (i = 1; i < dmz->nr_ddevs; i++) { >>> + zoned_dev = &dmz->dev[i]; >>> + if (zoned_dev->flags & DMZ_BDEV_REGULAR) { >>> + ti->error = "Secondary disk is not a zoned device"; >>> + return -EINVAL; >>> + } >>> + q = bdev_get_queue(zoned_dev->bdev); >>> + if (zone_nr_sectors && >>> + zone_nr_sectors != blk_queue_zone_sectors(q)) { >>> + ti->error = "Zone nr sectors mismatch"; >>> + return -EINVAL; >>> + } >>> + zone_nr_sectors = blk_queue_zone_sectors(q); >>> + zoned_dev->zone_nr_sectors = zone_nr_sectors; >>> + zoned_dev->nr_zones = >>> + blkdev_nr_zones(zoned_dev->bdev->bd_disk); >>> } >>> } else { >>> reg_dev = NULL; >>> @@ -800,17 +812,24 @@ static int dmz_fixup_devices(struct dm_target *ti) >>> ti->error = "Disk is not a zoned device"; >>> return -EINVAL; >>> } >>> + q = bdev_get_queue(zoned_dev->bdev); >>> + zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); >>> + zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); >>> } >>> - q = bdev_get_queue(zoned_dev->bdev); >>> - zoned_dev->zone_nr_sectors = blk_queue_zone_sectors(q); >>> - zoned_dev->nr_zones = blkdev_nr_zones(zoned_dev->bdev->bd_disk); >>> >>> if (reg_dev) { >>> - reg_dev->zone_nr_sectors = zoned_dev->zone_nr_sectors; >>> + sector_t zone_offset; >>> + >>> + reg_dev->zone_nr_sectors = zone_nr_sectors; >>> reg_dev->nr_zones = >>> DIV_ROUND_UP_SECTOR_T(reg_dev->capacity, >>> reg_dev->zone_nr_sectors); >>> - zoned_dev->zone_offset = reg_dev->nr_zones; >>> + reg_dev->zone_offset = 0; >>> + zone_offset = reg_dev->nr_zones; >>> + for (i = 1; i < dmz->nr_ddevs; i++) { >>> + 
dmz->dev[i].zone_offset = zone_offset; >>> + zone_offset += dmz->dev[i].nr_zones; >>> + } >>> } >>> return 0; >>> } >>> @@ -824,7 +843,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) >>> int ret, i; >>> >>> /* Check arguments */ >>> - if (argc < 1 || argc > 2) { >>> + if (argc < 1) { >>> ti->error = "Invalid argument count"; >>> return -EINVAL; >>> } >>> @@ -835,32 +854,31 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) >>> ti->error = "Unable to allocate the zoned target descriptor"; >>> return -ENOMEM; >>> } >>> - dmz->dev = kcalloc(2, sizeof(struct dmz_dev), GFP_KERNEL); >>> + dmz->dev = kcalloc(argc, sizeof(struct dmz_dev), GFP_KERNEL); >>> if (!dmz->dev) { >>> ti->error = "Unable to allocate the zoned device descriptors"; >>> kfree(dmz); >>> return -ENOMEM; >>> } >>> + dmz->ddev = kcalloc(argc, sizeof(struct dm_dev *), GFP_KERNEL); >>> + if (!dmz->ddev) { >>> + ti->error = "Unable to allocate the dm device descriptors"; >>> + ret = -ENOMEM; >>> + goto err; >>> + } >>> dmz->nr_ddevs = argc; >>> + >>> ti->private = dmz; >>> >>> /* Get the target zoned block device */ >>> - ret = dmz_get_zoned_device(ti, argv[0], 0, argc); >>> - if (ret) >>> - goto err; >>> - >>> - if (argc == 2) { >>> - ret = dmz_get_zoned_device(ti, argv[1], 1, argc); >>> - if (ret) { >>> - dmz_put_zoned_device(ti); >>> - goto err; >>> - } >>> + for (i = 0; i < argc; i++) { >>> + ret = dmz_get_zoned_device(ti, argv[i], i, argc); >>> + if (ret) >>> + goto err_dev; >>> } >>> ret = dmz_fixup_devices(ti); >>> - if (ret) { >>> - dmz_put_zoned_device(ti); >>> - goto err; >>> - } >>> + if (ret) >>> + goto err_dev; >>> >>> /* Initialize metadata */ >>> ret = dmz_ctr_metadata(dmz->dev, argc, &dmz->metadata, >>> @@ -1056,13 +1074,13 @@ static int dmz_iterate_devices(struct dm_target *ti, >>> struct dmz_target *dmz = ti->private; >>> unsigned int zone_nr_sectors = dmz_zone_nr_sectors(dmz->metadata); >>> sector_t capacity; >>> - int r; >>> + int i, 
r; >>> >>> - capacity = dmz->dev[0].capacity & ~(zone_nr_sectors - 1); >>> - r = fn(ti, dmz->ddev[0], 0, capacity, data); >>> - if (!r && dmz->ddev[1]) { >>> - capacity = dmz->dev[1].capacity & ~(zone_nr_sectors - 1); >>> - r = fn(ti, dmz->ddev[1], 0, capacity, data); >>> + for (i = 0; i < dmz->nr_ddevs; i++) { >>> + capacity = dmz->dev[i].capacity & ~(zone_nr_sectors - 1); >>> + r = fn(ti, dmz->ddev[i], 0, capacity, data); >>> + if (r) >>> + break; >>> } >>> return r; >>> } >>> @@ -1083,9 +1101,7 @@ static void dmz_status(struct dm_target *ti, status_type_t type, >>> dmz_nr_zones(dmz->metadata), >>> dmz_nr_unmap_cache_zones(dmz->metadata), >>> dmz_nr_cache_zones(dmz->metadata)); >>> - for (i = 0; i < DMZ_MAX_DEVS; i++) { >>> - if (!dmz->ddev[i]) >>> - continue; >>> + for (i = 0; i < dmz->nr_ddevs; i++) { >>> /* >>> * For a multi-device setup the first device >>> * contains only cache zones. >>> @@ -1104,8 +1120,8 @@ static void dmz_status(struct dm_target *ti, status_type_t type, >>> dev = &dmz->dev[0]; >>> format_dev_t(buf, dev->bdev->bd_dev); >>> DMEMIT("%s", buf); >>> - if (dmz->dev[1].bdev) { >>> - dev = &dmz->dev[1]; >>> + for (i = 1; i < dmz->nr_ddevs; i++) { >>> + dev = &dmz->dev[i]; >>> format_dev_t(buf, dev->bdev->bd_dev); >>> DMEMIT(" %s", buf); >>> } >>> @@ -1133,7 +1149,7 @@ static int dmz_message(struct dm_target *ti, unsigned int argc, char **argv, >>> >>> static struct target_type dmz_type = { >>> .name = "zoned", >>> - .version = {2, 0, 0}, >>> + .version = {3, 0, 0}, >>> .features = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM, >>> .module = THIS_MODULE, >>> .ctr = dmz_ctr, >> >> Looks all good to me, but thinking more about it, don't we need to add >> a device index in the super blocks ? 
The reason is that if the drive >> configuration changes between stop/start (drives removed, added or >> changed slots), the drive names will change and while the userspace >> will still be able to find the group of drives constituting the target >> (using UUID), there is no obvious way to find out what the original >> drive order was. Since the kernel side relies on the drives being passed >> to the ctr function in the order of the mapping, we need to preserve >> that. Or also change the kernel side to use the index in the super >> block to put each drive in its correct dmz->dev[] slot. >> > Already taken care of; here's where the tertiary superblocks come in. > Each superblock carries its own position (in the 'sb_block' field). > This is the _absolute_ position within the entire setup, not the > relative per-device block number. > And it also has the absolute number of blocks in the 'nr_chunks' field. > > Hence we know exactly where this superblock (and, by implication, the > zones following this superblock) should end up. And we know how large > the entire setup will be. So we can insert the superblock at the right > position and then check if we have enough zones for the entire > device. I do not get it though. Where is that checked? At least in this patch, drives are initialized in the order of the ctr arguments, and this loop: + for (i = 1; i < dmz->nr_ddevs; i++) { + dmz->dev[i].zone_offset = zone_offset; + zone_offset += dmz->dev[i].nr_zones; + } in dmz_fixup_devices() sets the zone offset for each device in the same order. So for a given chunk mapped to a zone identified by its ID, if the device order changes, the zone ID will change and the chunk will not be mapped to the correct zone. What am I missing here? > > Not sure if dmzadm does it, though; but it should be easy enough to > implement. > > Cheers, > > Hannes > -- Damien Le Moal Western Digital Research -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel