On 2022-07-28 06:33, Damien Le Moal wrote: >> >> The area between target's zone capacity and zone size will be emulated >> in the target. >> The read IOs that fall in the emulated gap area will return 0 filled >> bio and all the other IOs in that area will result in an error. >> If a read IO span across the emulated area boundary, then the IOs are >> split across them. All other IO operations that span across the emulated >> area boundary will result in an error. >> >> The target can be easily created as follows: >> dmsetup create <label> --table '0 <size_sects> po2z /dev/nvme<id>' > > 0 -> <start offset> ? Or are you allowing only entire devices ? > Yeah, partially mapping is not allowed. >> >> Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx> >> Suggested-by: Johannes Thumshirn <johannes.thumshirn@xxxxxxx> >> Suggested-by: Damien Le Moal <damien.lemoal@xxxxxxx> >> Suggested-by: Hannes Reinecke <hare@xxxxxxx> >> --- >> drivers/md/Kconfig | 9 ++ >> drivers/md/Makefile | 2 + >> drivers/md/dm-po2z-target.c | 261 ++++++++++++++++++++++++++++++++++ >> drivers/md/dm-table.c | 13 +- >> drivers/md/dm-zone.c | 1 + >> include/linux/device-mapper.h | 9 ++ >> 6 files changed, 292 insertions(+), 3 deletions(-) >> create mode 100644 drivers/md/dm-po2z-target.c >> >> diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig >> index 998a5cfdbc4e..d58ccfee765b 100644 >> --- a/drivers/md/Kconfig >> +++ b/drivers/md/Kconfig >> @@ -518,6 +518,15 @@ config DM_FLAKEY >> help >> A target that intermittently fails I/O for debugging purposes. >> >> +config DM_PO2Z > > Maybe DM_PO2_ZONE ? > I prefer this. > or DM_ZONE_PO2SIZE ? > > To be clearer... > >> + >> +struct dm_po2z_target { >> + struct dm_dev *dev; >> + sector_t zone_size; /* Actual zone size of the underlying dev*/ >> + sector_t zone_size_po2; /* zone_size rounded to the nearest po2 value */ >> + sector_t zone_size_diff; /* diff between zone_size_po2 and zone_size */ >> + u32 nr_zones; > > s/u32/unsigned int. This is not a hw driver. > Ok. >> +} >> + >> +/* >> + * This target works on the complete zoned device. Partial mapping is not >> + * supported. >> + * Construct a zoned po2 logical device: <dev-path> >> + */ >> +static int dm_po2z_ctr(struct dm_target *ti, unsigned int argc, char **argv) >> +{ >> + struct dm_po2z_target *dmh = NULL; >> + int ret; >> + sector_t zone_size; >> + return -EINVAL; >> + } >> + >> + if (is_power_of_2(zone_size)) { >> + DMERR("%pg: this target is not useful for power-of-2 zoned devices", >> + dmh->dev->bdev); >> + return -EINVAL; > > Same here. > > And as suggested before, we could simply warn here but let the target be > created. If used with a power of 2 zone size device, it should still work. > I will do this in the next revision. I will change DMERR to DMWARN with no return err. >> + } >> + >> + dmh->zone_size = zone_size; >> + dmh->zone_size_po2 = 1 << get_count_order_long(zone_size); >> + dmh->zone_size_diff = dmh->zone_size_po2 - dmh->zone_size; >> + ti->private = dmh; >> + dmh->nr_zones = npo2_zone_no(dmh, ti->len); >> + ti->len = dmh->zone_size_po2 * dmh->nr_zones; >> + >> + return 0; >> +} >> + >> + >> +static int dm_po2z_end_io(struct dm_target *ti, struct bio *bio, >> + blk_status_t *error) >> +{ >> + struct dm_po2z_target *dmh = ti->private; >> + >> + if (bio->bi_status == BLK_STS_OK && bio_op(bio) == REQ_OP_ZONE_APPEND) >> + bio->bi_iter.bi_sector = >> + device_to_target_sect(dmh, bio->bi_iter.bi_sector); >> + >> + return DM_ENDIO_DONE; >> +} >> + >> +static void dm_po2z_io_hints(struct dm_target *ti, struct queue_limits *limits) >> +{ >> + struct dm_po2z_target *dmh = ti->private; >> + >> + limits->chunk_sectors = dmh->zone_size_po2; >> +} >> + >> +static bool bio_across_emulated_zone_area(struct dm_po2z_target *dmh, >> + struct bio *bio) >> +{ >> + u32 zone_idx = po2_zone_no(dmh, bio->bi_iter.bi_sector); >> + sector_t size = bio->bi_iter.bi_size >> SECTOR_SHIFT; > > Rename that to nr_sectors to be clear about the unit. > >> + >> + return (bio->bi_iter.bi_sector - (zone_idx * dmh->zone_size_po2) + >> + size) > dmh->zone_size; > > This is hard to read and actually seems very wrong. Shouldn't this be simply: > > return bio->bi_iter.bi_sector + nr_sectors > > zone_idx * dmh->zone_size_po2 + dmh->zone_size; > > Thatis, check that the BIO goes beyond the zone capacity of the target. > ? > Yeah, this reads better. I will change it. >> +} >> + >> +static int dm_po2z_map_read(struct dm_po2z_target *dmh, struct bio *bio) >> +{ >> + sector_t start_sect, size, relative_sect_in_zone, split_io_pos; >> + u32 zone_idx; >> + >> + /* >> + * Read does not extend into the emulated zone area (area between >> + * zone capacity and zone size) >> + */ >> + if (!bio_across_emulated_zone_area(dmh, bio)) { > > hu... why not use dm_accept_partial_bio() in dm_po2z_map() to never have > to deal with this here ? That is, dm_po2z_map_read() is called for any > read BIO fully within the zone capacity. And create a > dm_po2z_map_read_zero() helper for any read BIO that is after the zone > capacity. Way simpler I think. > >> + bio->bi_iter.bi_sector = >> + target_to_device_sect(dmh, bio->bi_iter.bi_sector); >> + return DM_MAPIO_REMAPPED; >> + } >> + >> + start_sect = bio->bi_iter.bi_sector; >> + size = bio->bi_iter.bi_size >> SECTOR_SHIFT; >> + zone_idx = po2_zone_no(dmh, start_sect); >> + relative_sect_in_zone = start_sect - (zone_idx * dmh->zone_size_po2); >> + >> + /* >> + * If the starting sector is in the emulated area then fill >> + * all the bio with zeros. If bio is across emulated zone boundary, >> + * split the bio across boundaries and fill zeros only for the >> + * bio that is between the zone capacity and the zone size. >> + */ >> + if (relative_sect_in_zone < dmh->zone_size && >> + ((relative_sect_in_zone + size) > dmh->zone_size)) { >> + split_io_pos = (zone_idx * dmh->zone_size_po2) + dmh->zone_size; >> + dm_accept_partial_bio(bio, split_io_pos - start_sect); > > See above. Do that in dm_po2z_map(). This will simplify this function. > >> + bio->bi_iter.bi_sector = target_to_device_sect(dmh, start_sect); >> + >> + return DM_MAPIO_REMAPPED; >> + } else if (relative_sect_in_zone >= dmh->zone_size && >> + ((relative_sect_in_zone + size) > dmh->zone_size_po2)) { > > No need for the else after return. And this can NEVER happen since BIOs > can never cross zone boundaries. > This did not happen and that is why this else if condition was introduced. But I had missed assigning the t->max_io_len in the ctr which led to bio crossing the zone boundary. With t->max_io_len set to zone_nr_sectors, we don't need the else if condition and we can simplify the map_read. Thanks for pointing it out. >> + split_io_pos = (zone_idx + 1) * dmh->zone_size_po2; >> + dm_accept_partial_bio(bio, split_io_pos - start_sect); >> + } >> + >> + zero_fill_bio(bio); >> + bio_endio(bio); >> + return DM_MAPIO_SUBMITTED; >> +} >> + >> +static int dm_po2z_map(struct dm_target *ti, struct bio *bio) >> +{ >> + struct dm_po2z_target *dmh = ti->private; >> + >> + bio_set_dev(bio, dmh->dev->bdev); >> + if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) { >> + /* >> + * Read operation on the emulated zone area (between zone capacity >> + * and zone size) will fill the bio with zeroes. Any other operation >> + * in the emulated area should return an error. >> + */ >> + if (bio_op(bio) == REQ_OP_READ) >> + return dm_po2z_map_read(dmh, bio); >> + >> + if (!bio_across_emulated_zone_area(dmh, bio)) { >> + bio->bi_iter.bi_sector = target_to_device_sect(dmh, >> + bio->bi_iter.bi_sector); >> + return DM_MAPIO_REMAPPED; >> + } >> + return DM_MAPIO_KILL; >> + } >> + return DM_MAPIO_REMAPPED; >> +} >> + >> +static int dm_po2z_iterate_devices(struct dm_target *ti, >> + iterate_devices_callout_fn fn, void *data) >> +{ >> + struct dm_po2z_target *dmh = ti->private; >> + sector_t len = dmh->nr_zones * dmh->zone_size; >> + >> + return fn(ti, dmh->dev, 0, len, data); >> +} >> + >> +static struct target_type dm_po2z_target = { >> + .name = "po2z", >> + .version = { 1, 0, 0 }, >> + .features = DM_TARGET_ZONED_HM | DM_TARGET_EMULATED_ZONES, >> + .map = dm_po2z_map, >> + .end_io = dm_po2z_end_io, >> + .report_zones = dm_po2z_report_zones, >> + .iterate_devices = dm_po2z_iterate_devices, >> + .module = THIS_MODULE, >> + .io_hints = dm_po2z_io_hints, >> + .ctr = dm_po2z_ctr, >> +}; >> + >> +static int __init dm_po2z_init(void) >> +{ >> + int r = dm_register_target(&dm_po2z_target); >> + >> + if (r < 0) >> + DMERR("register failed %d", r); >> + >> + return r; > > Simplify: > > return dm_register_target(&dm_po2z_target); > >> +} >> + >> +static void __exit dm_po2z_exit(void) >> +{ >> + dm_unregister_target(&dm_po2z_target); >> +} >> + >> +/* Module hooks */ >> +module_init(dm_po2z_init); >> +module_exit(dm_po2z_exit); >> + >> +MODULE_DESCRIPTION(DM_NAME "power-of-2 zoned target"); >> +MODULE_AUTHOR("Pankaj Raghav <p.raghav@xxxxxxxxxxx>"); >> +MODULE_LICENSE("GPL"); >> + > > the changes below should be a different prep patch. > Ok. `dm:introduce DM_EMULATED_ZONE target type` before introducing this new target. >> diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c >> index 534fddfc2b42..d77feff124af 100644 >> --- a/drivers/md/dm-table.c >> +++ b/drivers/md/dm-table.c >> @@ -1614,13 +1614,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, >> return true; >> } >> >> -static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, >> +/* >> + * Callback function to check for device zone sector across devices. If the >> + * DM_TARGET_EMULATED_ZONES target feature flag is not set, then the target >> + * should have the same zone sector as the underlying devices. >> + */ >> +static int check_valid_device_zone_sectors(struct dm_target *ti, struct dm_dev *dev, >> sector_t start, sector_t len, void *data) >> { >> unsigned int *zone_sectors = data; >> >> - if (!bdev_is_zoned(dev->bdev)) >> + if (!bdev_is_zoned(dev->bdev) || >> + dm_target_supports_emulated_zones(ti->type)) >> return 0; >> + >> return bdev_zone_sectors(dev->bdev) != *zone_sectors; >> } >> >> @@ -1646,7 +1653,7 @@ static int validate_hardware_zoned_model(struct dm_table *t, >> if (!zone_sectors) >> return -EINVAL; >> >> - if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) { >> + if (dm_table_any_dev_attr(t, check_valid_device_zone_sectors, &zone_sectors)) { >> DMERR("%s: zone sectors is not consistent across all zoned devices", >> dm_device_name(t->md)); >> return -EINVAL; >> diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c >> index 31c16aafdbfc..2b6b3883471f 100644 >> --- a/drivers/md/dm-zone.c >> +++ b/drivers/md/dm-zone.c >> @@ -302,6 +302,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) >> if (dm_table_supports_zone_append(t)) { >> clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); >> dm_cleanup_zoned_dev(md); >> + >> return 0; >> } >> >> diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h >> index 920085dd7f3b..7dbd28b8de01 100644 >> --- a/include/linux/device-mapper.h >> +++ b/include/linux/device-mapper.h >> @@ -294,6 +294,15 @@ struct target_type { >> #define dm_target_supports_mixed_zoned_model(type) (false) >> #endif >> >> +#ifdef CONFIG_BLK_DEV_ZONED >> +#define DM_TARGET_EMULATED_ZONES 0x00000400 >> +#define dm_target_supports_emulated_zones(type) \ >> + ((type)->features & DM_TARGET_EMULATED_ZONES) >> +#else >> +#define DM_TARGET_EMULATED_ZONES 0x00000000 >> +#define dm_target_supports_emulated_zones(type) (false) >> +#endif >> + >> struct dm_target { >> struct dm_table *table; >> struct target_type *type; > >