On 3/9/22 01:53, Pankaj Raghav wrote:
> power_of_2(PO2) emulation support is added to the null blk device to
> measure performance.
>
> Callbacks are added in the hotpaths that require PO2 emulation specific
> behaviour to reduce the impact on the existing path.
>
> The power_of_2 emulation support is wired up for both the request and
> the bio queue mode and it is automatically enabled when the given zone
> size is non-power_of_2.

This does not make any sense. Why would you want to add power of 2 zone
size emulation to nullblk? Just set the zone size to be a power of 2...
If this is for test purposes, then use QEMU. These changes make no sense
to me here.

A change that would make sense in the context of this series is to allow
setting a zoned null_blk device zone size to a non-power-of-2 value. But
this series does not actually deal with that.

So please do not touch this driver.

>
> Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx>
> ---
>  drivers/block/null_blk/main.c     |   8 +-
>  drivers/block/null_blk/null_blk.h |  12 ++
>  drivers/block/null_blk/zoned.c    | 203 ++++++++++++++++++++++++++----
>  3 files changed, 196 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
> index 625a06bfa5ad..c926b59f2b17 100644
> --- a/drivers/block/null_blk/main.c
> +++ b/drivers/block/null_blk/main.c
> @@ -210,7 +210,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
>
>  static unsigned long g_zone_size = 256;
>  module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
> -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
> +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
>
>  static unsigned long g_zone_capacity;
>  module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
> @@ -1772,11 +1772,13 @@ static const struct block_device_operations null_bio_ops = {
>          .owner          = THIS_MODULE,
>          .submit_bio     = null_submit_bio,
>          .report_zones   = null_report_zones,
> +        .npo2_zone_setup = null_po2_zone_emu_setup,
>  };
>
>  static const struct block_device_operations null_rq_ops = {
>          .owner          = THIS_MODULE,
>          .report_zones   = null_report_zones,
> +        .npo2_zone_setup = null_po2_zone_emu_setup,
>  };
>
>  static int setup_commands(struct nullb_queue *nq)
> @@ -1929,8 +1931,8 @@ static int null_validate_conf(struct nullb_device *dev)
>                  dev->mbps = 0;
>
>          if (dev->zoned &&
> -            (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
> -                pr_err("zone_size must be power-of-two\n");
> +            (!dev->zone_size)) {
> +                pr_err("zone_size must not be zero\n");
>                  return -EINVAL;
>          }
>
> diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
> index 78eb56b0ca55..34c1b7b2546b 100644
> --- a/drivers/block/null_blk/null_blk.h
> +++ b/drivers/block/null_blk/null_blk.h
> @@ -74,6 +74,16 @@ struct nullb_device {
>          unsigned int imp_close_zone_no;
>          struct nullb_zone *zones;
>          sector_t zone_size_sects;
> +        sector_t zone_size_po2_sects;
> +        sector_t zone_size_diff_sects;
> +        /* The callbacks below are used as hooks to perform po2 emulation for a
> +         * zoned device.
> +         */
> +        unsigned int (*zone_no)(struct nullb_device *dev,
> +                                sector_t sect);
> +        sector_t (*zone_update_sector)(struct nullb_device *dev, sector_t sect);
> +        sector_t (*zone_update_sector_append)(struct nullb_device *dev,
> +                                              sector_t sect);
>          bool need_zone_res_mgmt;
>          spinlock_t zone_res_lock;
>
> @@ -137,6 +147,7 @@ int null_register_zoned_dev(struct nullb *nullb);
>  void null_free_zoned_dev(struct nullb_device *dev);
>  int null_report_zones(struct gendisk *disk, sector_t sector,
>                  unsigned int nr_zones, report_zones_cb cb, void *data);
> +void null_po2_zone_emu_setup(struct gendisk *disk);
>  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
>                                      enum req_opf op, sector_t sector,
>                                      sector_t nr_sectors);
> @@ -166,5 +177,6 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb,
>          return len;
>  }
>  #define null_report_zones              NULL
> +#define null_po2_zone_emu_setup        NULL
>  #endif /* CONFIG_BLK_DEV_ZONED */
>  #endif /* __NULL_BLK_H */
> diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
> index dae54dd1aeac..3bb63c170149 100644
> --- a/drivers/block/null_blk/zoned.c
> +++ b/drivers/block/null_blk/zoned.c
> @@ -16,6 +16,44 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
>          return sect >> ilog2(dev->zone_size_sects);
>  }
>
> +static inline unsigned int null_npo2_zone_no(struct nullb_device *dev,
> +                                             sector_t sect)
> +{
> +        return sect / dev->zone_size_sects;
> +}
> +
> +static inline bool null_is_po2_zone_emu(struct nullb_device *dev)
> +{
> +        return !!dev->zone_size_diff_sects;
> +}
> +
> +static inline sector_t null_zone_update_sector_noop(struct nullb_device *dev,
> +                                                    sector_t sect)
> +{
> +        return sect;
> +}
> +
> +static inline sector_t null_zone_update_sector_po2_emu(struct nullb_device *dev,
> +                                                       sector_t sect)
> +{
> +        sector_t zsze_po2 = dev->zone_size_po2_sects;
> +        sector_t zsze_diff = dev->zone_size_diff_sects;
> +        u32 zone_idx = sect >> ilog2(zsze_po2);
> +
> +        sect = sect - (zone_idx * zsze_diff);
> +        return sect;
> +}
> +
> +static inline sector_t
> +null_zone_update_sector_append_po2_emu(struct nullb_device *dev, sector_t sect)
> +{
> +        /* Need to readjust the sector if po2 emulation is used. */
> +        u32 zone_no = dev->zone_no(dev, sect);
> +
> +        sect += dev->zone_size_diff_sects * zone_no;
> +        return sect;
> +}
> +
>  static inline void null_lock_zone_res(struct nullb_device *dev)
>  {
>          if (dev->need_zone_res_mgmt)
> @@ -62,15 +100,14 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
>          sector_t sector = 0;
>          unsigned int i;
>
> -        if (!is_power_of_2(dev->zone_size)) {
> -                pr_err("zone_size must be power-of-two\n");
> -                return -EINVAL;
> -        }
>          if (dev->zone_size > dev->size) {
>                  pr_err("Zone size larger than device capacity\n");
>                  return -EINVAL;
>          }
>
> +        if (!is_power_of_2(dev->zone_size))
> +                pr_info("zone_size is not power-of-two. power-of-two emulation is enabled");
> +
>          if (!dev->zone_capacity)
>                  dev->zone_capacity = dev->zone_size;
>
> @@ -83,8 +120,14 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
>          zone_capacity_sects = mb_to_sects(dev->zone_capacity);
>          dev_capacity_sects = mb_to_sects(dev->size);
>          dev->zone_size_sects = mb_to_sects(dev->zone_size);
> -        dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
> -                        >> ilog2(dev->zone_size_sects);
> +
> +        dev->nr_zones = roundup(dev_capacity_sects, dev->zone_size_sects) /
> +                        dev->zone_size_sects;
> +
> +        dev->zone_no = null_zone_no;
> +        dev->zone_update_sector = null_zone_update_sector_noop;
> +        dev->zone_update_sector_append = null_zone_update_sector_noop;
> +
>
>          dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
>                                      GFP_KERNEL | __GFP_ZERO);
> @@ -166,7 +209,13 @@ int null_register_zoned_dev(struct nullb *nullb)
>                  if (ret)
>                          return ret;
>          } else {
> -                blk_queue_chunk_sectors(q, dev->zone_size_sects);
> +                nullb->disk->fops->npo2_zone_setup(nullb->disk);
> +
> +                if (null_is_po2_zone_emu(dev))
> +                        blk_queue_chunk_sectors(q, dev->zone_size_po2_sects);
> +                else
> +                        blk_queue_chunk_sectors(q, dev->zone_size_sects);
> +
>                  q->nr_zones = blkdev_nr_zones(nullb->disk);
>          }
>
> @@ -183,17 +232,49 @@ void null_free_zoned_dev(struct nullb_device *dev)
>          dev->zones = NULL;
>  }
>
> +static void null_update_zone_info(struct nullb *nullb, struct blk_zone *blkz,
> +                                  struct nullb_zone *zone)
> +{
> +        unsigned int zone_idx;
> +        u64 zone_gap;
> +        struct nullb_device *dev = nullb->dev;
> +
> +        if (null_is_po2_zone_emu(dev)) {
> +                zone_idx = zone->start / zone->len;
> +                zone_gap = zone_idx * dev->zone_size_diff_sects;
> +
> +                blkz->start = zone->start + zone_gap;
> +                blkz->len = dev->zone_size_po2_sects;
> +                blkz->wp = zone->wp + zone_gap;
> +        } else {
> +                blkz->start = zone->start;
> +                blkz->len = zone->len;
> +                blkz->wp = zone->wp;
> +        }
> +
> +        blkz->type = zone->type;
> +        blkz->cond = zone->cond;
> +        blkz->capacity = zone->capacity;
> +}
> +
>  int null_report_zones(struct gendisk *disk, sector_t sector,
>                  unsigned int nr_zones, report_zones_cb cb, void *data)
>  {
>          struct nullb *nullb = disk->private_data;
>          struct nullb_device *dev = nullb->dev;
>          unsigned int first_zone, i;
> +        sector_t zone_size;
>          struct nullb_zone *zone;
>          struct blk_zone blkz;
>          int error;
>
> -        first_zone = null_zone_no(dev, sector);
> +        if (null_is_po2_zone_emu(dev))
> +                zone_size = dev->zone_size_po2_sects;
> +        else
> +                zone_size = dev->zone_size_sects;
> +
> +        first_zone = sector / zone_size;
> +
>          if (first_zone >= dev->nr_zones)
>                  return 0;
>
> @@ -210,12 +291,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
>                   * array.
>                   */
>                  null_lock_zone(dev, zone);
> -                blkz.start = zone->start;
> -                blkz.len = zone->len;
> -                blkz.wp = zone->wp;
> -                blkz.type = zone->type;
> -                blkz.cond = zone->cond;
> -                blkz.capacity = zone->capacity;
> +                null_update_zone_info(nullb, &blkz, zone);
>                  null_unlock_zone(dev, zone);
>
>                  error = cb(&blkz, i, data);
> @@ -226,6 +302,35 @@ int null_report_zones(struct gendisk *disk, sector_t sector,
>          return nr_zones;
>  }
>
> +void null_po2_zone_emu_setup(struct gendisk *disk)
> +{
> +        struct nullb *nullb = disk->private_data;
> +        struct nullb_device *dev = nullb->dev;
> +        sector_t capacity;
> +
> +        if (is_power_of_2(dev->zone_size_sects))
> +                return;
> +
> +        if (!get_capacity(disk))
> +                return;
> +
> +        blk_mq_freeze_queue(disk->queue);
> +
> +        blk_queue_po2_zone_emu(disk->queue, 1);
> +        dev->zone_size_po2_sects =
> +                1 << get_count_order_long(dev->zone_size_sects);
> +        dev->zone_size_diff_sects =
> +                dev->zone_size_po2_sects - dev->zone_size_sects;
> +        dev->zone_no = null_npo2_zone_no;
> +        dev->zone_update_sector = null_zone_update_sector_po2_emu;
> +        dev->zone_update_sector_append = null_zone_update_sector_append_po2_emu;
> +
> +        capacity = dev->nr_zones * dev->zone_size_po2_sects;
> +        set_capacity(disk, capacity);
> +
> +        blk_mq_unfreeze_queue(disk->queue);
> +}
> +
>  /*
>   * This is called in the case of memory backing from null_process_cmd()
>   * with the target zone already locked.
> @@ -234,7 +339,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
>                          sector_t sector, unsigned int len)
>  {
>          struct nullb_device *dev = nullb->dev;
> -        struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
> +        struct nullb_zone *zone = &dev->zones[dev->zone_no(dev, sector)];
>          unsigned int nr_sectors = len >> SECTOR_SHIFT;
>
>          /* Read must be below the write pointer position */
> @@ -363,11 +468,24 @@ static blk_status_t null_check_zone_resources(struct nullb_device *dev,
>          }
>  }
>
> +static void null_update_sector_append(struct nullb_cmd *cmd, sector_t sector)
> +{
> +        struct nullb_device *dev = cmd->nq->dev;
> +
> +        sector = dev->zone_update_sector_append(dev, sector);
> +
> +        if (cmd->bio) {
> +                cmd->bio->bi_iter.bi_sector = sector;
> +        } else {
> +                cmd->rq->__sector = sector;
> +        }
> +}
> +
>  static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>                                      unsigned int nr_sectors, bool append)
>  {
>          struct nullb_device *dev = cmd->nq->dev;
> -        unsigned int zno = null_zone_no(dev, sector);
> +        unsigned int zno = dev->zone_no(dev, sector);
>          struct nullb_zone *zone = &dev->zones[zno];
>          blk_status_t ret;
>
> @@ -395,10 +513,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>           */
>          if (append) {
>                  sector = zone->wp;
> -                if (cmd->bio)
> -                        cmd->bio->bi_iter.bi_sector = sector;
> -                else
> -                        cmd->rq->__sector = sector;
> +                null_update_sector_append(cmd, sector);
>          } else if (sector != zone->wp) {
>                  ret = BLK_STS_IOERR;
>                  goto unlock;
> @@ -619,7 +734,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>                  return BLK_STS_OK;
>          }
>
> -        zone_no = null_zone_no(dev, sector);
> +        zone_no = dev->zone_no(dev, sector);
>          zone = &dev->zones[zone_no];
>
>          null_lock_zone(dev, zone);
> @@ -650,13 +765,54 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>          return ret;
>  }
>
> +static blk_status_t null_handle_po2_zone_emu_violation(struct nullb_cmd *cmd,
> +                                                       enum req_opf op)
> +{
> +        if (op == REQ_OP_READ) {
> +                if (cmd->bio)
> +                        zero_fill_bio(cmd->bio);
> +                else
> +                        zero_fill_bio(cmd->rq->bio);
> +
> +                return BLK_STS_OK;
> +        } else {
> +                return BLK_STS_IOERR;
> +        }
> +}
> +
> +static bool null_verify_sector_violation(struct nullb_device *dev,
> +                                         sector_t sector)
> +{
> +        if (unlikely(null_is_po2_zone_emu(dev))) {
> +                /* The zone idx should be calculated based on the emulated
> +                 * layout
> +                 */
> +                u32 zone_idx = sector >> ilog2(dev->zone_size_po2_sects);
> +                sector_t zsze = dev->zone_size_sects;
> +                sector_t sect = null_zone_update_sector_po2_emu(dev, sector);
> +
> +                if (sect - (zone_idx * zsze) > zsze)
> +                        return true;
> +        }
> +        return false;
> +}
> +
>  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
>                                      sector_t sector, sector_t nr_sectors)
>  {
> -        struct nullb_device *dev;
> +        struct nullb_device *dev = cmd->nq->dev;
>          struct nullb_zone *zone;
>          blk_status_t sts;
>
> +        /* Handle when the sector falls in the emulated area */
> +        if (unlikely(null_verify_sector_violation(dev, sector)))
> +                return null_handle_po2_zone_emu_violation(cmd, op);
> +
> +        /* The sector value is updated if po2 emulation is enabled, else it
> +         * will have no effect on the value
> +         */
> +        sector = dev->zone_update_sector(dev, sector);
> +
>          switch (op) {
>          case REQ_OP_WRITE:
>                  return null_zone_write(cmd, sector, nr_sectors, false);
> @@ -669,8 +825,7 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
>          case REQ_OP_ZONE_FINISH:
>                  return null_zone_mgmt(cmd, op, sector);
>          default:
> -                dev = cmd->nq->dev;
> -                zone = &dev->zones[null_zone_no(dev, sector)];
> +                zone = &dev->zones[dev->zone_no(dev, sector)];
>
>                  null_lock_zone(dev, zone);
>                  sts = null_process_cmd(cmd, op, sector, nr_sectors);

-- 
Damien Le Moal
Western Digital Research
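
For readers who want to see the remapping arithmetic from the quoted patch in
isolation, here is a minimal user-space sketch of the same idea: the zone size
is rounded up to the next power of 2, each emulated zone therefore carries a
trailing gap, and a sector is translated between the emulated and the backing
layout by subtracting or adding zone_idx * gap. The struct and helper names
below are made up for illustration and are not part of the null_blk driver.

/*
 * Standalone illustration of the po2 emulation arithmetic used by the
 * quoted patch. All names here are hypothetical.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct zone_geometry {
        uint64_t zone_size;      /* real (non-po2) zone size, in sectors */
        uint64_t zone_size_po2;  /* zone size rounded up to a power of 2 */
        uint64_t zone_size_diff; /* gap appended to the end of each zone */
};

static void geometry_init(struct zone_geometry *g, uint64_t zone_size)
{
        uint64_t po2 = 1;

        /* Round up to the next power of 2, like get_count_order_long() does. */
        while (po2 < zone_size)
                po2 <<= 1;

        g->zone_size = zone_size;
        g->zone_size_po2 = po2;
        g->zone_size_diff = po2 - zone_size;
}

/* Map a sector on the emulated (po2) layout to the backing (real) layout. */
static uint64_t emulated_to_real(const struct zone_geometry *g, uint64_t sect)
{
        uint64_t zone_idx = sect / g->zone_size_po2;

        return sect - zone_idx * g->zone_size_diff;
}

/* Map a backing sector back to the emulated layout (the zone append case). */
static uint64_t real_to_emulated(const struct zone_geometry *g, uint64_t sect)
{
        uint64_t zone_idx = sect / g->zone_size;

        return sect + zone_idx * g->zone_size_diff;
}

int main(void)
{
        struct zone_geometry g;

        /* e.g. a 96M zone in 512B sectors: 196608 sectors, rounded to 262144 */
        geometry_init(&g, 196608);

        /* Sector 300000 sits in emulated zone 1; its backing sector is
         * 300000 - 1 * 65536 = 234464, and the round trip restores it. */
        uint64_t real = emulated_to_real(&g, 300000);

        assert(real_to_emulated(&g, real) == 300000);
        printf("emulated 300000 -> real %llu\n", (unsigned long long)real);
        return 0;
}

Compiling and running this prints the backing sector for an emulated sector in
zone 1 and asserts that the translation round-trips without loss.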