On 2020/08/26 7:52, Damien Le Moal wrote:
> On 2020/08/25 21:22, Niklas Cassel wrote:
>> Add support for user space to set a max open zone and a max active zone
>> limit via configfs. By default, the default value is 0 == no limit.
>
> s/value is/values are/
>
>>
>> Call the block layer API functions used for exposing the configured
>> limits to sysfs.
>>
>> Add accounting in null_blk_zoned so that these new limits are respected.
>> Performing an operating that would exceed these limits results is a
>
> s/is a/in a/
>
>> standard I/O error.
>>
>> A max open zone limit exists in the ZBC standard.
>> While null_blk_zoned is used to test the Zoned Block Device model in
>> Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
>> mostly follows ZBC.
>>
>> Therefore, implement the manage open zone resources function from ZBC,
>> but additionally add support for max active zones.
>> This enables user space not only to test against a device with an open
>> zone limit, but also to test against a device with an active zone limit.
>>
>> Signed-off-by: Niklas Cassel <niklas.cassel@xxxxxxx>
>> ---
>>  drivers/block/null_blk.h       |   5 +
>>  drivers/block/null_blk_main.c  |  16 +-
>>  drivers/block/null_blk_zoned.c | 316 +++++++++++++++++++++++++++------
>>  3 files changed, 283 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
>> index daed4a9c34367..d2e7db43a52a7 100644
>> --- a/drivers/block/null_blk.h
>> +++ b/drivers/block/null_blk.h
>> @@ -42,6 +42,9 @@ struct nullb_device {
>>  	struct badblocks badblocks;
>>
>>  	unsigned int nr_zones;
>> +	unsigned int nr_zones_imp_open;
>> +	unsigned int nr_zones_exp_open;
>> +	unsigned int nr_zones_closed;
>>  	struct blk_zone *zones;
>>  	sector_t zone_size_sects;
>>
>> @@ -51,6 +54,8 @@ struct nullb_device {
>>  	unsigned long zone_size; /* zone size in MB if device is zoned */
>>  	unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
>>  	unsigned int zone_nr_conv; /* number of conventional zones */
>> +	unsigned int zone_max_open; /* max number of open zones */
>> +	unsigned int zone_max_active; /* max number of active zones */
>>  	unsigned int submit_queues; /* number of submission queues */
>>  	unsigned int home_node; /* home node for the device */
>>  	unsigned int queue_mode; /* block interface */
>> diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
>> index d74443a9c8fa2..53161a418611b 100644
>> --- a/drivers/block/null_blk_main.c
>> +++ b/drivers/block/null_blk_main.c
>> @@ -208,6 +208,14 @@ static unsigned int g_zone_nr_conv;
>>  module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
>>  MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
>>
>> +static unsigned int g_zone_max_open;
>> +module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
>> +
>> +static unsigned int g_zone_max_active;
>> +module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
>> +
>>  static struct nullb_device *null_alloc_dev(void);
>>  static void null_free_dev(struct nullb_device *dev);
>>  static void null_del_dev(struct nullb *nullb);
>> @@ -347,6 +355,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
>>  NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
>>  NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
>>  NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
>>
>>  static ssize_t nullb_device_power_show(struct config_item *item, char *page)
>>  {
>> @@ -464,6 +474,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
>>  	&nullb_device_attr_zone_size,
>>  	&nullb_device_attr_zone_capacity,
>>  	&nullb_device_attr_zone_nr_conv,
>> +	&nullb_device_attr_zone_max_open,
>> +	&nullb_device_attr_zone_max_active,
>>  	NULL,
>>  };
>>
>> @@ -517,7 +529,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
>>  static ssize_t memb_group_features_show(struct config_item *item, char *page)
>>  {
>>  	return snprintf(page, PAGE_SIZE,
>> -			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
>> +			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
>>  }
>>
>>  CONFIGFS_ATTR_RO(memb_group_, features);
>> @@ -580,6 +592,8 @@ static struct nullb_device *null_alloc_dev(void)
>>  	dev->zone_size = g_zone_size;
>>  	dev->zone_capacity = g_zone_capacity;
>>  	dev->zone_nr_conv = g_zone_nr_conv;
>> +	dev->zone_max_open = g_zone_max_open;
>> +	dev->zone_max_active = g_zone_max_active;
>>  	return dev;
>>  }
>>
>> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
>> index 3d25c9ad23831..5fb38c9bdd4ae 100644
>> --- a/drivers/block/null_blk_zoned.c
>> +++ b/drivers/block/null_blk_zoned.c
>> @@ -51,6 +51,24 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
>>  			dev->zone_nr_conv);
>>  	}
>>
>> +	/* Max active zones has to be <= number of sequential zones */
>> +	if (dev->zone_max_active > dev->nr_zones - dev->zone_nr_conv) {
>> +		dev->zone_max_active = dev->nr_zones - dev->zone_nr_conv;
>> +		pr_info("changed the maximum number of active zones to %u\n",
>> +			dev->nr_zones);
>> +	}
>
> if dev->zone_max_active == dev->nr_zones - dev->zone_nr_conv, you could also
> change dev->zone_max_active to 0, since that is equivalent. Not a blocker
> though, I think.
>
>> +
>> +	/* Max open zones has to be <= max active zones */
>> +	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
>> +		dev->zone_max_open = dev->zone_max_active;
>> +		pr_info("changed the maximum number of open zones to %u\n",
>> +			dev->nr_zones);
>> +	} else if (dev->zone_max_open > dev->nr_zones - dev->zone_nr_conv) {
>> +		dev->zone_max_open = dev->nr_zones - dev->zone_nr_conv;
>> +		pr_info("changed the maximum number of open zones to %u\n",
>> +			dev->nr_zones);
>> +	}
>
> And same here for zone_max_open.
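
For illustration, the clamping suggested above, with a limit equal to the
number of sequential zones treated the same as "no limit", could look
roughly like this in null_init_zoned_dev(). The nr_seq_zones local and the
pr_info wording are part of this sketch only, not of the posted patch:

	unsigned int nr_seq_zones = dev->nr_zones - dev->zone_nr_conv;

	/* A limit >= the number of sequential zones is the same as no limit */
	if (dev->zone_max_active >= nr_seq_zones) {
		dev->zone_max_active = 0;
		pr_info("zone_max_active limit disabled, limit >= zone count\n");
	}

	/* Max open zones has to be <= max active zones (when one is set) */
	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
		dev->zone_max_open = dev->zone_max_active;
		pr_info("changed the maximum number of open zones to %u\n",
			dev->zone_max_open);
	} else if (dev->zone_max_open >= nr_seq_zones) {
		dev->zone_max_open = 0;
		pr_info("zone_max_open limit disabled, limit >= zone count\n");
	}
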
>
>> +
>>  	for (i = 0; i < dev->zone_nr_conv; i++) {
>>  		struct blk_zone *zone = &dev->zones[i];
>>
>> @@ -99,6 +117,8 @@ int null_register_zoned_dev(struct nullb *nullb)
>>  	}
>>
>>  	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
>> +	blk_queue_max_open_zones(q, dev->zone_max_open);
>> +	blk_queue_max_active_zones(q, dev->zone_max_active);
>>
>>  	return 0;
>>  }
>> @@ -159,6 +179,99 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
>>  	return (zone->wp - sector) << SECTOR_SHIFT;
>>  }
>>
>> +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>> +
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_CLOSED:
>> +		/* close operation on closed is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EMPTY:
>> +	case BLK_ZONE_COND_FULL:
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	if (zone->wp == zone->start) {
>> +		zone->cond = BLK_ZONE_COND_EMPTY;
>> +	} else {
>> +		zone->cond = BLK_ZONE_COND_CLOSED;
>> +		dev->nr_zones_closed++;
>> +	}
>> +
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static void null_close_first_imp_zone(struct nullb_device *dev)
>> +{
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < dev->nr_zones; i++) {
>
> You can start the loop from dev->zone_nr_conv, that will avoid the first if below.
>
>> +		if (dev->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +			continue;
>> +		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
>> +			null_close_zone(dev, &dev->zones[i]);
>> +			return;
>> +		}
>> +	}
>> +}
>> +
>> +static bool null_room_in_active(struct nullb_device *dev)
>
> The name is a little strange. What about null_can_set_active() ?
>
>> +{
>> +	if (!dev->zone_max_active)
>> +		return true;
>> +
>> +	return dev->nr_zones_exp_open + dev->nr_zones_imp_open +
>> +	       dev->nr_zones_closed < dev->zone_max_active;
>> +}
>> +
>> +static bool null_room_in_open(struct nullb_device *dev)
>
> Same here: null_can_open() ?
>
>> +{
>> +	if (!dev->zone_max_open)
>> +		return true;
>> +
>> +	return dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open;
>> +}
>> +
>> +/*
>> + * This function matches the manage open zone resources function in the ZBC standard,
>> + * with the addition of max active zones support (added in the ZNS standard).
>> + *
>> + * ZBC states that an implicitly open zone shall be closed only if there is not
>> + * room within the open limit. However, if an active limit is set, it is not certain
>> + * that an implicitly opened zone can be closed without exceeding the active limit.
>
> Hu... imp open and close both being an active state, closing an imp open zone
> will never result in exceeding the max active limit. Did you mean something like
> "closing an imp open zone to open another one" ? or something else ? This needs
> clarification as the function name does not really tell us what this is trying
> to do. It looks like this is checking if a zone can be open... So why not merge
> that into null_room_in_open(), renamed null_can_open()?
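
One possible shape for the merged helper suggested here, i.e. a single
null_can_open() that also closes an implicitly open zone when doing so
frees up an open slot, is sketched below. It leaves out the extra
active-limit check questioned above, so treat it as a rough illustration
of the idea rather than a drop-in replacement:

static bool null_can_open(struct nullb_device *dev, struct blk_zone *zone)
{
	/* Opening an empty zone also consumes an active-zone resource */
	if (zone->cond == BLK_ZONE_COND_EMPTY && dev->zone_max_active &&
	    dev->nr_zones_exp_open + dev->nr_zones_imp_open +
	    dev->nr_zones_closed >= dev->zone_max_active)
		return false;

	/* Within the open limit (or no limit configured): nothing to do */
	if (!dev->zone_max_open ||
	    dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
		return true;

	/* No open slot left: try to free one by closing an implicitly open zone */
	if (dev->nr_zones_imp_open) {
		null_close_first_imp_zone(dev);
		return true;
	}

	return false;
}
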
>
>> + */
>> +static bool null_manage_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_room_in_active(dev))
>> +			return false;
>> +		fallthrough;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (null_room_in_open(dev)) {
>> +			return true;
>> +		} else if (dev->nr_zones_imp_open && null_room_in_active(dev)) {
>> +			null_close_first_imp_zone(dev);
>> +			return true;
>> +		} else {
>
> else is not needed here.
>
>> +			return false;
>> +		}
>> +	default:
>> +		/* Should never be called for other states */
>> +		WARN_ON(1);
>> +		return false;
>> +	}
>> +}
>> +
>>  static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>>  				    unsigned int nr_sectors, bool append)
>>  {
>> @@ -177,43 +290,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>>  		/* Cannot write to a full zone */
>>  		return BLK_STS_IOERR;
>>  	case BLK_ZONE_COND_EMPTY:
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>>  	case BLK_ZONE_COND_IMP_OPEN:
>>  	case BLK_ZONE_COND_EXP_OPEN:
>> +		break;
>> +	default:
>> +		/* Invalid zone condition */
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	/*
>> +	 * Regular writes must be at the write pointer position.
>> +	 * Zone append writes are automatically issued at the write
>> +	 * pointer and the position returned using the request or BIO
>> +	 * sector.
>> +	 */
>> +	if (append) {
>> +		sector = zone->wp;
>> +		if (cmd->bio)
>> +			cmd->bio->bi_iter.bi_sector = sector;
>> +		else
>> +			cmd->rq->__sector = sector;
>> +	} else if (sector != zone->wp) {
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> +		return BLK_STS_IOERR;
>> +
>> +	if (zone->cond == BLK_ZONE_COND_CLOSED) {
>> +		dev->nr_zones_closed--;
>> +		dev->nr_zones_imp_open++;
>> +	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
>> +		dev->nr_zones_imp_open++;
>> +	}
>> +	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> +		zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> +
>> +	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> +	if (ret != BLK_STS_OK)
>> +		return ret;
>> +
>> +	zone->wp += nr_sectors;
>> +	if (zone->wp == zone->start + zone->capacity) {
>> +		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
>> +			dev->nr_zones_exp_open--;
>> +		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
>> +			dev->nr_zones_imp_open--;
>> +		zone->cond = BLK_ZONE_COND_FULL;
>> +	}
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>> +
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		/* open operation on exp open is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>>  	case BLK_ZONE_COND_CLOSED:
>> -		/*
>> -		 * Regular writes must be at the write pointer position.
>> -		 * Zone append writes are automatically issued at the write
>> -		 * pointer and the position returned using the request or BIO
>> -		 * sector.
>> -		 */
>> -		if (append) {
>> -			sector = zone->wp;
>> -			if (cmd->bio)
>> -				cmd->bio->bi_iter.bi_sector = sector;
>> -			else
>> -				cmd->rq->__sector = sector;
>> -		} else if (sector != zone->wp) {
>> +		if (!null_manage_zone_resources(dev, zone))
>>  			return BLK_STS_IOERR;
>> -		}
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	case BLK_ZONE_COND_FULL:
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> +	dev->nr_zones_exp_open++;
>> +
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>>
>> -		if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_FULL:
>> +		/* finish operation on full is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (!null_manage_zone_resources(dev, zone))
>>  			return BLK_STS_IOERR;
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>>
>> -		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> -			zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> +	zone->cond = BLK_ZONE_COND_FULL;
>> +	zone->wp = zone->len;
>
> zone->wp = zone->start + zone->len;
>
>>
>> -		ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> -		if (ret != BLK_STS_OK)
>> -			return ret;
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>>
>> -		zone->wp += nr_sectors;
>> -		if (zone->wp == zone->start + zone->capacity)
>> -			zone->cond = BLK_ZONE_COND_FULL;
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EMPTY:
>> +		/* reset operation on empty is not an error */
>>  		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	case BLK_ZONE_COND_FULL:
>> +		break;
>>  	default:
>> -		/* Invalid zone condition */
>>  		return BLK_STS_IOERR;
>>  	}
>> +
>> +	zone->cond = BLK_ZONE_COND_EMPTY;
>> +	zone->wp = zone->start;
>> +
>> +	return BLK_STS_OK;
>>  }
>>
>>  static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>> @@ -222,49 +447,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>>  	struct nullb_device *dev = cmd->nq->dev;
>>  	unsigned int zone_no = null_zone_no(dev, sector);
>>  	struct blk_zone *zone = &dev->zones[zone_no];
>> +	blk_status_t ret;
>>  	size_t i;
>>
>>  	switch (op) {
>>  	case REQ_OP_ZONE_RESET_ALL:
>>  		for (i = 0; i < dev->nr_zones; i++) {
>> -			if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -				continue;
>> -			zone[i].cond = BLK_ZONE_COND_EMPTY;
>> -			zone[i].wp = zone[i].start;
>> +			null_reset_zone(dev, &dev->zones[i]);
>>  		}
>
> You can drop the curly brackets too, and start the loop from dev->zone_nr_conv too.
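
In other words, something along these lines for the RESET_ALL case
(sketch only):

	case REQ_OP_ZONE_RESET_ALL:
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++)
			null_reset_zone(dev, &dev->zones[i]);
		break;
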
>
>>  		break;
>>  	case REQ_OP_ZONE_RESET:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_EMPTY;
>> -		zone->wp = zone->start;
>> +		ret = null_reset_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
>
> You can return directly here:
>
> return null_reset_zone(dev, zone); Arg. No, you can't. There is the trace
> call after the switch. So please ignore this comment :) But you can still
> avoid repeating the "if (ret != BLK_STS_OK) return ret;" by calling the
> trace only for BLK_STS_OK and returning ret at the end.
>
>>  		break;
>>  	case REQ_OP_ZONE_OPEN:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -		if (zone->cond == BLK_ZONE_COND_FULL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> +		ret = null_open_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
>
> same here.
>
>>  		break;
>>  	case REQ_OP_ZONE_CLOSE:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -		if (zone->cond == BLK_ZONE_COND_FULL)
>> -			return BLK_STS_IOERR;
>> -
>> -		if (zone->wp == zone->start)
>> -			zone->cond = BLK_ZONE_COND_EMPTY;
>> -		else
>> -			zone->cond = BLK_ZONE_COND_CLOSED;
>> +		ret = null_close_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
>
> And here.
>
>>  		break;
>>  	case REQ_OP_ZONE_FINISH:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_FULL;
>> -		zone->wp = zone->start + zone->len;
>> +		ret = null_finish_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
>
> And here too.
>
>>  		break;
>>  	default:
>>  		return BLK_STS_NOTSUPP;
>>
>
> -- 
> Damien Le Moal
> Western Digital Research
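
As a closing illustration, the restructuring suggested above for
null_zone_mgmt() (tracing only on success and returning ret at the end)
could look roughly like the following, assuming the driver's existing
trace call after the switch is trace_nullb_zone_op():

	blk_status_t ret = BLK_STS_OK;
	size_t i;

	switch (op) {
	case REQ_OP_ZONE_RESET_ALL:
		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++)
			null_reset_zone(dev, &dev->zones[i]);
		break;
	case REQ_OP_ZONE_RESET:
		ret = null_reset_zone(dev, zone);
		break;
	case REQ_OP_ZONE_OPEN:
		ret = null_open_zone(dev, zone);
		break;
	case REQ_OP_ZONE_CLOSE:
		ret = null_close_zone(dev, zone);
		break;
	case REQ_OP_ZONE_FINISH:
		ret = null_finish_zone(dev, zone);
		break;
	default:
		return BLK_STS_NOTSUPP;
	}

	if (ret == BLK_STS_OK)
		trace_nullb_zone_op(cmd, zone_no, zone->cond);

	return ret;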