Re: [PATCH] null_blk: add support for max open/active zone limit for zoned devices

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2020/08/26 7:52, Damien Le Moal wrote:
> On 2020/08/25 21:22, Niklas Cassel wrote:
>> Add support for user space to set a max open zone and a max active zone
>> limit via configfs. By default, the default value is 0 == no limit.
> 
> s/value is/values are/
> 
>>
>> Call the block layer API functions used for exposing the configured
>> limits to sysfs.
>>
>> Add accounting in null_blk_zoned so that these new limits are respected.
>> Performing an operating that would exceed these limits results is a
> 
> s/is a/in a/
> 
>> standard I/O error.
>>
>> A max open zone limit exists in the ZBC standard.
>> While null_blk_zoned is used to test the Zoned Block Device model in
>> Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
>> mostly follows ZBC.
>>
>> Therefore, implement the manage open zone resources function from ZBC,
>> but additionally add support for max active zones.
>> This enables user space not only to test against a device with an open
>> zone limit, but also to test against a device with an active zone limit.
>>
>> Signed-off-by: Niklas Cassel <niklas.cassel@xxxxxxx>
>> ---
>>  drivers/block/null_blk.h       |   5 +
>>  drivers/block/null_blk_main.c  |  16 +-
>>  drivers/block/null_blk_zoned.c | 316 +++++++++++++++++++++++++++------
>>  3 files changed, 283 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
>> index daed4a9c34367..d2e7db43a52a7 100644
>> --- a/drivers/block/null_blk.h
>> +++ b/drivers/block/null_blk.h
>> @@ -42,6 +42,9 @@ struct nullb_device {
>>  	struct badblocks badblocks;
>>  
>>  	unsigned int nr_zones;
>> +	unsigned int nr_zones_imp_open;
>> +	unsigned int nr_zones_exp_open;
>> +	unsigned int nr_zones_closed;
>>  	struct blk_zone *zones;
>>  	sector_t zone_size_sects;
>>  
>> @@ -51,6 +54,8 @@ struct nullb_device {
>>  	unsigned long zone_size; /* zone size in MB if device is zoned */
>>  	unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
>>  	unsigned int zone_nr_conv; /* number of conventional zones */
>> +	unsigned int zone_max_open; /* max number of open zones */
>> +	unsigned int zone_max_active; /* max number of active zones */
>>  	unsigned int submit_queues; /* number of submission queues */
>>  	unsigned int home_node; /* home node for the device */
>>  	unsigned int queue_mode; /* block interface */
>> diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
>> index d74443a9c8fa2..53161a418611b 100644
>> --- a/drivers/block/null_blk_main.c
>> +++ b/drivers/block/null_blk_main.c
>> @@ -208,6 +208,14 @@ static unsigned int g_zone_nr_conv;
>>  module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
>>  MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
>>  
>> +static unsigned int g_zone_max_open;
>> +module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
>> +
>> +static unsigned int g_zone_max_active;
>> +module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
>> +
>>  static struct nullb_device *null_alloc_dev(void);
>>  static void null_free_dev(struct nullb_device *dev);
>>  static void null_del_dev(struct nullb *nullb);
>> @@ -347,6 +355,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
>>  NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
>>  NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
>>  NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
>>  
>>  static ssize_t nullb_device_power_show(struct config_item *item, char *page)
>>  {
>> @@ -464,6 +474,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
>>  	&nullb_device_attr_zone_size,
>>  	&nullb_device_attr_zone_capacity,
>>  	&nullb_device_attr_zone_nr_conv,
>> +	&nullb_device_attr_zone_max_open,
>> +	&nullb_device_attr_zone_max_active,
>>  	NULL,
>>  };
>>  
>> @@ -517,7 +529,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
>>  static ssize_t memb_group_features_show(struct config_item *item, char *page)
>>  {
>>  	return snprintf(page, PAGE_SIZE,
>> -			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
>> +			"memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
>>  }
>>  
>>  CONFIGFS_ATTR_RO(memb_group_, features);
>> @@ -580,6 +592,8 @@ static struct nullb_device *null_alloc_dev(void)
>>  	dev->zone_size = g_zone_size;
>>  	dev->zone_capacity = g_zone_capacity;
>>  	dev->zone_nr_conv = g_zone_nr_conv;
>> +	dev->zone_max_open = g_zone_max_open;
>> +	dev->zone_max_active = g_zone_max_active;
>>  	return dev;
>>  }
>>  
>> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
>> index 3d25c9ad23831..5fb38c9bdd4ae 100644
>> --- a/drivers/block/null_blk_zoned.c
>> +++ b/drivers/block/null_blk_zoned.c
>> @@ -51,6 +51,24 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
>>  			dev->zone_nr_conv);
>>  	}
>>  
>> +	/* Max active zones has to be <= number of sequential zones */
>> +	if (dev->zone_max_active > dev->nr_zones - dev->zone_nr_conv) {
>> +		dev->zone_max_active = dev->nr_zones - dev->zone_nr_conv;
>> +		pr_info("changed the maximum number of active zones to %u\n",
>> +			dev->nr_zones);
>> +	}
> 
> if dev->zone_max_active == dev->nr_zones - dev->zone_nr_conv, you could also
> change dev->zone_max_active to 0, since that is equivalent. Not a blocker
> though, I think.
> 
>> +
>> +	/* Max open zones has to be <= max active zones */
>> +	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
>> +		dev->zone_max_open = dev->zone_max_active;
>> +		pr_info("changed the maximum number of open zones to %u\n",
>> +			dev->nr_zones);
>> +	} else if (dev->zone_max_open > dev->nr_zones - dev->zone_nr_conv) {
>> +		dev->zone_max_open = dev->nr_zones - dev->zone_nr_conv;
>> +		pr_info("changed the maximum number of open zones to %u\n",
>> +			dev->nr_zones);
>> +	}
> 
> And same here for zone_max_open.
> 
>> +
>>  	for (i = 0; i <  dev->zone_nr_conv; i++) {
>>  		struct blk_zone *zone = &dev->zones[i];
>>  
>> @@ -99,6 +117,8 @@ int null_register_zoned_dev(struct nullb *nullb)
>>  	}
>>  
>>  	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
>> +	blk_queue_max_open_zones(q, dev->zone_max_open);
>> +	blk_queue_max_active_zones(q, dev->zone_max_active);
>>  
>>  	return 0;
>>  }
>> @@ -159,6 +179,99 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
>>  	return (zone->wp - sector) << SECTOR_SHIFT;
>>  }
>>  
>> +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>> +
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_CLOSED:
>> +		/* close operation on closed is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EMPTY:
>> +	case BLK_ZONE_COND_FULL:
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	if (zone->wp == zone->start) {
>> +		zone->cond = BLK_ZONE_COND_EMPTY;
>> +	} else {
>> +		zone->cond = BLK_ZONE_COND_CLOSED;
>> +		dev->nr_zones_closed++;
>> +	}
>> +
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static void null_close_first_imp_zone(struct nullb_device *dev)
>> +{
>> +	unsigned int i;
>> +
>> +	for (i = 0; i < dev->nr_zones; i++) {
> 
> You can start the loop from dev->nr_conv_zones, that will avoid the first if below.
> 
>> +		if (dev->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +			continue;
>> +		if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
>> +			null_close_zone(dev, &dev->zones[i]);
>> +			return;
>> +		}
>> +	}
>> +}
>> +
>> +static bool null_room_in_active(struct nullb_device *dev)
> 
> The name is a little strange. What about null_can_set_active() ?
> 
>> +{
>> +	if (!dev->zone_max_active)
>> +		return true;
>> +
>> +	return dev->nr_zones_exp_open + dev->nr_zones_imp_open +
>> +	       dev->nr_zones_closed < dev->zone_max_active;
>> +}
>> +
>> +static bool null_room_in_open(struct nullb_device *dev)
> 
> Same here: null_can_open() ?
> 
>> +{
>> +	if (!dev->zone_max_open)
>> +		return true;
>> +
>> +	return dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open;
>> +}
>> +
>> +/*
>> + * This function matches the manage open zone resources function in the ZBC standard,
>> + * with the addition of max active zones support (added in the ZNS standard).
>> + *
>> + * ZBC states that an implicitly open zone shall be closed only if there is not
>> + * room within the open limit. However, if an active limit is set, it is not certain
>> + * that an implicitly opened zone can be closed without exceeding the active limit.
> 
> Hu... imp open and close both being an active state, closing an imp open zone
> will never result in exceeding the max active limit. Did you mean something like
> "closing an imp open zone to open another one" ? or something else ? This needs
> clarification as the function name does not really tell us what this is trying
> to do. It looks like this is checking if a zone can be open... So why not merge
> that into null_room_in_open(), renamed null_can_open() ?
> 
>> + */
>> +static bool null_manage_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_room_in_active(dev))
>> +			return false;
>> +		fallthrough;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (null_room_in_open(dev)) {
>> +			return true;
>> +		} else if (dev->nr_zones_imp_open && null_room_in_active(dev)) {
>> +			null_close_first_imp_zone(dev);
>> +			return true;
>> +		} else {
> 
> else is not needed here.
> 
>> +			return false;
>> +		}
>> +	default:
>> +		/* Should never be called for other states */
>> +		WARN_ON(1);
>> +		return false;
>> +	}
>> +}
>> +
>>  static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>>  				    unsigned int nr_sectors, bool append)
>>  {
>> @@ -177,43 +290,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>>  		/* Cannot write to a full zone */
>>  		return BLK_STS_IOERR;
>>  	case BLK_ZONE_COND_EMPTY:
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>>  	case BLK_ZONE_COND_IMP_OPEN:
>>  	case BLK_ZONE_COND_EXP_OPEN:
>> +		break;
>> +	default:
>> +		/* Invalid zone condition */
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	/*
>> +	 * Regular writes must be at the write pointer position.
>> +	 * Zone append writes are automatically issued at the write
>> +	 * pointer and the position returned using the request or BIO
>> +	 * sector.
>> +	 */
>> +	if (append) {
>> +		sector = zone->wp;
>> +		if (cmd->bio)
>> +			cmd->bio->bi_iter.bi_sector = sector;
>> +		else
>> +			cmd->rq->__sector = sector;
>> +	} else if (sector != zone->wp) {
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> +		return BLK_STS_IOERR;
>> +
>> +	if (zone->cond == BLK_ZONE_COND_CLOSED) {
>> +		dev->nr_zones_closed--;
>> +		dev->nr_zones_imp_open++;
>> +	} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
>> +		dev->nr_zones_imp_open++;
>> +	}
>> +	if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> +		zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> +
>> +	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> +	if (ret != BLK_STS_OK)
>> +		return ret;
>> +
>> +	zone->wp += nr_sectors;
>> +	if (zone->wp == zone->start + zone->capacity) {
>> +		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
>> +			dev->nr_zones_exp_open--;
>> +		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
>> +			dev->nr_zones_imp_open--;
>> +		zone->cond = BLK_ZONE_COND_FULL;
>> +	}
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>> +
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		/* open operation on exp open is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>>  	case BLK_ZONE_COND_CLOSED:
>> -		/*
>> -		 * Regular writes must be at the write pointer position.
>> -		 * Zone append writes are automatically issued at the write
>> -		 * pointer and the position returned using the request or BIO
>> -		 * sector.
>> -		 */
>> -		if (append) {
>> -			sector = zone->wp;
>> -			if (cmd->bio)
>> -				cmd->bio->bi_iter.bi_sector = sector;
>> -			else
>> -				cmd->rq->__sector = sector;
>> -		} else if (sector != zone->wp) {
>> +		if (!null_manage_zone_resources(dev, zone))
>>  			return BLK_STS_IOERR;
>> -		}
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	case BLK_ZONE_COND_FULL:
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>> +
>> +	zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> +	dev->nr_zones_exp_open++;
>> +
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>>  
>> -		if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_FULL:
>> +		/* finish operation on full is not an error */
>> +		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_EMPTY:
>> +		if (!null_manage_zone_resources(dev, zone))
>> +			return BLK_STS_IOERR;
>> +		break;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		if (!null_manage_zone_resources(dev, zone))
>>  			return BLK_STS_IOERR;
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	default:
>> +		return BLK_STS_IOERR;
>> +	}
>>  
>> -		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> -			zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> +	zone->cond = BLK_ZONE_COND_FULL;
>> +	zone->wp = zone->len;
> 
> zone->wp = zone=>start + zone->len;
> 
>>  
>> -		ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> -		if (ret != BLK_STS_OK)
>> -			return ret;
>> +	return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> +	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> +		return BLK_STS_IOERR;
>>  
>> -		zone->wp += nr_sectors;
>> -		if (zone->wp == zone->start + zone->capacity)
>> -			zone->cond = BLK_ZONE_COND_FULL;
>> +	switch (zone->cond) {
>> +	case BLK_ZONE_COND_EMPTY:
>> +		/* reset operation on empty is not an error */
>>  		return BLK_STS_OK;
>> +	case BLK_ZONE_COND_IMP_OPEN:
>> +		dev->nr_zones_imp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_EXP_OPEN:
>> +		dev->nr_zones_exp_open--;
>> +		break;
>> +	case BLK_ZONE_COND_CLOSED:
>> +		dev->nr_zones_closed--;
>> +		break;
>> +	case BLK_ZONE_COND_FULL:
>> +		break;
>>  	default:
>> -		/* Invalid zone condition */
>>  		return BLK_STS_IOERR;
>>  	}
>> +
>> +	zone->cond = BLK_ZONE_COND_EMPTY;
>> +	zone->wp = zone->start;
>> +
>> +	return BLK_STS_OK;
>>  }
>>  
>>  static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>> @@ -222,49 +447,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>>  	struct nullb_device *dev = cmd->nq->dev;
>>  	unsigned int zone_no = null_zone_no(dev, sector);
>>  	struct blk_zone *zone = &dev->zones[zone_no];
>> +	blk_status_t ret;
>>  	size_t i;
>>  
>>  	switch (op) {
>>  	case REQ_OP_ZONE_RESET_ALL:
>>  		for (i = 0; i < dev->nr_zones; i++) {
>> -			if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -				continue;
>> -			zone[i].cond = BLK_ZONE_COND_EMPTY;
>> -			zone[i].wp = zone[i].start;
>> +			null_reset_zone(dev, &dev->zones[i]);
>>  		}
> 
> You can drop the curly brackets too, and start the loop from nr_conv_zones too.
> 
>>  		break;
>>  	case REQ_OP_ZONE_RESET:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_EMPTY;
>> -		zone->wp = zone->start;
>> +		ret = null_reset_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
> 
> You can return directly here:
> 
> return null_reset_zone(dev, zone);

Arg. No, you can't. There is the trace call after the switch. So please ignore
this comment :)

But you can still avoid repeating the "if (ret != BLK_STS_OK) return ret;" by
calling the trace only for BLK_STS_OK and returning ret at the end.

> 
>>  		break;
>>  	case REQ_OP_ZONE_OPEN:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -		if (zone->cond == BLK_ZONE_COND_FULL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> +		ret = null_open_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
> 
> same here.
> 
>>  		break;
>>  	case REQ_OP_ZONE_CLOSE:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -		if (zone->cond == BLK_ZONE_COND_FULL)
>> -			return BLK_STS_IOERR;
>> -
>> -		if (zone->wp == zone->start)
>> -			zone->cond = BLK_ZONE_COND_EMPTY;
>> -		else
>> -			zone->cond = BLK_ZONE_COND_CLOSED;
>> +		ret = null_close_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
> 
> And here.
> 
>>  		break;
>>  	case REQ_OP_ZONE_FINISH:
>> -		if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> -			return BLK_STS_IOERR;
>> -
>> -		zone->cond = BLK_ZONE_COND_FULL;
>> -		zone->wp = zone->start + zone->len;
>> +		ret = null_finish_zone(dev, zone);
>> +		if (ret != BLK_STS_OK)
>> +			return ret;
> 
> And here too.
> 
>>  		break;
>>  	default:
>>  		return BLK_STS_NOTSUPP;
>>
> 
> 


-- 
Damien Le Moal
Western Digital Research




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux