On Mon, 2021-01-18 at 19:25 +0100, Christoph Hellwig wrote:
> On Tue, Jan 12, 2021 at 07:52:27AM +0000, Damien Le Moal wrote:
> > >
> > > I do not understand the logic here, given that NVMe does not have
> > > conventional zones.
> >
> > 512e SAS & SATA SMR drives (512B logical, 4K physical) are a big thing, and for
> > these, all writes in sequential zones must be 4K aligned. So I suggested to
> > Chaitanya to simply use the physical block size as the LBA size for the target
> > to avoid weird IO errors that would not make sense in ZNS/NVMe world (e.g. 512B
> > aligned write requests failing).
>
> But in NVMe the physical block size exposes the atomic write unit, which
> could be way too large. If we want to do this cleanly we need to expose
> a minimum sequential zone write alignment value in the block layer.

What about something like this below to add to Chaitanya's series? This adds
the queue limit zone_write_granularity, which is set to the physical block size
for scsi, and for NVMe/ZNS too since that value is limited to the atomic write
unit size. Lightly tested with both 512e and 4kn SMR drives.

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 43990b1d148b..d6c2677a38df 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -60,6 +60,7 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->io_opt = 0;
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
+	lim->zone_write_granularity = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -341,6 +342,14 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
 		round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
 	limits->max_sectors =
 		round_down(limits->max_sectors, size >> SECTOR_SHIFT);
+
+	if (blk_queue_is_zoned(q)) {
+		if (limits->zone_write_granularity < limits->logical_block_size)
+			limits->zone_write_granularity =
+				limits->logical_block_size;
+		if (q->limits.zone_write_granularity < q->limits.io_min)
+			q->limits.zone_write_granularity = q->limits.io_min;
+	}
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
@@ -361,11 +370,39 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
 	if (q->limits.physical_block_size < q->limits.logical_block_size)
 		q->limits.physical_block_size = q->limits.logical_block_size;
 
-	if (q->limits.io_min < q->limits.physical_block_size)
+	if (q->limits.io_min < q->limits.physical_block_size) {
 		q->limits.io_min = q->limits.physical_block_size;
+		if (blk_queue_is_zoned(q)
+		    && q->limits.zone_write_granularity < q->limits.io_min)
+			q->limits.zone_write_granularity = q->limits.io_min;
+	}
 }
 EXPORT_SYMBOL(blk_queue_physical_block_size);
 
+/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q: the request queue for the zoned device
+ * @size: the zone write granularity size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible size allowing to write in
+ *   sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q, unsigned int size)
+{
+	if (WARN_ON(!blk_queue_is_zoned(q)))
+		return;
+
+	q->limits.zone_write_granularity = size;
+
+	if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+		q->limits.zone_write_granularity = q->limits.logical_block_size;
+
+	if (q->limits.zone_write_granularity < q->limits.io_min)
+		q->limits.zone_write_granularity = q->limits.io_min;
+}
+EXPORT_SYMBOL(blk_queue_zone_write_granularity);
+
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q: the request queue for the device
@@ -631,6 +668,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			t->discard_granularity;
 	}
 
+	t->zone_write_granularity = max(t->zone_write_granularity,
+					b->zone_write_granularity);
 	t->zoned = max(t->zoned, b->zoned);
 	return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b513f1683af0..7ea3dd4d876b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -219,6 +219,11 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
 		(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
 }
 
+static ssize_t queue_zone_write_granularity_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(q->limits.zone_write_granularity, page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
 	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -585,6 +590,7 @@ QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
 QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -639,6 +645,7 @@ static struct attribute *queue_attrs[] = {
 	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
+	&queue_zone_write_granularity_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nr_zones_entry.attr,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 1dfe9a3500e3..def76ac88248 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -113,6 +113,13 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
 	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
 	blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1);
 	blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1);
+
+	/*
+	 * The physical block size is limited to the Atomic Write Unit Power
+	 * Fail parameter. Use this value as the zone write granularity as it
+	 * may be different from the logical block size.
+	 */
+	blk_queue_zone_write_granularity(q, q->limits.physical_block_size);
 free_data:
 	kfree(id);
 	return status;
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index cf07b7f93579..41d602f7e62e 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -789,6 +789,16 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
 	blk_queue_max_active_zones(q, 0);
 	nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
 
+	/*
+	 * Per ZBC and ZAC specifications, writes in sequential write required
+	 * zones of host-managed devices must be aligned to the device physical
+	 * block size.
+	 */
+	if (blk_queue_zoned_model(q) == BLK_ZONED_HM)
+		blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
+	else
+		blk_queue_zone_write_granularity(q, sdkp->device->sector_size);
+
 	/* READ16/WRITE16 is mandatory for ZBC disks */
 	sdkp->device->use_16_for_rw = 1;
 	sdkp->device->use_10_for_rw = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f94ee3089e01..4b4df2644882 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -337,6 +337,7 @@ struct queue_limits {
 	unsigned int		max_zone_append_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
+	unsigned int		zone_write_granularity;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -1161,6 +1162,7 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
+void blk_queue_zone_write_granularity(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 void blk_queue_update_readahead(struct request_queue *q);

-- 
Damien Le Moal
Western Digital
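
For illustration only (not part of the patch above): a minimal userspace sketch of
how an application could consume the new limit once it is exported through sysfs
as /sys/block/<dev>/queue/zone_write_granularity. The disk name "sdb" and the
queue_limit() helper are assumptions for the example, not anything defined by the
patch.

/*
 * Read queue limits from sysfs and report the write alignment required
 * for sequential zones of a zoned block device.
 */
#include <stdio.h>
#include <stdlib.h>

/* Read an unsigned value from /sys/block/<disk>/queue/<attr>; 0 on failure. */
static unsigned long queue_limit(const char *disk, const char *attr)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%lu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	/* "sdb" is a placeholder for a zoned (SMR or ZNS) disk. */
	unsigned long zwg = queue_limit("sdb", "zone_write_granularity");
	unsigned long lbs = queue_limit("sdb", "logical_block_size");

	if (!zwg) {
		fprintf(stderr, "zone_write_granularity not available\n");
		return EXIT_FAILURE;
	}

	/*
	 * On a 512e SMR drive, lbs is 512 but zwg is 4096: writes to
	 * sequential zones must be sized and aligned to zwg, not just lbs.
	 */
	printf("logical block size: %lu, zone write granularity: %lu\n",
	       lbs, zwg);
	return EXIT_SUCCESS;
}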