On Sun, Oct 30, 2022 at 12:35:45AM -0400, Dmitry Fomichev wrote:
> This patch adds support for Zoned Block Devices (ZBDs) to the kernel
> virtio-blk driver.
>
> The patch accompanies the virtio-blk ZBD support draft that is now
> being proposed for standardization. The latest version of the draft is
> linked at
>
> https://github.com/oasis-tcs/virtio-spec/issues/143 .
>
> The QEMU zoned device code that implements these protocol extensions
> has been developed by Sam Li and is currently under review on the QEMU
> mailing list.
>
> A number of virtblk request structure changes have been introduced to
> accommodate the functionality that is specific to zoned block devices
> and, most importantly, to make room for carrying the Zoned Append sector
> value from the device back to the driver along with the request status.
>
> The zone-specific code in the patch is heavily influenced by the NVMe ZNS
> code in drivers/nvme/host/zns.c, but it is simpler because the proposed
> virtio ZBD draft only covers the zoned device features that are
> relevant to the zoned functionality provided by the Linux block layer.
>
> Co-developed-by: Stefan Hajnoczi <stefanha@xxxxxxxxx>
> Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxx>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@xxxxxxx>
> ---
>  drivers/block/virtio_blk.c      | 438 ++++++++++++++++++++++++++++++--
>  include/uapi/linux/virtio_blk.h | 105 ++++++++
>  2 files changed, 524 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 3efe3da5f8c2..03b5302fac6e 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -15,6 +15,7 @@
>  #include <linux/blk-mq.h>
>  #include <linux/blk-mq-virtio.h>
>  #include <linux/numa.h>
> +#include <linux/vmalloc.h>
>  #include <uapi/linux/virtio_ring.h>
>
>  #define PART_BITS 4
> @@ -80,22 +81,51 @@ struct virtio_blk {
>          int num_vqs;
>          int io_queues[HCTX_MAX_TYPES];
>          struct virtio_blk_vq *vqs;
> +
> +        /* For zoned device */
> +        unsigned int zone_sectors;
>  };
>
>  struct virtblk_req {
> +        /* Out header */
>          struct virtio_blk_outhdr out_hdr;
> -        u8 status;
> +
> +        /* In header */
> +        union {
> +                struct {
> +                        u8 status;
> +                } common;
> +
> +                /*
> +                 * The zone append command has an extended in header.
> +                 * The status field in zone_append_in_hdr must always
> +                 * be the last byte.
> +                 */
> +                struct {
> +                        __virtio64 sector;
> +                        u8 status;
> +                } zone_append;
> +        } in_hdr;
> +
> +        size_t in_hdr_len;
> +
>          struct sg_table sg_table;
>          struct scatterlist sg[];
>  };
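Tangentially: since virtblk_vbr_status() below relies on the status byte
being the last byte of every in-header variant, that invariant could also
be enforced at build time. A minimal sketch (not part of the patch),
assuming the struct layout quoted above and the kernel's static_assert()
and sizeof_field() helpers; note that a check like this would also flag
any tail padding the compiler inserts after status (e.g. due to the
natural alignment of __virtio64 in the same struct), which is exactly the
situation the comment wants to rule out:

        /* Build-time check: the status byte must be the last byte of
         * each in-header variant, because the completion path reads it
         * at offset in_hdr_len - 1 from the start of in_hdr.
         */
        static_assert(offsetof(struct virtblk_req, in_hdr.common.status) -
                      offsetof(struct virtblk_req, in_hdr) ==
                      sizeof_field(struct virtblk_req, in_hdr.common) - 1,
                      "status must end the common in-header");
        static_assert(offsetof(struct virtblk_req, in_hdr.zone_append.status) -
                      offsetof(struct virtblk_req, in_hdr) ==
                      sizeof_field(struct virtblk_req, in_hdr.zone_append) - 1,
                      "status must end the zone_append in-header");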
>
> -static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
> +static inline blk_status_t virtblk_result(u8 status)
>  {
> -        switch (vbr->status) {
> +        switch (status) {
>          case VIRTIO_BLK_S_OK:
>                  return BLK_STS_OK;
>          case VIRTIO_BLK_S_UNSUPP:
>                  return BLK_STS_NOTSUPP;
> +        case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE:
> +                return BLK_STS_ZONE_OPEN_RESOURCE;
> +        case VIRTIO_BLK_S_ZONE_ACTIVE_RESOURCE:
> +                return BLK_STS_ZONE_ACTIVE_RESOURCE;
> +        case VIRTIO_BLK_S_IOERR:
> +        case VIRTIO_BLK_S_ZONE_UNALIGNED_WP:
>          default:
>                  return BLK_STS_IOERR;
>          }
> @@ -111,11 +141,11 @@ static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx
>
>  static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
>  {
> -        struct scatterlist hdr, status, *sgs[3];
> +        struct scatterlist out_hdr, in_hdr, *sgs[3];
>          unsigned int num_out = 0, num_in = 0;
>
> -        sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
> -        sgs[num_out++] = &hdr;
> +        sg_init_one(&out_hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
> +        sgs[num_out++] = &out_hdr;
>
>          if (vbr->sg_table.nents) {
>                  if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
> @@ -124,8 +154,8 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
>                  sgs[num_out + num_in++] = vbr->sg_table.sgl;
>          }
>
> -        sg_init_one(&status, &vbr->status, sizeof(vbr->status));
> -        sgs[num_out + num_in++] = &status;
> +        sg_init_one(&in_hdr, &vbr->in_hdr.common.status, vbr->in_hdr_len);
> +        sgs[num_out + num_in++] = &in_hdr;
>
>          return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
>  }
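To make the new layout easier to review, here is the descriptor chain
that virtblk_add_req() ends up building for a zone append request, as I
read it. This is only a sketch of the layout implied by the structures
quoted above (sizes from virtio_blk_outhdr; data direction per the OUT
bit in the type value), not spec text:

        /*
         * VIRTIO_BLK_T_ZONE_APPEND descriptor chain (sketch):
         *
         *   sgs[0]  driver->device  struct virtio_blk_outhdr
         *                           (16 bytes: type, ioprio, sector)
         *   sgs[1]  driver->device  the data to be appended
         *   sgs[2]  device->driver  in_hdr.zone_append (the 64-bit
         *                           written sector, then the status byte)
         *
         * Requests with the plain in-header (e.g. reads and writes)
         * differ only in sgs[2], which then covers just the status byte
         * because in_hdr_len is 1.
         */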
> @@ -214,21 +244,22 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
>                                            struct request *req,
>                                            struct virtblk_req *vbr)
>  {
> +        size_t in_hdr_len = sizeof(vbr->in_hdr.common.status);
>          bool unmap = false;
>          u32 type;
> +        u64 sector = 0;
>
> -        vbr->out_hdr.sector = 0;
> +        /* Set fields for all request types */
> +        vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
>
>          switch (req_op(req)) {
>          case REQ_OP_READ:
>                  type = VIRTIO_BLK_T_IN;
> -                vbr->out_hdr.sector = cpu_to_virtio64(vdev,
> -                                                      blk_rq_pos(req));
> +                sector = blk_rq_pos(req);
>                  break;
>          case REQ_OP_WRITE:
>                  type = VIRTIO_BLK_T_OUT;
> -                vbr->out_hdr.sector = cpu_to_virtio64(vdev,
> -                                                      blk_rq_pos(req));
> +                sector = blk_rq_pos(req);
>                  break;
>          case REQ_OP_FLUSH:
>                  type = VIRTIO_BLK_T_FLUSH;
> @@ -243,16 +274,42 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
>          case REQ_OP_SECURE_ERASE:
>                  type = VIRTIO_BLK_T_SECURE_ERASE;
>                  break;
> -        case REQ_OP_DRV_IN:
> -                type = VIRTIO_BLK_T_GET_ID;
> +        case REQ_OP_ZONE_OPEN:
> +                type = VIRTIO_BLK_T_ZONE_OPEN;
> +                sector = blk_rq_pos(req);
>                  break;
> +        case REQ_OP_ZONE_CLOSE:
> +                type = VIRTIO_BLK_T_ZONE_CLOSE;
> +                sector = blk_rq_pos(req);
> +                break;
> +        case REQ_OP_ZONE_FINISH:
> +                type = VIRTIO_BLK_T_ZONE_FINISH;
> +                sector = blk_rq_pos(req);
> +                break;
> +        case REQ_OP_ZONE_APPEND:
> +                type = VIRTIO_BLK_T_ZONE_APPEND;
> +                sector = blk_rq_pos(req);
> +                in_hdr_len = sizeof(vbr->in_hdr.zone_append);
> +                break;
> +        case REQ_OP_ZONE_RESET:
> +                type = VIRTIO_BLK_T_ZONE_RESET;
> +                sector = blk_rq_pos(req);
> +                break;
> +        case REQ_OP_ZONE_RESET_ALL:
> +                type = VIRTIO_BLK_T_ZONE_RESET_ALL;
> +                break;
> +        case REQ_OP_DRV_IN:
> +                /* Out header already filled in, nothing to do */
> +                return 0;
>          default:
>                  WARN_ON_ONCE(1);
>                  return BLK_STS_IOERR;
>          }
>
> +        /* Set fields for non-REQ_OP_DRV_IN request types */
> +        vbr->in_hdr_len = in_hdr_len;
>          vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
> -        vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
> +        vbr->out_hdr.sector = cpu_to_virtio64(vdev, sector);
>
>          if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES ||
>              type == VIRTIO_BLK_T_SECURE_ERASE) {
> @@ -263,13 +320,30 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
>          return 0;
>  }
>
> +/*
> + * The status byte is always the last byte of the virtblk request
> + * in-header. This helper fetches its value for all in-header formats
> + * that are currently defined.
> + */
> +static inline u8 virtblk_vbr_status(struct virtblk_req *vbr)
> +{
> +        return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1);
> +}
> +
>  static inline void virtblk_request_done(struct request *req)
>  {
>          struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
> +        blk_status_t status = virtblk_result(virtblk_vbr_status(vbr));
> +        struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
>
>          virtblk_unmap_data(req, vbr);
>          virtblk_cleanup_cmd(req);
> -        blk_mq_end_request(req, virtblk_result(vbr));
> +
> +        if (req_op(req) == REQ_OP_ZONE_APPEND)
> +                req->__sector = virtio64_to_cpu(vblk->vdev,
> +                                                vbr->in_hdr.zone_append.sector);
> +
> +        blk_mq_end_request(req, status);
>  }
>
>  static void virtblk_done(struct virtqueue *vq)
> @@ -455,6 +529,315 @@ static void virtio_queue_rqs(struct request **rqlist)
>          *rqlist = requeue_list;
>  }
>
> +#ifdef CONFIG_BLK_DEV_ZONED
> +static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk,
> +                                         unsigned int nr_zones,
> +                                         unsigned int zone_sectors,
> +                                         size_t *buflen)
> +{
> +        struct request_queue *q = vblk->disk->queue;
> +        size_t bufsize;
> +        void *buf;
> +
> +        nr_zones = min_t(unsigned int, nr_zones,
> +                         get_capacity(vblk->disk) >> ilog2(zone_sectors));
> +
> +        bufsize = sizeof(struct virtio_blk_zone_report) +
> +                nr_zones * sizeof(struct virtio_blk_zone_descriptor);
> +        bufsize = min_t(size_t, bufsize,
> +                        queue_max_hw_sectors(q) << SECTOR_SHIFT);
> +        bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT);
> +
> +        while (bufsize >= sizeof(struct virtio_blk_zone_report)) {
> +                buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
> +                if (buf) {
> +                        *buflen = bufsize;
> +                        return buf;
> +                }
> +                bufsize >>= 1;
> +        }
> +
> +        return NULL;
> +}
> +
> +static int virtblk_submit_zone_report(struct virtio_blk *vblk,
> +                                      char *report_buf, size_t report_len,
> +                                      sector_t sector)
> +{
> +        struct request_queue *q = vblk->disk->queue;
> +        struct request *req;
> +        struct virtblk_req *vbr;
> +        int err;
> +
> +        req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
> +        if (IS_ERR(req))
> +                return PTR_ERR(req);
> +
> +        vbr = blk_mq_rq_to_pdu(req);
> +        vbr->in_hdr_len = sizeof(vbr->in_hdr.common.status);
> +        vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
> +        vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
> +
> +        err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
> +        if (err)
> +                goto out;
> +
> +        blk_execute_rq(req, false);
> +        err = blk_status_to_errno(virtblk_result(vbr->in_hdr.common.status));
> +out:
> +        blk_mq_free_request(req);
> +        return err;
> +}
> +
> +static int virtblk_parse_zone(struct virtio_blk *vblk,
> +                              struct virtio_blk_zone_descriptor *entry,
> +                              unsigned int idx, unsigned int zone_sectors,
> +                              report_zones_cb cb, void *data)
> +{
> +        struct blk_zone zone = { };
> +
> +        zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start);
> +        zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap);
> +
> +        switch (entry->z_type) {
> +        case VIRTIO_BLK_ZT_SWR:
> +                zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
> +                break;
> +        case VIRTIO_BLK_ZT_SWP:
> +                zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF;
> +                break;
> +        case VIRTIO_BLK_ZT_CONV:
> +                zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
> +                break;
> +        default:
> +                dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n",
> +                        zone.start, entry->z_type);
> +                return -EINVAL;
> +        }
> +
> +        switch (entry->z_state) {
> +        case VIRTIO_BLK_ZS_EMPTY:
> +                zone.cond = BLK_ZONE_COND_EMPTY;
> +                break;
> +        case VIRTIO_BLK_ZS_CLOSED:
> +                zone.cond = BLK_ZONE_COND_CLOSED;
> +                break;
> +        case VIRTIO_BLK_ZS_FULL:
> +                zone.cond = BLK_ZONE_COND_FULL;
> +                break;
> +        case VIRTIO_BLK_ZS_EOPEN:
> +                zone.cond = BLK_ZONE_COND_EXP_OPEN;
> +                break;
> +        case VIRTIO_BLK_ZS_IOPEN:
> +                zone.cond = BLK_ZONE_COND_IMP_OPEN;
> +                break;
> +        case VIRTIO_BLK_ZS_NOT_WP:
> +                zone.cond = BLK_ZONE_COND_NOT_WP;
> +                break;
> +        case VIRTIO_BLK_ZS_RDONLY:
> +                zone.cond = BLK_ZONE_COND_READONLY;
> +                break;
> +        case VIRTIO_BLK_ZS_OFFLINE:
> +                zone.cond = BLK_ZONE_COND_OFFLINE;
> +                break;
> +        default:
> +                dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n",
> +                        zone.start, entry->z_state);
> +                return -EINVAL;
> +        }
> +
> +        zone.len = zone_sectors;
> +        if (zone.cond == BLK_ZONE_COND_FULL)
> +                zone.wp = zone.start + zone.len;

Is this correct on devices where the last zone != zone_sectors? Maybe it
doesn't matter?

> +        else
> +                zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);

Is a sanity check needed here? The device is not trusted and must not be
able to trigger out-of-bounds accesses based on zone.wp or similar.
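For illustration, one shape such a check could take. This is only a
sketch, not something the patch does; zone.len and zone.start are already
set at this point in the function:

        zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp);
        /* Reject a write pointer that the untrusted device reports
         * outside the zone it belongs to, before handing it to the
         * block layer.
         */
        if (zone.wp < zone.start || zone.wp > zone.start + zone.len) {
                dev_err(&vblk->vdev->dev,
                        "zone %llu: invalid write pointer %llu\n",
                        zone.start, zone.wp);
                return -EINVAL;
        }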
> +
> +        return cb(&zone, idx, data);
> +}
> +
> +static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
> +                                unsigned int nr_zones, report_zones_cb cb,
> +                                void *data)
> +{
> +        struct virtio_blk *vblk = disk->private_data;
> +        struct virtio_blk_zone_report *report;
> +        unsigned long long nz, i;
> +        size_t buflen;
> +        unsigned int zone_sectors = vblk->zone_sectors;
> +        int ret, zone_idx = 0;
> +
> +        if (WARN_ON_ONCE(!vblk->zone_sectors))
> +                return -EOPNOTSUPP;
> +
> +        report = virtblk_alloc_report_buffer(vblk, nr_zones,
> +                                             zone_sectors, &buflen);
> +        if (!report)
> +                return -ENOMEM;
> +
> +        while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) {

Why is zone_idx a signed int while nr_zones is an unsigned int?

> +                memset(report, 0, buflen);
> +
> +                ret = virtblk_submit_zone_report(vblk, (char *)report,
> +                                                 buflen, sector);
> +                if (ret) {
> +                        if (ret > 0)
> +                                ret = -EIO;
> +                        goto out_free;
> +                }
> +                nz = min(virtio64_to_cpu(vblk->vdev, report->nr_zones),
> +                         (u64)nr_zones);
> +                if (!nz)
> +                        break;
> +
> +                for (i = 0; i < nz && zone_idx < nr_zones; i++) {
> +                        ret = virtblk_parse_zone(vblk, &report->zones[i],
> +                                                 zone_idx, zone_sectors, cb, data);
> +                        if (ret)
> +                                goto out_free;
> +
> +                        sector = virtio64_to_cpu(vblk->vdev,
> +                                                 report->zones[i].z_start) +
> +                                 zone_sectors;
> +                        zone_idx++;
> +                }
> +        }
> +
> +        if (zone_idx > 0)
> +                ret = zone_idx;
> +        else
> +                ret = -EINVAL;
> +out_free:
> +        kvfree(report);
> +        return ret;
> +}
> +
> +static void virtblk_revalidate_zones(struct virtio_blk *vblk)
> +{
> +        u8 model;
> +
> +        if (!vblk->zone_sectors)
> +                return;
> +
> +        virtio_cread(vblk->vdev, struct virtio_blk_config,
> +                     zoned.model, &model);

model is unused? If used, then it needs to be validated like in
virtblk_probe_zoned_device().
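If the model is meant to be used here, a sketch of what that validation
might look like (not part of the patch). It assumes that a model change
away from host-managed should be treated like the device disappearing,
and relies on blk_revalidate_disk_zones() returning 0 on success:

        static void virtblk_revalidate_zones(struct virtio_blk *vblk)
        {
                u8 model;

                if (!vblk->zone_sectors)
                        return;

                virtio_cread(vblk->vdev, struct virtio_blk_config,
                             zoned.model, &model);
                /* Sketch: a device that is no longer host-managed, or
                 * that fails zone revalidation, is treated as gone.
                 */
                if (model != VIRTIO_BLK_Z_HM ||
                    blk_revalidate_disk_zones(vblk->disk, NULL))
                        set_capacity_and_notify(vblk->disk, 0);
        }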
> +        if (!blk_revalidate_disk_zones(vblk->disk, NULL))
> +                set_capacity_and_notify(vblk->disk, 0);
> +}
> +
> +static int virtblk_probe_zoned_device(struct virtio_device *vdev,
> +                                      struct virtio_blk *vblk,
> +                                      struct request_queue *q)
> +{
> +        u32 v, wg;
> +        u8 model;
> +        int ret;
> +
> +        virtio_cread(vdev, struct virtio_blk_config,
> +                     zoned.model, &model);
> +
> +        switch (model) {
> +        case VIRTIO_BLK_Z_NONE:
> +                return 0;
> +        case VIRTIO_BLK_Z_HM:
> +                break;
> +        case VIRTIO_BLK_Z_HA:
> +                /*
> +                 * Present the host-aware device as a regular drive.
> +                 * TODO It is possible to add an option to make it appear
> +                 * in the system as a zoned drive.
> +                 */
> +                return 0;
> +        default:
> +                dev_err(&vdev->dev, "unsupported zone model %d\n", model);
> +                return -EINVAL;
> +        }
> +
> +        dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
> +
> +        disk_set_zoned(vblk->disk, BLK_ZONED_HM);
> +        blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
> +
> +        virtio_cread(vdev, struct virtio_blk_config,
> +                     zoned.max_open_zones, &v);
> +        disk_set_max_open_zones(vblk->disk, v);
> +
> +        dev_dbg(&vdev->dev, "max open zones = %u\n", v);
> +
> +        virtio_cread(vdev, struct virtio_blk_config,
> +                     zoned.max_active_zones, &v);
> +        disk_set_max_active_zones(vblk->disk, v);
> +        dev_dbg(&vdev->dev, "max active zones = %u\n", v);
> +
> +        virtio_cread(vdev, struct virtio_blk_config,
> +                     zoned.write_granularity, &wg);
> +        if (!wg) {
> +                dev_warn(&vdev->dev, "zero write granularity reported\n");
> +                return -ENODEV;
> +        }
> +        blk_queue_physical_block_size(q, wg);
> +        blk_queue_io_min(q, wg);
> +
> +        dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
> +
> +        /*
> +         * virtio ZBD specification doesn't require zones to be a power of
> +         * two sectors in size, but the code in this driver expects that.
> +         */
> +        virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors,
> +                     &vblk->zone_sectors);
> +        if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) {
> +                dev_err(&vdev->dev,
> +                        "zoned device with non power of two zone size %u\n",
> +                        vblk->zone_sectors);
> +                return -ENODEV;
> +        }
> +        dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
> +
> +        if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
> +                dev_warn(&vblk->vdev->dev,
> +                         "ignoring negotiated F_DISCARD for zoned device\n");
> +                blk_queue_max_discard_sectors(q, 0);
> +        }
> +
> +        ret = blk_revalidate_disk_zones(vblk->disk, NULL);
> +        if (!ret) {
> +                virtio_cread(vdev, struct virtio_blk_config,
> +                             zoned.max_append_sectors, &v);
> +                if (!v) {
> +                        dev_warn(&vdev->dev, "zero max_append_sectors reported\n");
> +                        return -ENODEV;
> +                }
> +                if ((v << SECTOR_SHIFT) < wg) {
> +                        dev_err(&vdev->dev,
> +                                "write granularity %u exceeds max_append_sectors %u limit\n",
> +                                wg, v);
> +                        return -ENODEV;
> +                }
> +
> +                blk_queue_max_zone_append_sectors(q, v);
> +                dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
> +        }
> +
> +        return ret;
> +}
> +
> +#else
> +
> +/*
> + * Zoned block device support is not configured in this kernel.
> + * We only need to define a few symbols to avoid compilation errors.
> + */
> +#define virtblk_report_zones NULL
> +static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
> +{
> +}
> +static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
> +                        struct virtio_blk *vblk, struct request_queue *q)
> +{
> +        return -EOPNOTSUPP;
> +}
> +#endif /* CONFIG_BLK_DEV_ZONED */
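For context, since the call site is not in the hunks quoted here: with
these stubs in place, the probe path can call the zoned probe without any
#ifdef. Roughly like this, assuming the VIRTIO_BLK_F_ZONED feature bit
from the draft spec and a hypothetical out_cleanup_disk error label in
virtblk_probe():

        if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
                err = virtblk_probe_zoned_device(vdev, vblk, q);
                if (err)
                        goto out_cleanup_disk;
        }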
> +
>  /* return id (s/n) string for *disk to *id_str
>   */
>  static int virtblk_get_id(struct gendisk *disk, char *id_str)
> @@ -462,18 +845,24 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
>          struct virtio_blk *vblk = disk->private_data;
>          struct request_queue *q = vblk->disk->queue;
>          struct request *req;
> +        struct virtblk_req *vbr;
>          int err;
>
>          req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
>          if (IS_ERR(req))
>                  return PTR_ERR(req);
>
> +        vbr = blk_mq_rq_to_pdu(req);
> +        vbr->in_hdr_len = sizeof(vbr->in_hdr.common.status);
> +        vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
> +        vbr->out_hdr.sector = 0;
> +
>          err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
>          if (err)
>                  goto out;
>
>          blk_execute_rq(req, false);
> -        err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
> +        err = blk_status_to_errno(virtblk_result(vbr->in_hdr.common.status));
>  out:
>          blk_mq_free_request(req);
>          return err;
> @@ -524,6 +913,7 @@ static const struct block_device_operations virtblk_fops = {
>          .owner                  = THIS_MODULE,
>          .getgeo                 = virtblk_getgeo,
>          .free_disk              = virtblk_free_disk,
> +        .report_zones           = virtblk_report_zones,

Does virtblk_report_zones() need:

  mutex_lock(&vblk->vdev_mutex);

  if (!vblk->vdev) {
          ret = -ENXIO;
          goto out;
  }

  ...

  mutex_unlock(&vblk->vdev_mutex);
  return ret;

? This is necessary when vblk->vdev is accessed by code that is reachable
after virtblk_remove() has been called (e.g. PCI hot unplug). In this
case the block device still exists and userspace processes may have file
descriptors open, but the underlying virtio-blk device is gone.

I think this may be necessary since ioctl(BLKREPORTZONE) can still be
called after virtblk_remove()? See the vdev_mutex field doc comment for
more information.
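Concretely, the guard could look something like this. A sketch only,
assuming the existing function body moves into a hypothetical
__virtblk_report_zones() helper and using the vdev_mutex field that the
driver already has:

  static int virtblk_report_zones(struct gendisk *disk, sector_t sector,
                                  unsigned int nr_zones,
                                  report_zones_cb cb, void *data)
  {
          struct virtio_blk *vblk = disk->private_data;
          int ret = -ENXIO;

          mutex_lock(&vblk->vdev_mutex);
          /* vblk->vdev is NULL after virtblk_remove(), e.g. on PCI
           * hot unplug with the block device still open.
           */
          if (vblk->vdev)
                  ret = __virtblk_report_zones(vblk, sector, nr_zones,
                                               cb, data);
          mutex_unlock(&vblk->vdev_mutex);

          return ret;
  }

Stefan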