On 2020/06/29 8:01, Matias Bjorling wrote: > The NVMe Zoned Namespace Command Set adds support for associating > data to a zone through the Zone Descriptor Extension feature. > > To allow user-space to associate data to a zone, add support through > the BLKSETDESCZONE ioctl. The ioctl requires that it is issued to > a zoned block device, and that it supports the Zone Descriptor > Extension feature. Support is detected through the > the zone_desc_ext_bytes sysfs queue entry for the specific block > device. A value larger than zero communicates that the device supports > the feature. > > The ioctl associates data to a zone by issuing a Zone Management Send > command with the Zone Send Action set as the Set Zone Descriptor > Extension. > > For the command to complete successfully, the specified zone must be > in the Empty state, and active resources must be available. On > success, the specified zone is transioned to Closed state by the > device. If less data is supplied by user-space then reported by the > the Zone Descriptor Extension size, the rest is zero-filled. If more > data or no data is supplied by user-space, the ioctl fails. > > To issue the ioctl, a new blk_zone_set_desc data structure is defined. > It has following parameters: > > * the sector of the specific zone. > * the length of the data to be associated to the zone. > * any flags be used by the ioctl. None is defined. > * data associated to the zone. > > The data is laid out after the flags parameter, and it is the caller's > responsibility to allocate memory for the data that is specified in the > length parameter. > > Signed-off-by: Matias Bjørling <matias.bjorling@xxxxxxx> > --- > block/blk-zoned.c | 108 ++++++++++++++++++++++++++++++++++ > block/ioctl.c | 2 + > drivers/nvme/host/core.c | 3 + > drivers/nvme/host/nvme.h | 9 +++ > drivers/nvme/host/zns.c | 11 ++++ > include/linux/blk_types.h | 2 + > include/linux/blkdev.h | 9 ++- > include/uapi/linux/blkzoned.h | 20 ++++++- > 8 files changed, 162 insertions(+), 2 deletions(-) > > diff --git a/block/blk-zoned.c b/block/blk-zoned.c > index 81152a260354..4dc40ec006a2 100644 > --- a/block/blk-zoned.c > +++ b/block/blk-zoned.c > @@ -259,6 +259,50 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, > } > EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); > > +/** > + * blkdev_zone_set_desc - Execute a zone management set zone descriptor > + * extension operation on a zone > + * @bdev: Target block device > + * @sector: Start sector of the zone to operate on > + * @data: Pointer to the data that is to be associated to the zone > + * @gfp_mask: Memory allocation flags (for bio_alloc) > + * > + * Description: > + * Associate zone descriptor extension data to a specified zone. > + * The block device must support zone descriptor extensions. > + * i.e., by exposing a positive zone descriptor extension size. > + */ > +int blkdev_zone_set_desc(struct block_device *bdev, sector_t sector, > + struct page *data, gfp_t gfp_mask) struct page for the data ? Why not just a "void *" to allow for kmalloc/vmalloc data ? And no length for the data ? This is a bit odd. > +{ > + struct request_queue *q = bdev_get_queue(bdev); > + sector_t zone_sectors = blk_queue_zone_sectors(q); > + struct bio_vec bio_vec; > + struct bio bio; > + > + if (!blk_queue_is_zoned(q)) > + return -EOPNOTSUPP; > + > + if (bdev_read_only(bdev)) > + return -EPERM; You are not checking the zone_desc_ext_bytes limit here. You should. > + > + /* Check alignment (handle eventual smaller last zone) */ > + if (sector & (zone_sectors - 1)) > + return -EINVAL; The comment is incorrect. There is nothing special for handling the last zone in this test. > + > + bio_init(&bio, &bio_vec, 1); > + bio.bi_opf = REQ_OP_ZONE_SET_DESC | REQ_SYNC; > + bio.bi_iter.bi_sector = sector; > + bio_set_dev(&bio, bdev); > + bio_add_page(&bio, data, queue_zone_desc_ext_bytes(q), 0); > + > + /* This may take a while, so be nice to others */ > + cond_resched(); This is not a loop, so you do not need this. > + > + return submit_bio_wait(&bio); > +} > +EXPORT_SYMBOL_GPL(blkdev_zone_set_desc); > + > struct zone_report_args { > struct blk_zone __user *zones; > }; > @@ -370,6 +414,70 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, > GFP_KERNEL); > } > > +/* > + * BLKSETDESCZONE ioctl processing. > + * Called from blkdev_ioctl. > + */ > +int blkdev_zone_set_desc_ioctl(struct block_device *bdev, fmode_t mode, > + unsigned int cmd, unsigned long arg) > +{ > + void __user *argp = (void __user *)arg; > + struct request_queue *q; > + struct blk_zone_set_desc zsd; > + void *zsd_data; > + int ret; > + > + if (!argp) > + return -EINVAL; > + > + q = bdev_get_queue(bdev); > + if (!q) > + return -ENXIO; > + > + if (!blk_queue_is_zoned(q)) > + return -ENOTTY; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EACCES; > + > + if (!(mode & FMODE_WRITE)) > + return -EBADF; > + > + if (!queue_zone_desc_ext_bytes(q)) > + return -EOPNOTSUPP; > + > + if (copy_from_user(&zsd, argp, sizeof(struct blk_zone_set_desc))) > + return -EFAULT; > + > + /* no flags is currently supported */ > + if (zsd.flags) > + return -ENOTTY; > + > + if (!zsd.len || zsd.len > queue_zone_desc_ext_bytes(q)) > + return -ENOTTY; This should go into blkdev_zone_set_desc() as well so that in-kernel users are checked. So there may be no need to check this here. > + > + /* allocate the size of the zone descriptor extension and fill > + * with the data in the user data buffer. If the data size is less > + * than the zone descriptor extension size, then the rest of the > + * zone description extension data buffer is zero-filled. > + */ > + zsd_data = (void *) get_zeroed_page(GFP_KERNEL); > + if (!zsd_data) > + return -ENOMEM; > + > + if (copy_from_user(zsd_data, argp + sizeof(struct blk_zone_set_desc), > + zsd.len)) { > + ret = -EFAULT; > + goto free; > + } > + > + ret = blkdev_zone_set_desc(bdev, zsd.sector, virt_to_page(zsd_data), > + GFP_KERNEL); > +free: > + free_page((unsigned long) zsd_data); > + return ret; > +} > + > static inline unsigned long *blk_alloc_zone_bitmap(int node, > unsigned int nr_zones) > { > diff --git a/block/ioctl.c b/block/ioctl.c > index bdb3bbb253d9..b9744705835b 100644 > --- a/block/ioctl.c > +++ b/block/ioctl.c > @@ -515,6 +515,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, > case BLKCLOSEZONE: > case BLKFINISHZONE: > return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); > + case BLKSETDESCZONE: > + return blkdev_zone_set_desc_ioctl(bdev, mode, cmd, arg); > case BLKGETZONESZ: > return put_uint(argp, bdev_zone_sectors(bdev)); > case BLKGETNRZONES: > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index e961910da4ac..b8f25b0d00ad 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -776,6 +776,9 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, > case REQ_OP_ZONE_FINISH: > ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); > break; > + case REQ_OP_ZONE_SET_DESC: > + ret = nvme_setup_zone_set_desc(ns, req, cmd); > + break; > case REQ_OP_WRITE_ZEROES: > ret = nvme_setup_write_zeroes(ns, req, cmd); > break; > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h > index 662f95fbd909..5bd5a437b038 100644 > --- a/drivers/nvme/host/nvme.h > +++ b/drivers/nvme/host/nvme.h > @@ -708,6 +708,9 @@ int nvme_report_zones(struct gendisk *disk, sector_t sector, > blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, > struct nvme_command *cmnd, > enum nvme_zone_mgmt_action action); > + > +blk_status_t nvme_setup_zone_set_desc(struct nvme_ns *ns, struct request *req, > + struct nvme_command *cmnd); > #else > #define nvme_report_zones NULL > > @@ -718,6 +721,12 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, > return BLK_STS_NOTSUPP; > } > > +static inline blk_status_t nvme_setup_zone_set_desc(struct nvme_ns *ns, > + struct request *req, struct nvme_command *cmnd) > +{ > + return BLK_STS_NOTSUPP; > +} > + > static inline int nvme_update_zone_info(struct gendisk *disk, > struct nvme_ns *ns, > unsigned lbaf) > diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c > index 5792d953a8f3..bfa64cc685d3 100644 > --- a/drivers/nvme/host/zns.c > +++ b/drivers/nvme/host/zns.c > @@ -239,3 +239,14 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, > > return BLK_STS_OK; > } > + > +blk_status_t nvme_setup_zone_set_desc(struct nvme_ns *ns, struct request *req, > + struct nvme_command *c) > +{ > + c->zms.opcode = nvme_cmd_zone_mgmt_send; > + c->zms.nsid = cpu_to_le32(ns->head->ns_id); > + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); > + c->zms.action = NVME_ZONE_SET_DESC_EXT; > + > + return BLK_STS_OK; > +} > diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h > index ccb895f911b1..53b7b05b0004 100644 > --- a/include/linux/blk_types.h > +++ b/include/linux/blk_types.h > @@ -316,6 +316,8 @@ enum req_opf { > REQ_OP_ZONE_FINISH = 12, > /* write data at the current zone write pointer */ > REQ_OP_ZONE_APPEND = 13, > + /* associate zone desc extension data to a zone */ > + REQ_OP_ZONE_SET_DESC = 14, > > /* SCSI passthrough using struct scsi_request */ > REQ_OP_SCSI_IN = 32, > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index 2ed55055f68d..c5f092dd5aa3 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -370,7 +370,8 @@ extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, > unsigned int cmd, unsigned long arg); > extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, > unsigned int cmd, unsigned long arg); > - > +extern int blkdev_zone_set_desc_ioctl(struct block_device *bdev, fmode_t mode, > + unsigned int cmd, unsigned long arg); > #else /* CONFIG_BLK_DEV_ZONED */ > > static inline unsigned int blkdev_nr_zones(struct gendisk *disk) > @@ -392,6 +393,12 @@ static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, > return -ENOTTY; > } > > +static inline int blkdev_zone_set_desc_ioctl(struct block_device *bdev, > + fmode_t mode, unsigned int cmd, > + unsigned long arg) > +{ > + return -ENOTTY; > +} > #endif /* CONFIG_BLK_DEV_ZONED */ > > struct request_queue { > diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h > index 42c3366cc25f..68abda9abf33 100644 > --- a/include/uapi/linux/blkzoned.h > +++ b/include/uapi/linux/blkzoned.h > @@ -142,6 +142,20 @@ struct blk_zone_range { > __u64 nr_sectors; > }; > > +/** > + * struct blk_zone_set_desc - BLKSETDESCZONE ioctl requests > + * @sector: Starting sector of the zone to operate on. > + * @flags: Feature flags. > + * @len: size, in bytes, of the data to be associated to the zone. > + * @data: data to be associated. > + */ > +struct blk_zone_set_desc { > + __u64 sector; > + __u32 flags; > + __u32 len; > + __u8 data[0]; > +}; > + > /** > * Zoned block device ioctl's: > * > @@ -158,6 +172,10 @@ struct blk_zone_range { > * The 512 B sector range must be zone aligned. > * @BLKFINISHZONE: Mark the zones as full in the specified sector range. > * The 512 B sector range must be zone aligned. > + * @BLKSETDESCZONE: Set zone description extension data for the zone > + * in the specified sector. On success, the zone > + * will transition to the closed zone state. > + * The 512B sector must be zone aligned. > */ > #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) > #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) > @@ -166,5 +184,5 @@ struct blk_zone_range { > #define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) > #define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) > #define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) > - > +#define BLKSETDESCZONE _IOW(0x12, 137, struct blk_zone_set_desc) > #endif /* _UAPI_BLKZONED_H */ > How to you retreive an extended descriptor that was set ? I do not see any code doing that. Report zones is not changed, but I would leave that one as is and implement a BLKGETDESCZONE ioctl & in-kernel API. -- Damien Le Moal Western Digital Research