On 2020/12/10 15:27, Chaitanya Kulkarni wrote: > NVMe TP 4053 – Zoned Namespaces (ZNS) allows host software to > communicate with a non-volatile memory subsystem using zones for > NVMe protocol based controllers. NVMeOF already support the ZNS NVMe > Protocol compliant devices on the target in the passthru mode. There > are Generic Zoned Block Devices like Shingled Magnetic Recording (SMR) Please remove "Generic" from this sentence. It is confusing. Also, I do not think you need to capitalize ZOned Block Devices. That is something Linux defines. It is not defined by any standard. > HDD which are not based on the NVMe protocol. > > This patch adds ZNS backend to support the ZBDs for NVMeOF target. > > This support inculdes implementing the new command set NVME_CSI_ZNS, s/inculdes/includes > adding different command handlers for ZNS command set such as > NVMe Identify Controller, NVMe Identify Namespace, NVMe Zone Append, > NVMe Zone Management Send and NVMe Zone Management Receive. > > With new command set identifier we also update the target command effects > logs to reflect the ZNS compliant commands. > > Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@xxxxxxx> > --- > drivers/nvme/target/Makefile | 1 + > drivers/nvme/target/admin-cmd.c | 26 +++ > drivers/nvme/target/core.c | 1 + > drivers/nvme/target/io-cmd-bdev.c | 33 ++- > drivers/nvme/target/nvmet.h | 38 ++++ > drivers/nvme/target/zns.c | 327 ++++++++++++++++++++++++++++++ > 6 files changed, 418 insertions(+), 8 deletions(-) > create mode 100644 drivers/nvme/target/zns.c > > diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile > index ebf91fc4c72e..9837e580fa7e 100644 > --- a/drivers/nvme/target/Makefile > +++ b/drivers/nvme/target/Makefile > @@ -12,6 +12,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o > nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ > discovery.o io-cmd-file.o io-cmd-bdev.o > nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o > +nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o > nvme-loop-y += loop.o > nvmet-rdma-y += rdma.o > nvmet-fc-y += fc.o > diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c > index f4c0f3aca485..6f5279b15aa6 100644 > --- a/drivers/nvme/target/admin-cmd.c > +++ b/drivers/nvme/target/admin-cmd.c > @@ -192,6 +192,15 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) > log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); > log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); > break; > + case NVME_CSI_ZNS: > + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { > + u32 *iocs = log->iocs; > + > + iocs[nvme_cmd_zone_append] = cpu_to_le32(1 << 0); > + iocs[nvme_cmd_zone_mgmt_send] = cpu_to_le32(1 << 0); > + iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(1 << 0); > + } > + break; > default: > status = NVME_SC_INVALID_LOG_PAGE; > break; > @@ -614,6 +623,7 @@ static u16 nvmet_copy_ns_identifier(struct nvmet_req *req, u8 type, u8 len, > > static void nvmet_execute_identify_desclist(struct nvmet_req *req) > { > + u16 nvme_cis_zns = NVME_CSI_ZNS; > u16 status = 0; > off_t off = 0; > > @@ -638,6 +648,14 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req) > if (status) > goto out; > } > + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { > + if (req->ns->csi == NVME_CSI_ZNS) > + status = nvmet_copy_ns_identifier(req, NVME_NIDT_CSI, > + NVME_NIDT_CSI_LEN, > + &nvme_cis_zns, &off); > + if (status) > + goto out; > + } > > if (sg_zero_buffer(req->sg, req->sg_cnt, NVME_IDENTIFY_DATA_SIZE - off, > off) != NVME_IDENTIFY_DATA_SIZE - off) > @@ -655,8 +673,16 @@ static void nvmet_execute_identify(struct nvmet_req *req) > switch (req->cmd->identify.cns) { > case NVME_ID_CNS_NS: > return nvmet_execute_identify_ns(req); > + case NVME_ID_CNS_CS_NS: > + if (req->cmd->identify.csi == NVME_CSI_ZNS) > + return nvmet_execute_identify_cns_cs_ns(req); > + break; > case NVME_ID_CNS_CTRL: > return nvmet_execute_identify_ctrl(req); > + case NVME_ID_CNS_CS_CTRL: > + if (req->cmd->identify.csi == NVME_CSI_ZNS) > + return nvmet_execute_identify_cns_cs_ctrl(req); > + break; > case NVME_ID_CNS_NS_ACTIVE_LIST: > return nvmet_execute_identify_nslist(req); > case NVME_ID_CNS_NS_DESC_LIST: > diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c > index 672e4009f8d6..17a99c7134dc 100644 > --- a/drivers/nvme/target/core.c > +++ b/drivers/nvme/target/core.c > @@ -1107,6 +1107,7 @@ static inline u8 nvmet_cc_iocqes(u32 cc) > static inline bool nvmet_cc_css_check(u8 cc_css) > { > switch (cc_css <<= NVME_CC_CSS_SHIFT) { > + case NVME_CC_CSS_CSI: > case NVME_CC_CSS_NVM: > return true; > default: > diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c > index 23095bdfce06..6178ef643962 100644 > --- a/drivers/nvme/target/io-cmd-bdev.c > +++ b/drivers/nvme/target/io-cmd-bdev.c > @@ -63,6 +63,14 @@ static void nvmet_bdev_ns_enable_integrity(struct nvmet_ns *ns) > } > } > > +void nvmet_bdev_ns_disable(struct nvmet_ns *ns) > +{ > + if (ns->bdev) { > + blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ); > + ns->bdev = NULL; > + } > +} > + > int nvmet_bdev_ns_enable(struct nvmet_ns *ns) > { > int ret; > @@ -86,15 +94,15 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) > if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY_T10)) > nvmet_bdev_ns_enable_integrity(ns); > > - return 0; > -} > - > -void nvmet_bdev_ns_disable(struct nvmet_ns *ns) > -{ > - if (ns->bdev) { > - blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ); > - ns->bdev = NULL; > + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && bdev_is_zoned(ns->bdev)) { > + if (!nvmet_bdev_zns_enable(ns)) { > + nvmet_bdev_ns_disable(ns); > + return -EINVAL; > + } > + ns->csi = NVME_CSI_ZNS; > } > + > + return 0; > } > > void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns) > @@ -448,6 +456,15 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) > case nvme_cmd_write_zeroes: > req->execute = nvmet_bdev_execute_write_zeroes; > return 0; > + case nvme_cmd_zone_append: > + req->execute = nvmet_bdev_execute_zone_append; > + return 0; > + case nvme_cmd_zone_mgmt_recv: > + req->execute = nvmet_bdev_execute_zone_mgmt_recv; > + return 0; > + case nvme_cmd_zone_mgmt_send: > + req->execute = nvmet_bdev_execute_zone_mgmt_send; > + return 0; > default: > pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, > req->sq->qid); > diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h > index 0360594abd93..dae6ecba6780 100644 > --- a/drivers/nvme/target/nvmet.h > +++ b/drivers/nvme/target/nvmet.h > @@ -252,6 +252,10 @@ struct nvmet_subsys { > unsigned int admin_timeout; > unsigned int io_timeout; > #endif /* CONFIG_NVME_TARGET_PASSTHRU */ > + > +#ifdef CONFIG_BLK_DEV_ZONED > + u8 zasl; > +#endif /* CONFIG_BLK_DEV_ZONED */ > }; > > static inline struct nvmet_subsys *to_subsys(struct config_item *item) > @@ -614,4 +618,38 @@ static inline sector_t nvmet_lba_to_sect(struct nvmet_ns *ns, __le64 lba) > return le64_to_cpu(lba) << (ns->blksize_shift - SECTOR_SHIFT); > } > > +#ifdef CONFIG_BLK_DEV_ZONED > +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns); > +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req); > +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req); > +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req); > +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req); > +void nvmet_bdev_execute_zone_append(struct nvmet_req *req); > +#else /* CONFIG_BLK_DEV_ZONED */ > +static inline bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) > +{ > + return false; > +} > +static inline void > +nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req) > +{ > +} > +static inline void > +nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) > +{ > +} > +static inline void > +nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req) > +{ > +} > +static inline void > +nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req) > +{ > +} > +static inline void > +nvmet_bdev_execute_zone_append(struct nvmet_req *req) > +{ > +} > +#endif /* CONFIG_BLK_DEV_ZONED */ > + > #endif /* _NVMET_H */ > diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c > new file mode 100644 > index 000000000000..ae51bae996f9 > --- /dev/null > +++ b/drivers/nvme/target/zns.c > @@ -0,0 +1,327 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * NVMe ZNS-ZBD command implementation. > + * Copyright (c) 2020-2021 HGST, a Western Digital Company. > + */ > +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt > +#include <linux/uio.h> > +#include <linux/nvme.h> > +#include <linux/xarray.h> > +#include <linux/blkdev.h> > +#include <linux/module.h> > +#include "nvmet.h" > + > +/* > + * We set the Memory Page Size Minimum (MPSMIN) for target controller to 0 > + * which gets added by 12 in the nvme_enable_ctrl() which results in 2^12 = 4k > + * as page_shift value. When calculating the ZASL use shift by 12. > + */ > +#define NVMET_MPSMIN_SHIFT 12 > + > +static u16 nvmet_bdev_zns_checks(struct nvmet_req *req) > +{ > + u16 status = 0; > + > + if (!bdev_is_zoned(req->ns->bdev)) { > + status = NVME_SC_INVALID_NS | NVME_SC_DNR; > + goto out; > + } > + > + if (req->cmd->zmr.zra != NVME_ZRA_ZONE_REPORT) { > + status = NVME_SC_INVALID_FIELD; > + goto out; > + } > + > + if (req->cmd->zmr.zrasf != NVME_ZRASF_ZONE_REPORT_ALL) { > + status = NVME_SC_INVALID_FIELD; > + goto out; > + } > + > + if (req->cmd->zmr.pr != NVME_REPORT_ZONE_PARTIAL) > + status = NVME_SC_INVALID_FIELD; > + > +out: > + return status; > +} I really fail to see how the gotos here help with the code "being better". Are you absolutely sure you want to keep this super convoluted style for a function that is that simple ? > + > +/* > + * ZNS related command implementation and helpers. > + */ > + > +static inline u8 nvmet_zasl(unsigned int zone_append_sects) > +{ > + /* > + * Zone Append Size Limit is the value experessed in the units > + * of minimum memory page size (i.e. 12) and is reported power of 2. > + */ > + return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT); > +} > + > +static inline bool nvmet_zns_update_zasl(struct nvmet_ns *ns) > +{ > + struct request_queue *q = ns->bdev->bd_disk->queue; > + u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q)); > + > + if (ns->subsys->zasl) > + return ns->subsys->zasl < zasl ? false : true; > + > + ns->subsys->zasl = zasl; > + return true; > +} > + > +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) > +{ > + if (ns->bdev->bd_disk->queue->conv_zones_bitmap) { Hmm... BIO based DM devices do not have this bitmap set since they do not have a scheduler. So if one setup a dm-linear device on top of an SMR disk and export the DM device through fabric, then this check will fail to verify if conventional zones are present. There may be no other option than to do a full report zones here if queue->seq_zones_wlock is NULL (meaning the queue is for a stacked device). > + pr_err("block devices with conventional zones are not supported."); > + return false; > + } > + > + /* > + * For ZBC and ZAC devices, writes into sequential zones must be aligned > + * to the device physical block size. So use this value as the logical > + * block size to avoid errors. > + */ > + ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev)); > + > + if (!nvmet_zns_update_zasl(ns)) > + return false; > + > + return !(get_capacity(ns->bdev->bd_disk) & > + (bdev_zone_sectors(ns->bdev) - 1)); > +} > + > +/* > + * ZNS related Admin and I/O command handlers. > + */ > +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req) > +{ > + u8 zasl = req->sq->ctrl->subsys->zasl; > + struct nvmet_ctrl *ctrl = req->sq->ctrl; > + struct nvme_id_ctrl_zns *id; > + u16 status; > + > + id = kzalloc(sizeof(*id), GFP_KERNEL); > + if (!id) { > + status = NVME_SC_INTERNAL; > + goto out; > + } > + > + if (ctrl->ops->get_mdts) > + id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl); > + else > + id->zasl = zasl; > + > + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); > + > + kfree(id); > +out: > + nvmet_req_complete(req, status); > +} > + > +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req) > +{ > + struct nvme_id_ns_zns *id_zns; > + u16 status = 0; > + u64 zsze; > + > + if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) { > + req->error_loc = offsetof(struct nvme_identify, nsid); > + status = NVME_SC_INVALID_NS | NVME_SC_DNR; > + goto out; > + } > + > + id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL); > + if (!id_zns) { > + status = NVME_SC_INTERNAL; > + goto out; > + } > + > + req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); > + if (!req->ns) { > + status = NVME_SC_INTERNAL; > + goto done; > + } > + > + if (!bdev_is_zoned(req->ns->bdev)) { > + req->error_loc = offsetof(struct nvme_identify, nsid); > + status = NVME_SC_INVALID_NS | NVME_SC_DNR; > + goto done; > + } > + > + nvmet_ns_revalidate(req->ns); > + zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >> > + req->ns->blksize_shift; > + id_zns->lbafe[0].zsze = cpu_to_le64(zsze); > + id_zns->mor = cpu_to_le32(bdev_max_open_zones(req->ns->bdev)); > + id_zns->mar = cpu_to_le32(bdev_max_active_zones(req->ns->bdev)); > + > +done: > + status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns)); > + kfree(id_zns); > +out: > + nvmet_req_complete(req, status); > +} > + > +struct nvmet_report_zone_data { > + struct nvmet_ns *ns; > + struct nvme_zone_report *rz; > +}; > + > +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx, > + void *data) > +{ > + struct nvmet_report_zone_data *report_zone_data = data; > + struct nvme_zone_descriptor *entries = report_zone_data->rz->entries; > + struct nvmet_ns *ns = report_zone_data->ns; > + > + entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity)); > + entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start)); > + entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp)); > + entries[idx].za = z->reset ? 1 << 2 : 0; > + entries[idx].zt = z->type; > + entries[idx].zs = z->cond << 4; > + > + return 0; > +} > + > +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req) > +{ > + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); > + u64 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; > + struct nvmet_report_zone_data data = { .ns = req->ns }; > + unsigned int nr_zones; > + int reported_zones; > + u16 status; > + > + nr_zones = (bufsize - sizeof(struct nvme_zone_report)) / > + sizeof(struct nvme_zone_descriptor); This is a 64 bits division. This will not compile on 32-bits arch, no ? I.e., you must use do_div64() here I think. Or just a bit shift since struct nvme_zone_descriptor is exactly 64B. Or have bufsize be a 32 bits. There is no need for that variable to be 64 bits. > + > + status = nvmet_bdev_zns_checks(req); > + if (status) > + goto out; > + > + data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); > + if (!data.rz) { > + status = NVME_SC_INTERNAL; > + goto out; > + } > + > + reported_zones = blkdev_report_zones(req->ns->bdev, sect, nr_zones, > + nvmet_bdev_report_zone_cb, > + &data); > + if (reported_zones < 0) { > + status = NVME_SC_INTERNAL; > + goto out_free_report_zones; > + } > + > + data.rz->nr_zones = cpu_to_le64(reported_zones); > + > + status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize); reported_zoned may be far less than the request nr_zones. So you may want to recalculate bufsize before doing this to not copy garbage data into the reply buffer. Another thing that needs verification is: don't you need to zero-oput the data.rz buffer with GFP_ZERO, for security ? > + > +out_free_report_zones: > + kvfree(data.rz); > +out: > + nvmet_req_complete(req, status); > +} > + > +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req) > +{ > + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba); > + sector_t nr_sect = bdev_zone_sectors(req->ns->bdev); > + enum req_opf op = REQ_OP_LAST; > + u16 status = NVME_SC_SUCCESS; > + int ret; > + > + if (req->cmd->zms.select_all) > + nr_sect = get_capacity(req->ns->bdev->bd_disk); > + > + switch (req->cmd->zms.zsa) { > + case NVME_ZONE_OPEN: > + op = REQ_OP_ZONE_OPEN; > + break; > + case NVME_ZONE_CLOSE: > + op = REQ_OP_ZONE_CLOSE; > + break; > + case NVME_ZONE_FINISH: > + op = REQ_OP_ZONE_FINISH; > + break; > + case NVME_ZONE_RESET: > + op = REQ_OP_ZONE_RESET; > + break; > + default: > + status = NVME_SC_INVALID_FIELD; > + goto out; > + } > + > + ret = blkdev_zone_mgmt(req->ns->bdev, op, sect, nr_sect, GFP_KERNEL); > + if (ret) > + status = NVME_SC_INTERNAL; > +out: > + nvmet_req_complete(req, status); > +} > + > +void nvmet_bdev_execute_zone_append(struct nvmet_req *req) > +{ > + sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba); > + struct request_queue *q = req->ns->bdev->bd_disk->queue; > + unsigned int max_sects = queue_max_zone_append_sectors(q); > + u16 status = NVME_SC_SUCCESS; > + unsigned int total_len = 0; > + struct scatterlist *sg; > + int ret = 0, sg_cnt; > + struct bio *bio; > + > + if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req))) > + return; > + > + if (!req->sg_cnt) { > + nvmet_req_complete(req, 0); > + return; > + } > + > + if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) { > + bio = &req->b.inline_bio; > + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); > + } else { > + bio = bio_alloc(GFP_KERNEL, req->sg_cnt); > + } > + > + bio_set_dev(bio, req->ns->bdev); > + bio->bi_iter.bi_sector = sect; > + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; > + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) > + bio->bi_opf |= REQ_FUA; > + > + for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) { > + struct page *p = sg_page(sg); > + unsigned int l = sg->length; > + unsigned int o = sg->offset; > + bool same_page = false; > + > + ret = bio_add_hw_page(q, bio, p, l, o, max_sects, &same_page); > + if (ret != sg->length) { > + status = NVME_SC_INTERNAL; > + goto out_bio_put; > + } > + if (same_page) > + put_page(p); > + > + total_len += sg->length; > + } > + > + if (total_len != nvmet_rw_data_len(req)) { > + status = NVME_SC_INTERNAL | NVME_SC_DNR; > + goto out_bio_put; > + } > + > + ret = submit_bio_wait(bio); > + status = ret < 0 ? NVME_SC_INTERNAL : status; > + > + sect += (total_len >> 9); This is incorrect: sect is the start sector of the append operation target zone. Adding to it the request size does not give the actual sector written on the backend. You need to get that from the completed BIO. And do not add the request size. The result is the first written sector, not the sector following the last one written. > + req->cqe->result.u64 = cpu_to_le64(nvmet_sect_to_lba(req->ns, sect)); > + > +out_bio_put: > + if (bio != &req->b.inline_bio) > + bio_put(bio); > + nvmet_req_complete(req, status); > +} > -- Damien Le Moal Western Digital Research