On 11/30/20 21:44, Damien Le Moal wrote:
> On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
> [...]
>>>> +         * namespaces.
>>>> +         */
>>>> +        min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>>> +
>>>> +        xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>>> +                struct request_queue *iq = ins->bdev->bd_disk->queue;
>>>> +                unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>>> +                u8 izasl = nvmet_zasl(imax_za_sects);
>>>> +
>>>> +                if (!bdev_is_zoned(ins->bdev))
>>>> +                        continue;
>>>> +
>>>> +                min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>>> +        }
>>>> +
>>>> +        ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>> I do not understand what bio_max_zasl is used for here, as it does not represent
>>> anything related to the zoned namespaces backends.
>> If we don't consider the bio max zasl then we will get a request larger than
>> one bio can handle for zone append, which will lead to a bio split that needs
>> to be avoided.
> Why ? zasl comes from the target backend max zone append sectors, which we
> already know is OK and does not lead to BIO splitting for zone append. So why do
> you need an extra verification against that bio_max_zasl ?
>
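For anyone following along: ZASL is reported as a power of two in units of the
minimum memory page size (CAP.MPSMIN), and a value of 0 means the regular MDTS
limit applies instead. Below is a minimal userspace sketch of that unit
conversion and of the min-over-namespaces selection; it assumes a 4K MPSMIN,
and the sectors_to_zasl() helper is illustrative only, not the nvmet_zasl()
helper from this patch.

/* Illustrative only: not the nvmet_zasl() from this patch. */
#include <stdio.h>

#define SECTOR_SHIFT    9
#define MPSMIN_BYTES    4096    /* assumed minimum memory page size */

static unsigned char sectors_to_zasl(unsigned int max_append_sectors)
{
        unsigned int pages = (max_append_sectors << SECTOR_SHIFT) / MPSMIN_BYTES;
        unsigned char zasl = 0;

        /* ZASL is a power of two: largest 2^zasl that fits in the limit */
        while (pages > 1) {
                pages >>= 1;
                zasl++;
        }
        return zasl;
}

int main(void)
{
        unsigned int qlim[] = { 65536, 1024, 256 };     /* per-ns limits, sectors */
        unsigned char min_zasl = 0xff;
        int i;

        for (i = 0; i < 3; i++) {
                unsigned char z = sectors_to_zasl(qlim[i]);

                if (z < min_zasl)
                        min_zasl = z;
        }
        /* on the host side, 0 would mean "no ZASL, MDTS applies" */
        printf("zasl exponent %u -> max zone append %u bytes\n",
               (unsigned int)min_zasl, (1u << min_zasl) * MPSMIN_BYTES);
        return 0;
}

With the example limits above, the smallest namespace (256 sectors, i.e. 128K)
wins, so the reported exponent is 5.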
>>>> +}
>>>> +
>>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>>> +{
>>>> +        if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>>> +                pr_err("block devices with conventional zones are not supported.");
>>>> +                return false;
>>>> +        }
>>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>>> all backend zone configuration checks are together.
>> Okay.
>>>> +
>>>> +        if (!nvmet_bdev_validate_zns_zones(ns))
>>>> +                return false;
>>>> +
>>>> +        /*
>>>> +         * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>>> +         * to the device physical block size. So use this value as the logical
>>>> +         * block size to avoid errors.
>>>> +         */
>>>> +        ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>>> +
>>>> +        nvmet_zns_update_zasl(ns);
>>>> +
>>>> +        return true;
>>>> +}
>>>> +
>>>> +/*
>>>> + * ZNS related Admin and I/O command handlers.
>>>> + */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +        struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>>> +        struct nvme_id_ctrl_zns *id;
>>>> +        u16 status = 0;
>>>> +        u8 mdts;
>>>> +
>>>> +        id = kzalloc(sizeof(*id), GFP_KERNEL);
>>>> +        if (!id) {
>>>> +                status = NVME_SC_INTERNAL;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        /*
>>>> +         * Even though this function sets Zone Append Size Limit to 0,
>>>> +         * the 0 value here indicates that the maximum data transfer size for
>>>> +         * the Zone Append command is indicated by the ctrl
>>>> +         * Maximum Data Transfer Size (MDTS).
>>>> +         */
>>> I do not understand this comment. It does not really exactly match what the code
>>> below is doing.
>> Let me fix the code and the comment in the next version.
>>>> +
>>>> +        mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>>> +
>>>> +        id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>>> +
>>>> +        status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>>> +
>>>> +        kfree(id);
>>>> +out:
>>>> +        nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +        struct nvme_id_ns_zns *id_zns;
>>>> +        u16 status = 0;
>>>> +        u64 zsze;
>>>> +
>>>> +        if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>>> +                req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +                status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>>> +        if (!id_zns) {
>>>> +                status = NVME_SC_INTERNAL;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>>> +        if (!req->ns) {
>>>> +                status = NVME_SC_INTERNAL;
>>>> +                goto done;
>>>> +        }
>>>> +
>>>> +        if (!bdev_is_zoned(nvmet_bdev(req))) {
>>>> +                req->error_loc = offsetof(struct nvme_identify, nsid);
>>>> +                status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>>> +                goto done;
>>>> +        }
>>>> +
>>>> +        nvmet_ns_revalidate(req->ns);
>>>> +        zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>>> +                req->ns->blksize_shift;
>>>> +        id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>>> +        id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>>> +        id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>>> +
>>>> +done:
>>>> +        status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>>> +        kfree(id_zns);
>>>> +out:
>>>> +        nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +struct nvmet_report_zone_data {
>>>> +        struct nvmet_ns *ns;
>>>> +        struct nvme_zone_report *rz;
>>>> +};
>>>> +
>>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>>> +                                     void *data)
>>>> +{
>>>> +        struct nvmet_report_zone_data *report_zone_data = data;
>>>> +        struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>>> +        struct nvmet_ns *ns = report_zone_data->ns;
>>>> +
>>>> +        entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>>> +        entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>>> +        entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>>> +        entries[idx].za = z->reset ? 1 << 2 : 0;
>>>> +        entries[idx].zt = z->type;
>>>> +        entries[idx].zs = z->cond << 4;
>>>> +
>>>> +        return 0;
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +        u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>>> size_t ?
>> Yes.
>>>> +        struct nvmet_report_zone_data data = { .ns = req->ns };
>>>> +        struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>>> +        sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>>> +        unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>>> nvme_zone_report). I think what you want here is:
>>>
>>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>>>                 sizeof(struct nvme_zone_descriptor);
>>>
>>> And then you can get rid of nvmet_zones_to_desc_size();
>> Yes, it doesn't need nvmet_zones_to_desc_size() anymore from v1.
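To make the sizing concrete, here is a small userspace sketch of that
calculation; the structs are local stand-ins that mirror the 64-byte report
header and 64-byte zone descriptor layout from the spec, not the kernel
definitions.

/* Stand-in structs only; not the kernel's nvme_zone_report/descriptor. */
#include <stdio.h>
#include <stdint.h>

struct zone_report_hdr {        /* mirrors struct nvme_zone_report: 64 bytes */
        uint64_t nr_zones;
        uint8_t  rsvd8[56];
};

struct zone_desc {              /* mirrors struct nvme_zone_descriptor: 64 bytes */
        uint8_t  zt, zs, za;
        uint8_t  rsvd3[5];
        uint64_t zcap, zslba, wp;
        uint8_t  rsvd32[32];
};

int main(void)
{
        uint32_t numd = 8191;                           /* 0's based dword count */
        size_t bufsize = ((size_t)numd + 1) << 2;       /* dwords -> bytes */
        size_t nr_zones;

        /* header comes first, the rest of the buffer holds zone descriptors */
        nr_zones = (bufsize - sizeof(struct zone_report_hdr)) /
                   sizeof(struct zone_desc);

        printf("bufsize %zu bytes -> header + %zu zone descriptors\n",
               bufsize, nr_zones);
        return 0;
}

For example, a 0's based NUMD of 8191 gives a 32K buffer, i.e. the report
header plus room for 510 descriptors.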
>>>> +        int reported_zones;
>>>> +        u16 status;
>>>> +
>>>> +        status = nvmet_bdev_zns_checks(req);
>>>> +        if (status)
>>>> +                goto out;
>>>> +
>>>> +        data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>>> +        if (!data.rz) {
>>>> +                status = NVME_SC_INTERNAL;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>>> +                                             nvmet_bdev_report_zone_cb,
>>>> +                                             &data);
>>>> +        if (reported_zones < 0) {
>>>> +                status = NVME_SC_INTERNAL;
>>>> +                goto out_free_report_zones;
>>>> +        }
>>>> +
>>>> +        data.rz->nr_zones = cpu_to_le64(reported_zones);
>>>> +
>>>> +        status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>>> +
>>>> +out_free_report_zones:
>>>> +        kvfree(data.rz);
>>>> +out:
>>>> +        nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +        sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>>> +        struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>>> +        enum req_opf op = REQ_OP_LAST;
>>>> +        u16 status = NVME_SC_SUCCESS;
>>>> +        sector_t sect;
>>>> +        int ret;
>>>> +
>>>> +        sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>>> +
>>>> +        switch (c->zsa) {
>>>> +        case NVME_ZONE_OPEN:
>>>> +                op = REQ_OP_ZONE_OPEN;
>>>> +                break;
>>>> +        case NVME_ZONE_CLOSE:
>>>> +                op = REQ_OP_ZONE_CLOSE;
>>>> +                break;
>>>> +        case NVME_ZONE_FINISH:
>>>> +                op = REQ_OP_ZONE_FINISH;
>>>> +                break;
>>>> +        case NVME_ZONE_RESET:
>>>> +                if (c->select_all)
>>>> +                        nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>>> +                op = REQ_OP_ZONE_RESET;
>>>> +                break;
>>>> +        default:
>>>> +                status = NVME_SC_INVALID_FIELD;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>>> +        if (ret)
>>>> +                status = NVME_SC_INTERNAL;
>>>> +
>>>> +out:
>>>> +        nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +        unsigned long bv_cnt = req->sg_cnt;
>>>> +        int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>>> +        u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>>> +        sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>>> +        u16 status = NVME_SC_SUCCESS;
>>>> +        size_t mapped_data_len = 0;
>>>> +        int sg_cnt = req->sg_cnt;
>>>> +        struct scatterlist *sg;
>>>> +        struct iov_iter from;
>>>> +        struct bio_vec *bvec;
>>>> +        size_t mapped_cnt;
>>>> +        struct bio *bio;
>>>> +        int ret;
>>>> +
>>>> +        if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>>> +                return;
>>>> +
>>>> +        /*
>>>> +         * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>>> +         * don't have to split the bio, i.e. we shouldn't get
>>>> +         * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>>> +         * with the size that considers the BIO_MAX_PAGES.
>>>> +         */
>>> What ? Unclear... You advertise max zone append as ctrl->zasl = min(all ns max
>>> zone append sectors). What does BIO_MAX_PAGES have to do with anything ?
>> That is not true, ctrl->zasl is advertised as
>> min(min(all ns max zone append sectors), bio_max_zasl) to avoid splitting
>> the bio on the target side:
> We already know that zasl for each NS, given by the max zone append sectors of
> the backends, are already OK, regardless of BIO_MAX_PAGES (see below).
>
>> See nvmet_zns_update_zasl()
>>
>> +        ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>>
>> Without considering the bio_max_pages we may have to set the ctrl->zasl value
>> so that it is > bio_max_pages_zasl, then we'll get a request that is greater
>> than what one bio can handle, which will lead to splitting the bios, which we
>> want to avoid as per the comment in the V1.
>>
>> Can you please explain what is wrong with this approach, which regulates the
>> zasl with all the possible namespaces' zasl and bio_zasl?
> For starters, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
> sense anymore. Next, on the target side, the max zone append sectors limit for
> each NS guarantees that a single BIO can be used without splitting needed. Taking
> the min of all of them will NOT remove that guarantee. Hence I think that
> BIO_MAX_PAGES has nothing to do in the middle of all this.
>

Okay, let me remove the bio size check and send v4.

>> Maybe there is a better way?
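Something like the sketch below is what I have in mind for v4, with the
bio_max_zasl clamp dropped. This is rough and untested, reuses the names from
this patch, and assumes the function signature; the per-namespace queue limit
already bounds what a single bio has to carry, so the subsystem-wide minimum
keeps that guarantee.

/* Rough, untested sketch for v4; signature and locals are assumed. */
static void nvmet_zns_update_zasl(struct nvmet_ns *ns)
{
        struct request_queue *q = ns->bdev->bd_disk->queue;
        struct nvmet_ns *ins;
        unsigned long idx;
        u8 min_zasl;

        /*
         * Set this namespace's zasl, then lower the subsystem-wide zasl to the
         * smallest value across all zoned namespaces.
         */
        min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));

        xa_for_each(&ns->subsys->namespaces, idx, ins) {
                struct request_queue *iq;

                if (!bdev_is_zoned(ins->bdev))
                        continue;

                iq = ins->bdev->bd_disk->queue;
                min_zasl = min_t(u8, min_zasl,
                                 nvmet_zasl(queue_max_zone_append_sectors(iq)));
        }

        /* no bio_max_zasl clamp: the per-ns queue limits already fit one bio */
        ns->subsys->id_ctrl_zns.zasl = min_zasl;
}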
>>>> +        if (!req->sg_cnt)
>>>> +                goto out;
>>>> +
>>>> +        if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>>> +                status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>>> +        if (!bvec) {
>>>> +                status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>>> +                nvmet_file_init_bvec(bvec, sg);
>>>> +                mapped_data_len += bvec[mapped_cnt].bv_len;
>>>> +                sg_cnt--;
>>>> +                if (mapped_cnt == bv_cnt)
>>>> +                        break;
>>>> +        }
>>>> +
>>>> +        if (WARN_ON(sg_cnt)) {
>>>> +                status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +                goto out;
>>>> +        }
>>>> +
>>>> +        iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>>> +
>>>> +        bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>>> +        bio_set_dev(bio, nvmet_bdev(req));
>>>> +        bio->bi_iter.bi_sector = sect;
>>>> +        bio->bi_opf = op;
>>> The op variable is used only here. It is not necessary.
>> Yes.
>>>> +
>>>> +        ret = __bio_iov_append_get_pages(bio, &from);
>>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>>> care to explain please ?
>>>
>> The same reason pointed out by Johannes. Instead of calling the wrapper,
>> call the underlying core API, since we don't care about the rest of the
>> generic code. This also avoids an extra function call for the zone-append
>> fast path.
> I am still not convinced that __bio_iov_append_get_pages() will do the right
> thing as it lacks the loop that bio_iov_iter_get_pages() has.
>
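For reference, the behavior being pointed at is roughly the following; this is
a simplified paraphrase of the bio_iov_iter_get_pages() structure in this
kernel (the bvec-iterator path is omitted and the exact code may differ). The
wrapper keeps calling the per-segment helper until the iterator is drained or
the bio is full, while a single __bio_iov_append_get_pages() call can return
with data still left in the iterator.

/* Paraphrased shape of bio_iov_iter_get_pages(); not the exact kernel code. */
do {
        if (bio_op(bio) == REQ_OP_ZONE_APPEND)
                ret = __bio_iov_append_get_pages(bio, iter);
        else
                ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));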
>>>> +        if (unlikely(ret)) {
>>>> +                status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>>> +                bio_io_error(bio);
>>>> +                goto bvec_free;
>>>> +        }
>>>> +
>>>> +        ret = submit_bio_wait(bio);
>>>> +        status = ret < 0 ? NVME_SC_INTERNAL : status;
>>>> +        bio_put(bio);
>>>> +
>>>> +        sect += (mapped_data_len >> 9);
>>>> +        req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>>> +
>>>> +bvec_free:
>>>> +        kfree(bvec);
>>>> +
>>>> +out:
>>>> +        nvmet_req_complete(req, status);
>>>> +}
>>>> +
>>>> +#else /* CONFIG_BLK_DEV_ZONED */
>>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>>> +{
>>>> +        return 0;
>>>> +}
>>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>>> +{
>>>> +        return false;
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>>> +{
>>>> +}
>>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>>> +{
>>>> +}
>>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>
>