On Sat, 2020-12-12 at 21:50 -0800, Chaitanya Kulkarni wrote:

[...]

> +static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
> +					     unsigned int idx, void *data)
> +{
> +	struct blk_zone *zone = data;
> +
> +	memcpy(zone, z, sizeof(struct blk_zone));

See below. This is not necessary.

> +
> +	return 0;
> +}
> +
> +static bool nvmet_bdev_has_conv_zones(struct block_device *bdev)
> +{
> +	struct blk_zone zone;
> +	int reported_zones;
> +	unsigned int zno;
> +
> +	if (bdev->bd_disk->queue->conv_zones_bitmap)
> +		return false;

Bug: if conv_zones_bitmap is set, the device does have conventional
zones, so this must return true.

> +
> +	for (zno = 0; zno < blk_queue_nr_zones(bdev->bd_disk->queue); zno++) {

Large capacity SMR drives have over 75,000 zones these days. Doing a
zone report one zone at a time will take forever. This needs to be
optimized: see below.

> +		reported_zones = blkdev_report_zones(bdev,
> +				zno * bdev_zone_sectors(bdev), 1,
> +				nvmet_bdev_validate_zns_zones_cb,
> +				&zone);
> +
> +		if (reported_zones != 1)
> +			return true;
> +
> +		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL)
> +			return true;

This test should be in the nvmet_bdev_validate_zns_zones_cb()
callback. That callback can return an error if it finds a conventional
zone, which will stop blkdev_report_zones().

> +	}
> +
> +	return false;
> +}

What about this:

static int nvmet_bdev_validate_zns_zones_cb(struct blk_zone *z,
					    unsigned int idx, void *data)
{
	if (z->type == BLK_ZONE_TYPE_CONVENTIONAL)
		return -ENOTSUPP;

	return 0;
}

static bool nvmet_bdev_has_conv_zones(struct block_device *bdev)
{
	int ret;

	if (bdev->bd_disk->queue->conv_zones_bitmap)
		return true;

	ret = blkdev_report_zones(bdev, 0,
				  blkdev_nr_zones(bdev->bd_disk),
				  nvmet_bdev_validate_zns_zones_cb, NULL);
	if (ret < 1)
		return true;

	return false;
}

All zones are checked using the callback with the loop in
blkdev_report_zones(): a single call reports every zone starting from
sector 0, and the -ENOTSUPP returned by the callback on the first
conventional zone found stops the report and is propagated back as a
negative return value.

[...]

> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
> +{
> +	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
> +	struct request_queue *q = req->ns->bdev->bd_disk->queue;
> +	unsigned int max_sects = queue_max_zone_append_sectors(q);
> +	u16 status = NVME_SC_SUCCESS;
> +	unsigned int total_len = 0;
> +	struct scatterlist *sg;
> +	int ret = 0, sg_cnt;
> +	struct bio *bio;
> +
> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
> +		return;
> +
> +	if (!req->sg_cnt) {
> +		nvmet_req_complete(req, 0);
> +		return;
> +	}
> +
> +	if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
> +		bio = &req->b.inline_bio;
> +		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
> +	} else {
> +		bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
> +	}
> +
> +	bio_set_dev(bio, req->ns->bdev);
> +	bio->bi_iter.bi_sector = sect;
> +	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
> +	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
> +		bio->bi_opf |= REQ_FUA;
> +
> +	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
> +		struct page *p = sg_page(sg);
> +		unsigned int l = sg->length;
> +		unsigned int o = sg->offset;
> +		bool same_page = false;
> +
> +		ret = bio_add_hw_page(q, bio, p, l, o, max_sects, &same_page);
> +		if (ret != sg->length) {
> +			status = NVME_SC_INTERNAL;
> +			goto out_bio_put;
> +		}
> +		if (same_page)
> +			put_page(p);
> +
> +		total_len += sg->length;
> +	}
> +
> +	if (total_len != nvmet_rw_data_len(req)) {
> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
> +		goto out_bio_put;
> +	}
> +
> +	ret = submit_bio_wait(bio);
> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
> +
> +	req->cqe->result.u64 = nvmet_sect_to_lba(req->ns,
> +						 bio->bi_iter.bi_sector);

Why set this if the BIO failed? There may be no problems doing so, but
I do not see the point either.
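
A possible rework (untested sketch, using only identifiers already in
the patch) that sets the CQE result only when the append succeeded:

	ret = submit_bio_wait(bio);
	if (ret < 0) {
		status = NVME_SC_INTERNAL;
		goto out_bio_put;
	}

	/*
	 * For a successful REQ_OP_ZONE_APPEND BIO, bi_sector holds the
	 * actual write location, so the result is meaningful only here.
	 */
	req->cqe->result.u64 = nvmet_sect_to_lba(req->ns,
						 bio->bi_iter.bi_sector);

This keeps the error and success paths clearly separated, and the
completion with the failed status still goes through out_bio_put.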
> +
> +out_bio_put:
> +	if (bio != &req->b.inline_bio)
> +		bio_put(bio);
> +	nvmet_req_complete(req, status);
> +}

-- 
Damien Le Moal
Western Digital