On 2022-03-09 05:04, Damien Le Moal wrote:
> On 3/9/22 01:53, Pankaj Raghav wrote:
> Contradiction: reducing the impact of performance regression is not the
> same as "does not create a performance regression". So which one is it ?
> Please add performance numbers to this commit message.
> And also please actually explain what the patch is changing. This commit
> message is about the why, but nothing on the how.
>
I will reword the commit log and add a bit more context, along with perf
numbers, in the next revision.

>> +EXPORT_SYMBOL_GPL(nvme_fail_po2_zone_emu_violation);
>> +
>
> This should go in zns.c, not in the core.
>
Ok.

>> +
>> +static void nvme_npo2_zone_setup(struct gendisk *disk)
>> +{
>> +	nvme_ns_po2_zone_emu_setup(disk->private_data);
>> +}
>
> This helper seems useless.
>
I tried to retain the pattern with report_zones, which currently looks
like this:

static int nvme_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb,
			data);
}

But I can just remove this helper and use nvme_ns_po2_zone_emu_setup
directly as the callback in nvme_bdev_fops.

>> +
>>  /*
>>   * Convert a 512B sector number to a device logical block number.
>>   */
>>  static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
>>  {
>> -	return sector >> (ns->lba_shift - SECTOR_SHIFT);
>> +#ifdef CONFIG_BLK_DEV_ZONED
>> +	return ns->sect_to_lba(ns, sector);
>
> So for a power of 2 zone sized device, you are forcing an indirect call,
> always. Not acceptable. What is the point of that po2_zone_emu boolean
> you added to the queue ?

This is a good point, and we had a discussion about this internally.
Initially I had something like this:

	if (!blk_queue_is_po2_zone_emu(ns->queue))
		return sector >> (ns->lba_shift - SECTOR_SHIFT);
	else
		return __nvme_sect_to_lba_po2(ns, sector);

But @Luis indicated that it was better to set an op, which comes at the
cost of an indirect call, instead of having a runtime if/else check in
the hot path. The code also looks clearer with an op.

The performance analysis we performed did not show any regression from
using the indirection for po2 zone sizes, at least on x86_64.

So maybe we can use this opportunity to discuss which approach should be
used here.

>> +
>> +		sector = nvme_lba_to_sect(ns,
>> +				le64_to_cpu(nvme_req(req)->result.u64));
>> +
>> +		sector = ns->update_sector_append(ns, sector);
>
> Why not assign that to req->__sector directly ?
> And again here, you are forcing the indirect function call for *all* zns
> devices, even those that have a power of 2 zone size.
>
Same answer as above. I will adapt these based on the outcome of that
discussion.

>> +
>> +		req->__sector = sector;
>> +	}
>> +
>> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
>> +{
>> +	return req->rq_flags & RQF_ZONE_EMU_VIOLATION;
>> +}
>
> This helper makes the code unreadable in my opinion.
>
I will open code it then.
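Open-coded, the check in nvme_queue_rq() would become something along
these lines (rough, untested sketch; it simply tests RQF_ZONE_EMU_VIOLATION
directly instead of going through the helper):

	ret = nvme_prep_rq(dev, req);

	/* I/O targets the emulated gap created by po2 zone size emulation */
	if (unlikely(req->rq_flags & RQF_ZONE_EMU_VIOLATION))
		return nvme_fail_po2_zone_emu_violation(req);

	if (unlikely(ret))
		return ret;
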
>> +#else
>> +static inline void nvme_end_req_zoned(struct request *req)
>> +{
>> +}
>> +
>> +static inline void nvme_verify_sector_value(struct nvme_ns *ns, struct request *req)
>> +{
>> +}
>> +
>> +static inline bool nvme_po2_zone_emu_violation(struct request *req)
>> +{
>> +	return false;
>> +}
>> +
>> +#endif
>> +
>>  /*
>>   * Convert byte length to nvme's 0-based num dwords
>>   */
>> @@ -752,6 +842,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
>>  long nvme_dev_ioctl(struct file *file, unsigned int cmd,
>>  		unsigned long arg);
>>  int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
>> +blk_status_t nvme_fail_po2_zone_emu_violation(struct request *req);
>>
>>  extern const struct attribute_group *nvme_ns_id_attr_groups[];
>>  extern const struct pr_ops nvme_pr_ops;
>> @@ -873,11 +964,13 @@ static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
>>  int nvme_revalidate_zones(struct nvme_ns *ns);
>>  int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  		unsigned int nr_zones, report_zones_cb cb, void *data);
>> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns);
>>  #ifdef CONFIG_BLK_DEV_ZONED
>>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
>>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  				       struct nvme_command *cmnd,
>>  				       enum nvme_zone_mgmt_action action);
>> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req);
>>  #else
>>  static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
>>  		struct request *req, struct nvme_command *cmnd,
>> @@ -892,6 +985,11 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>>  		"Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
>>  	return -EPROTONOSUPPORT;
>>  }
>> +
>> +static inline blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
>> +{
>> +	return BLK_STS_OK;
>> +}
>>  #endif
>>
>>  static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 6a99ed680915..fc022df3f98e 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -960,6 +960,10 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
>>  		return nvme_fail_nonready_command(&dev->ctrl, req);
>>
>>  	ret = nvme_prep_rq(dev, req);
>> +
>> +	if (unlikely(nvme_po2_zone_emu_violation(req)))
>> +		return nvme_fail_po2_zone_emu_violation(req);
>> +
>>  	if (unlikely(ret))
>>  		return ret;
>>  	spin_lock(&nvmeq->sq_lock);
>> diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
>> index ad02c61c0b52..25516a5ae7e2 100644
>> --- a/drivers/nvme/host/zns.c
>> +++ b/drivers/nvme/host/zns.c
>> @@ -3,7 +3,9 @@
>>   * Copyright (C) 2020 Western Digital Corporation or its affiliates.
>>   */
>>
>> +#include <linux/log2.h>
>>  #include <linux/blkdev.h>
>> +#include <linux/math.h>
>>  #include <linux/vmalloc.h>
>>  #include "nvme.h"
>>
>> @@ -46,6 +48,18 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
>>  	return 0;
>>  }
>>
>> +static sector_t nvme_zone_size(struct nvme_ns *ns)
>> +{
>> +	sector_t zone_size;
>> +
>> +	if (blk_queue_is_po2_zone_emu(ns->queue))
>> +		zone_size = ns->zsze_po2;
>> +	else
>> +		zone_size = ns->zsze;
>> +
>> +	return zone_size;
>> +}
>> +
>>  int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
>>  {
>>  	struct nvme_effects_log *log = ns->head->effects;
>> @@ -122,7 +136,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
>>  		sizeof(struct nvme_zone_descriptor);
>>
>>  	nr_zones = min_t(unsigned int, nr_zones,
>> -			 get_capacity(ns->disk) / ns->zsze);
>> +			 get_capacity(ns->disk) / nvme_zone_size(ns));
>>
>>  	bufsize = sizeof(struct nvme_zone_report) +
>>  		nr_zones * sizeof(struct nvme_zone_descriptor);
>> @@ -147,6 +161,8 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>  				 void *data)
>>  {
>>  	struct blk_zone zone = { };
>> +	u64 zone_gap = 0;
>> +	u32 zone_idx;
>>
>>  	if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
>>  		dev_err(ns->ctrl->device, "invalid zone type %#x\n",
>> @@ -159,10 +175,19 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
>>  	zone.len = ns->zsze;
>>  	zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
>>  	zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
>> +
>> +	if (blk_queue_is_po2_zone_emu(ns->queue)) {
>> +		zone_idx = zone.start / zone.len;
>> +		zone_gap = zone_idx * ns->zsze_diff;
>> +		zone.start += zone_gap;
>> +		zone.len = ns->zsze_po2;
>> +	}
>> +
>>  	if (zone.cond == BLK_ZONE_COND_FULL)
>>  		zone.wp = zone.start + zone.len;
>>  	else
>> -		zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
>> +		zone.wp =
>> +			nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)) + zone_gap;
>>
>>  	return cb(&zone, idx, data);
>>  }
>> @@ -173,6 +198,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	struct nvme_zone_report *report;
>>  	struct nvme_command c = { };
>>  	int ret, zone_idx = 0;
>> +	u64 zone_size = nvme_zone_size(ns);
>>  	unsigned int nz, i;
>>  	size_t buflen;
>>
>> @@ -190,7 +216,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
>>  	c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
>>
>> -	sector &= ~(ns->zsze - 1);
>> +	sector = rounddown(sector, zone_size);
>>  	while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
>>  		memset(report, 0, buflen);
>>
>> @@ -214,7 +240,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  			zone_idx++;
>>  		}
>>
>> -		sector += ns->zsze * nz;
>> +		sector += zone_size * nz;
>>  	}
>>
>>  	if (zone_idx > 0)
>> @@ -226,6 +252,32 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
>>  	return ret;
>>  }
>>
>> +void nvme_ns_po2_zone_emu_setup(struct nvme_ns *ns)
>> +{
>> +	u32 nr_zones;
>> +	sector_t capacity;
>> +
>> +	if (is_power_of_2(ns->zsze))
>> +		return;
>> +
>> +	if (!get_capacity(ns->disk))
>> +		return;
>> +
>> +	blk_mq_freeze_queue(ns->disk->queue);
>> +
>> +	blk_queue_po2_zone_emu(ns->queue, 1);
>> +	ns->zsze_po2 = 1 << get_count_order_long(ns->zsze);
>> +	ns->zsze_diff = ns->zsze_po2 - ns->zsze;
>> +
>> +	nr_zones = get_capacity(ns->disk) / ns->zsze;
>> +	capacity = nr_zones * ns->zsze_po2;
>> +	set_capacity_and_notify(ns->disk, capacity);
>> +	ns->sect_to_lba = __nvme_sect_to_lba_po2;
>> +	ns->update_sector_append =
>> +				nvme_update_sector_append_po2_zone_emu;
>> +
>> +	blk_mq_unfreeze_queue(ns->disk->queue);
>> +}
>> +
>>  blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  		struct nvme_command *c, enum nvme_zone_mgmt_action action)
>>  {
>> @@ -239,5 +291,24 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
>>  	if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
>>  		c->zms.select_all = 1;
>>
>> +	nvme_verify_sector_value(ns, req);
>> +	return BLK_STS_OK;
>> +}
>> +
>> +blk_status_t nvme_zone_handle_po2_emu_violation(struct request *req)
>> +{
>> +	/* The spec mentions that read from ZCAP until ZSZE shall behave
>> +	 * like a deallocated block. Deallocated block reads are
>> +	 * deterministic, hence we fill zero.
>> +	 * The spec does not clearly define the result for other opreation.
>> +	 */
>
> Comment style and indentation is weird.
>
Ack.

>> +	if (req_op(req) == REQ_OP_READ) {
>> +		zero_fill_bio(req->bio);
>> +		nvme_req(req)->status = NVME_SC_SUCCESS;
>> +	} else {
>> +		nvme_req(req)->status = NVME_SC_WRITE_FAULT;
>> +	}
>
> What about requests that straddle the zone capacity ? They need to be
> partially zeroed too, otherwise data from the next zone may be exposed.
>
Good point. I will add this support in the next revision. Thanks.

>> +	blk_mq_set_request_complete(req);
>> +	nvme_complete_rq(req);
>>  	return BLK_STS_OK;
>>  }
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index 3a41d50b85d3..9ec59183efcd 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -57,6 +57,8 @@ typedef __u32 __bitwise req_flags_t;
>>  #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
>>  /* queue has elevator attached */
>>  #define RQF_ELV			((__force req_flags_t)(1 << 22))
>> +/* request to do IO in the emulated area with po2 zone emulation */
>> +#define RQF_ZONE_EMU_VIOLATION	((__force req_flags_t)(1 << 23))
>>
>>  /* flags that prevent us from merging requests: */
>>  #define RQF_NOMERGE_FLAGS \
>

-- 
Regards,
Pankaj