Add zone device specific ioengine_ops for io_uring_cmd. * get_zoned_model * report_zones * reset_wp * get_max_open_zones Add the necessary NVMe ZNS specfication opcodes and structures. Add helper functions to submit admin and I/O passthrough commands for these new NVMe ZNS specific commands. For write workload iodepth must be set to 1 as there is no IO scheduler Tested-by: Vincent Fu <vincent.fu@xxxxxxxxxxx> Signed-off-by: Ankit Kumar <ankit.kumar@xxxxxxxxxxx> --- engines/io_uring.c | 32 ++++++ engines/nvme.c | 242 +++++++++++++++++++++++++++++++++++++++++++++ engines/nvme.h | 80 ++++++++++++++- 3 files changed, 353 insertions(+), 1 deletion(-) diff --git a/engines/io_uring.c b/engines/io_uring.c index a7b7b166..5a5406d4 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -1164,6 +1164,34 @@ static int fio_ioring_cmd_get_file_size(struct thread_data *td, return generic_get_file_size(td, f); } +static int fio_ioring_cmd_get_zoned_model(struct thread_data *td, + struct fio_file *f, + enum zbd_zoned_model *model) +{ + return fio_nvme_get_zoned_model(td, f, model); +} + +static int fio_ioring_cmd_report_zones(struct thread_data *td, + struct fio_file *f, uint64_t offset, + struct zbd_zone *zbdz, + unsigned int nr_zones) +{ + return fio_nvme_report_zones(td, f, offset, zbdz, nr_zones); +} + +static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + return fio_nvme_reset_wp(td, f, offset, length); +} + +static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td, + struct fio_file *f, + unsigned int *max_open_zones) +{ + return fio_nvme_get_max_open_zones(td, f, max_open_zones); +} + static struct ioengine_ops ioengine_uring = { .name = "io_uring", .version = FIO_IOOPS_VERSION, @@ -1200,6 +1228,10 @@ static struct ioengine_ops ioengine_uring_cmd = { .open_file = fio_ioring_cmd_open_file, .close_file = fio_ioring_cmd_close_file, .get_file_size = fio_ioring_cmd_get_file_size, + .get_zoned_model = fio_ioring_cmd_get_zoned_model, + .report_zones = fio_ioring_cmd_report_zones, + .reset_wp = fio_ioring_cmd_reset_wp, + .get_max_open_zones = fio_ioring_cmd_get_max_open_zones, .options = options, .option_struct_size = sizeof(struct ioring_options), }; diff --git a/engines/nvme.c b/engines/nvme.c index 6fecf0ba..59550def 100644 --- a/engines/nvme.c +++ b/engines/nvme.c @@ -101,3 +101,245 @@ int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz, close(fd); return 0; } + +int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_id_ns ns; + struct nvme_passthru_cmd cmd; + int fd, ret = 0; + + if (f->filetype != FIO_TYPE_CHAR) + return -EINVAL; + + /* File is not yet opened */ + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + /* Using nvme_id_ns for data as sizes are same */ + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL, + NVME_CSI_ZNS, &ns); + if (ret) { + *model = ZBD_NONE; + goto out; + } + + memset(&cmd, 0, sizeof(struct nvme_passthru_cmd)); + + /* Using nvme_id_ns for data as sizes are same */ + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &ns); + if (ret) { + *model = ZBD_NONE; + goto out; + } + + *model = ZBD_HOST_MANAGED; +out: + close(fd); + return 0; +} + +static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat, + __u32 data_len, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_zns_cmd_mgmt_recv, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = data_len, + .cdw10 = slba & 0xffffffff, + .cdw11 = slba >> 32, + .cdw12 = (data_len >> 2) - 1, + .cdw13 = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); +} + +int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f, + uint64_t offset, struct zbd_zone *zbdz, + unsigned int nr_zones) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_zone_report *zr; + struct nvme_zns_id_ns zns_ns; + struct nvme_id_ns ns; + unsigned int i = 0, j, zones_fetched = 0; + unsigned int max_zones, zones_chunks = 1024; + int fd, ret = 0; + __u32 zr_len; + __u64 zlen; + + /* File is not yet opened */ + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + zones_fetched = 0; + zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc)); + zr = calloc(1, zr_len); + if (!zr) + return -ENOMEM; + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (ret) { + log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name, + ret); + goto out; + } + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &zns_ns); + if (ret) { + log_err("%s: nvme_zns_identify_ns failed, err=%d\n", + f->file_name, ret); + goto out; + } + zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift; + + max_zones = (f->real_file_size - offset) / zlen; + if (max_zones < nr_zones) + nr_zones = max_zones; + + if (nr_zones < zones_chunks) + zones_chunks = nr_zones; + + while (zones_fetched < nr_zones) { + if (zones_fetched + zones_chunks >= nr_zones) { + zones_chunks = nr_zones - zones_fetched; + zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc)); + } + ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift, + NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr); + if (ret) { + log_err("%s: nvme_zns_report_zones failed, err=%d\n", + f->file_name, ret); + goto out; + } + + /* Transform the zone-report */ + for (j = 0; j < zr->nr_zones; j++, i++) { + struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]); + + zbdz[i].start = desc->zslba << data->lba_shift; + zbdz[i].len = zlen; + zbdz[i].wp = desc->wp << data->lba_shift; + zbdz[i].capacity = desc->zcap << data->lba_shift; + + /* Zone Type is stored in first 4 bits. */ + switch (desc->zt & 0x0f) { + case NVME_ZONE_TYPE_SEQWRITE_REQ: + zbdz[i].type = ZBD_ZONE_TYPE_SWR; + break; + default: + log_err("%s: invalid type for zone at offset %llu.\n", + f->file_name, desc->zslba); + ret = -EIO; + goto out; + } + + /* Zone State is stored in last 4 bits. */ + switch (desc->zs >> 4) { + case NVME_ZNS_ZS_EMPTY: + zbdz[i].cond = ZBD_ZONE_COND_EMPTY; + break; + case NVME_ZNS_ZS_IMPL_OPEN: + zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN; + break; + case NVME_ZNS_ZS_EXPL_OPEN: + zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN; + break; + case NVME_ZNS_ZS_CLOSED: + zbdz[i].cond = ZBD_ZONE_COND_CLOSED; + break; + case NVME_ZNS_ZS_FULL: + zbdz[i].cond = ZBD_ZONE_COND_FULL; + break; + case NVME_ZNS_ZS_READ_ONLY: + case NVME_ZNS_ZS_OFFLINE: + default: + /* Treat all these conditions as offline (don't use!) */ + zbdz[i].cond = ZBD_ZONE_COND_OFFLINE; + zbdz[i].wp = zbdz[i].start; + } + } + zones_fetched += zr->nr_zones; + offset += zr->nr_zones * zlen; + } + + ret = zones_fetched; +out: + free(zr); + close(fd); + + return ret; +} + +int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + unsigned int nr_zones; + unsigned long long zslba; + int i, fd, ret = 0; + + /* If the file is not yet opened, open it for this function. */ + fd = f->fd; + if (fd < 0) { + fd = open(f->file_name, O_RDWR | O_LARGEFILE); + if (fd < 0) + return -errno; + } + + zslba = offset >> data->lba_shift; + nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size; + + for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) { + struct nvme_passthru_cmd cmd = { + .opcode = nvme_zns_cmd_mgmt_send, + .nsid = data->nsid, + .cdw10 = zslba & 0xffffffff, + .cdw11 = zslba >> 32, + .cdw13 = NVME_ZNS_ZSA_RESET, + .addr = (__u64)(uintptr_t)NULL, + .data_len = 0, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); + } + + if (f->fd < 0) + close(fd); + return -ret; +} + +int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_zns_id_ns zns_ns; + int fd, ret = 0; + + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS, + NVME_CSI_ZNS, &zns_ns); + if (ret) { + log_err("%s: nvme_zns_identify_ns failed, err=%d\n", + f->file_name, ret); + goto out; + } + + *max_open_zones = zns_ns.mor + 1; +out: + close(fd); + return ret; +} diff --git a/engines/nvme.h b/engines/nvme.h index 8e626bb2..70a89b74 100644 --- a/engines/nvme.h +++ b/engines/nvme.h @@ -43,8 +43,15 @@ struct nvme_uring_cmd { #define NVME_IDENTIFY_DATA_SIZE 4096 #define NVME_IDENTIFY_CSI_SHIFT 24 +#define NVME_ZNS_ZRA_REPORT_ZONES 0 +#define NVME_ZNS_ZRAS_FEAT_ERZ (1 << 16) +#define NVME_ZNS_ZSA_RESET 0x4 +#define NVME_ZONE_TYPE_SEQWRITE_REQ 0x2 + enum nvme_identify_cns { - NVME_IDENTIFY_CNS_NS = 0x00, + NVME_IDENTIFY_CNS_NS = 0x00, + NVME_IDENTIFY_CNS_CSI_NS = 0x05, + NVME_IDENTIFY_CNS_CSI_CTRL = 0x06, }; enum nvme_csi { @@ -60,6 +67,18 @@ enum nvme_admin_opcode { enum nvme_io_opcode { nvme_cmd_write = 0x01, nvme_cmd_read = 0x02, + nvme_zns_cmd_mgmt_send = 0x79, + nvme_zns_cmd_mgmt_recv = 0x7a, +}; + +enum nvme_zns_zs { + NVME_ZNS_ZS_EMPTY = 0x1, + NVME_ZNS_ZS_IMPL_OPEN = 0x2, + NVME_ZNS_ZS_EXPL_OPEN = 0x3, + NVME_ZNS_ZS_CLOSED = 0x4, + NVME_ZNS_ZS_READ_ONLY = 0xd, + NVME_ZNS_ZS_FULL = 0xe, + NVME_ZNS_ZS_OFFLINE = 0xf, }; struct nvme_data { @@ -127,10 +146,69 @@ static inline int ilog2(uint32_t i) return log; } +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_zns_id_ns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __le32 rrl1; + __le32 rrl2; + __le32 rrl3; + __le32 frl1; + __le32 frl2; + __le32 frl3; + __le32 numzrwa; + __le16 zrwafg; + __le16 zrwasz; + __u8 zrwacap; + __u8 rsvd53[2763]; + struct nvme_zns_lbafe lbafe[64]; + __u8 vs[256]; +}; + +struct nvme_zns_desc { + __u8 zt; + __u8 zs; + __u8 za; + __u8 zai; + __u8 rsvd4[4]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 rsvd8[56]; + struct nvme_zns_desc entries[]; +}; + int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz, __u64 *nlba); int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, struct iovec *iov); +int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f, + enum zbd_zoned_model *model); + +int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f, + uint64_t offset, struct zbd_zone *zbdz, + unsigned int nr_zones); + +int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f, + uint64_t offset, uint64_t length); + +int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f, + unsigned int *max_open_zones); + #endif -- 2.17.1