From: Keith Busch <kbusch@xxxxxxxxxx> NVMe TP4146 creates a new feature called Flexible Data Placement. This feature allows a host to tell the device how to group write data through the use of "Placement Identifiers" in write commands. Add support for using placement identifiers in write commands. The user can enable this with the new "fdp=1" parameter for fio's io_uring_cmd ioengine. By default, the fio jobs will cycle through all the namespace's available placement identifiers for write commands. The user can limit which placement identifiers can be used with an additional parameter, "fdp_pli=<list,>", which can be used to separate write-intensive jobs from less intensive ones. Setting up your namespace for FDP is outside the scope of 'fio', so this assumes the namespace is already properly configured for the mode. Based-on-a-patch-by: Ankit Kumar <ankit.kumar@xxxxxxxxxxx> Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> --- Makefile | 2 +- engines/io_uring.c | 31 ++++++++ engines/nvme.c | 88 +++++++++++++++++++++- engines/nvme.h | 148 +++++++++++++++++++++++++++++++++++++ examples/uring-cmd-fdp.fio | 37 ++++++++++ file.h | 3 + filesetup.c | 7 ++ io_u.c | 3 + io_u.h | 3 + ioengines.h | 4 + options.c | 49 ++++++++++++ thread_options.h | 8 ++ 12 files changed, 381 insertions(+), 2 deletions(-) create mode 100644 examples/uring-cmd-fdp.fio diff --git a/Makefile b/Makefile index 5f4e6562..89205ebf 100644 --- a/Makefile +++ b/Makefile @@ -62,7 +62,7 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ gettime-thread.c helpers.c json.c idletime.c td_error.c \ profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ workqueue.c rate-submit.c optgroup.c helper_thread.c \ - steadystate.c zone-dist.c zbd.c dedupe.c + steadystate.c zone-dist.c zbd.c dedupe.c fdp.c ifdef CONFIG_LIBHDFS HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE) diff --git a/engines/io_uring.c b/engines/io_uring.c index 
a9abd11d..644b279f 100644 --- a/engines/io_uring.c +++ b/engines/io_uring.c @@ -1262,6 +1262,34 @@ static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td, return fio_nvme_get_max_open_zones(td, f, max_open_zones); } +static int fio_ioring_cmd_fdp_support(struct thread_data *td, struct fio_file *f, + bool *support) +{ + return fio_nvme_is_fdp(td, f, support); +} + +static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f, + struct fio_ruhs_info *fruhs_info) +{ + struct nvme_fdp_ruh_status *ruhs; + int bytes, ret, i; + + bytes = sizeof(*ruhs) + 1024 * sizeof(struct nvme_fdp_ruh_status_desc); + ruhs = malloc(bytes); + memset(ruhs, 0, bytes); + + ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes); + if (ret) + goto free; + + fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd); + for (i = 0; i < fruhs_info->nr_ruhs; i++) + fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid); +free: + free(ruhs); + return ret; +} + static struct ioengine_ops ioengine_uring = { .name = "io_uring", .version = FIO_IOOPS_VERSION, @@ -1307,6 +1335,9 @@ static struct ioengine_ops ioengine_uring_cmd = { .get_max_open_zones = fio_ioring_cmd_get_max_open_zones, .options = options, .option_struct_size = sizeof(struct ioring_options), + + .fdp_support = fio_ioring_cmd_fdp_support, + .fetch_ruhs = fio_ioring_cmd_fetch_ruhs, }; static void fio_init fio_ioring_register(void) diff --git a/engines/nvme.c b/engines/nvme.c index 9ffc5303..af4be733 100644 --- a/engines/nvme.c +++ b/engines/nvme.c @@ -28,7 +28,8 @@ int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, cmd->cdw10 = slba & 0xffffffff; cmd->cdw11 = slba >> 32; /* cdw12 represent number of lba's for read/write */ - cmd->cdw12 = nlb; + cmd->cdw12 = nlb | (io_u->dtype << 20); + cmd->cdw13 = io_u->dspec << 16; if (iov) { iov->iov_base = io_u->xfer_buf; iov->iov_len = io_u->xfer_buflen; @@ -345,3 +346,88 @@ out: close(fd); return ret; } + +static inline int nvme_dir_id(int fd, __u32 nsid, struct 
nvme_id_directives *data) +{ + __u32 data_len = sizeof(*data); + + struct nvme_passthru_cmd cmd = { + .opcode = nvme_admin_directive_recv, + .nsid = nsid, + .cdw10 = (data_len >> 2) - 1, + .cdw11 = 1, + .data_len = data_len, + .addr = (__u64)(uintptr_t)data, + }; + + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); +} + +int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + struct nvme_id_directives dir; + struct nvme_id_ctrl id; + bool is_fdp = false; + int fd, ret = 0; + + *fdp = false; + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CTRL, 0, &id); + if (ret) { + log_err("%s: nvme_id_ctrl failed, err=%d\n", + f->file_name, ret); + goto out; + } + + is_fdp = le32_to_cpu(id.ctratt) & (1 << 19); + if (!is_fdp) + goto out; + + ret = nvme_dir_id(fd, data->nsid, &dir); + if (ret) { + log_err("%s: nvme_dir_id failed, err=%d\n", + f->file_name, ret); + is_fdp = false; + goto out; + } + + is_fdp = dir.enabled[0] & (1 << 2); +out: + *fdp = is_fdp; + close(fd); + return ret; +} + +static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid, + __u32 data_len, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_cmd_io_mgmt_recv, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = data_len, + .cdw10 = 1, + .cdw11 = (data_len >> 2) - 1, + }; + + return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); +} + +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, + struct nvme_fdp_ruh_status *ruhs, __u32 bytes) +{ + struct nvme_data *data = FILE_ENG_DATA(f); + int fd, ret = 0; + + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); + if (fd < 0) + return -errno; + + ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs); + close(fd); + return ret; +} diff --git a/engines/nvme.h b/engines/nvme.h index 70a89b74..574f16d8 100644 --- a/engines/nvme.h +++ b/engines/nvme.h @@ 
-50,6 +50,7 @@ struct nvme_uring_cmd { enum nvme_identify_cns { NVME_IDENTIFY_CNS_NS = 0x00, + NVME_IDENTIFY_CNS_CTRL = 0x01, NVME_IDENTIFY_CNS_CSI_NS = 0x05, NVME_IDENTIFY_CNS_CSI_CTRL = 0x06, }; @@ -62,11 +63,13 @@ enum nvme_csi { enum nvme_admin_opcode { nvme_admin_identify = 0x06, + nvme_admin_directive_recv = 0x1a, }; enum nvme_io_opcode { nvme_cmd_write = 0x01, nvme_cmd_read = 0x02, + nvme_cmd_io_mgmt_recv = 0x12, nvme_zns_cmd_mgmt_send = 0x79, nvme_zns_cmd_mgmt_recv = 0x7a, }; @@ -86,6 +89,126 @@ struct nvme_data { __u32 lba_shift; }; +struct nvme_id_psd { + __le16 mp; + __u8 rsvd2; + __u8 flags; + __le32 enlat; + __le32 exlat; + __u8 rrt; + __u8 rrl; + __u8 rwt; + __u8 rwl; + __le16 idlp; + __u8 ips; + __u8 rsvd19; + __le16 actp; + __u8 apws; + __u8 rsvd23[9]; +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 cmic; + __u8 mdts; + __le16 cntlid; + __le32 ver; + __le32 rtd3r; + __le32 rtd3e; + __le32 oaes; + __le32 ctratt; + __le16 rrls; + __u8 rsvd102[9]; + __u8 cntrltype; + __u8 fguid[16]; + __le16 crdt1; + __le16 crdt2; + __le16 crdt3; + __u8 rsvd134[119]; + __u8 nvmsr; + __u8 vwci; + __u8 mec; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 avscc; + __u8 apsta; + __le16 wctemp; + __le16 cctemp; + __le16 mtfa; + __le32 hmpre; + __le32 hmmin; + __u8 tnvmcap[16]; + __u8 unvmcap[16]; + __le32 rpmbs; + __le16 edstt; + __u8 dsto; + __u8 fwug; + __le16 kas; + __le16 hctma; + __le16 mntmt; + __le16 mxtmt; + __le32 sanicap; + __le32 hmminds; + __le16 hmmaxd; + __le16 nsetidmax; + __le16 endgidmax; + __u8 anatt; + __u8 anacap; + __le32 anagrpmax; + __le32 nanagrpid; + __le32 pels; + __le16 domainid; + __u8 rsvd358[10]; + __u8 megcap[16]; + __u8 rsvd384[128]; + __u8 sqes; + __u8 cqes; + __le16 maxcmd; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 icsvscc; + __u8 nwpc; + 
__le16 acwu; + __le16 ocfs; + __le32 sgls; + __le32 mnan; + __u8 maxdna[16]; + __le32 maxcna; + __u8 rsvd564[204]; + char subnqn[256]; + __u8 rsvd1024[768]; + + /* Fabrics Only */ + __le32 ioccsz; + __le32 iorcsz; + __le16 icdoff; + __u8 fcatt; + __u8 msdbd; + __le16 ofcs; + __u8 dctype; + __u8 rsvd1807[241]; + + struct nvme_id_psd psd[32]; + __u8 vs[1024]; +}; + + struct nvme_lbaf { __le16 ms; __u8 ds; @@ -192,6 +315,31 @@ struct nvme_zone_report { struct nvme_zns_desc entries[]; }; +struct nvme_id_directives { + __u8 supported[32]; + __u8 enabled[32]; + __u8 rsvd64[4032]; +}; + +struct nvme_fdp_ruh_status_desc { + __u16 pid; + __u16 ruhid; + __u32 earutr; + __u64 ruamw; + __u8 rsvd16[16]; +}; + +struct nvme_fdp_ruh_status { + __u8 rsvd0[14]; + __le16 nruhsd; + struct nvme_fdp_ruh_status_desc ruhss[]; +}; + +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, + struct nvme_fdp_ruh_status *ruhs, __u32 bytes); + +int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp); + int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz, __u64 *nlba); diff --git a/examples/uring-cmd-fdp.fio b/examples/uring-cmd-fdp.fio new file mode 100644 index 00000000..55d741d3 --- /dev/null +++ b/examples/uring-cmd-fdp.fio @@ -0,0 +1,37 @@ +# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled +# This assumes the namespace is already configured with FDP support and has at +# least 8 available reclaim units. +# +# Each job targets different ranges of LBAs with different placement +# identifiers, and has different write intensity. 
+ +[global] +filename=/dev/ng0n1 +ioengine=io_uring_cmd +cmd_type=nvme +iodepth=32 +bs=4K +fdp=1 +time_based=1 +runtime=1000 + +[write-heavy] +rw=randrw +rwmixwrite=90 +fdp_pli=0,1,2,3 +offset=0% +size=30% + +[write-mid] +rw=randrw +rwmixwrite=30 +fdp_pli=4,5 +offset=30% +size=30% + +[write-light] +rw=randrw +rwmixwrite=10 +fdp_pli=6 +offset=60% +size=30% diff --git a/file.h b/file.h index da1b8947..deb36e02 100644 --- a/file.h +++ b/file.h @@ -12,6 +12,7 @@ /* Forward declarations */ struct zoned_block_device_info; +struct fdp_ruh_info; /* * The type of object we are working on @@ -101,6 +102,8 @@ struct fio_file { uint64_t file_offset; uint64_t io_size; + struct fio_ruhs_info *ruhs_info; + /* * Zoned block device information. See also zonemode=zbd. */ diff --git a/filesetup.c b/filesetup.c index cb7047c5..c1f38858 100644 --- a/filesetup.c +++ b/filesetup.c @@ -1417,6 +1417,12 @@ done: td_restore_runstate(td, old_state); + if (td->o.fdp) { + err = fdp_init(td); + if (err) + goto err_out; + } + return 0; err_offset: @@ -1627,6 +1633,7 @@ void close_and_free_files(struct thread_data *td) } zbd_close_file(f); + fdp_free_ruhs_info(f); fio_file_free(f); } diff --git a/io_u.c b/io_u.c index 8035f4b7..60be4f01 100644 --- a/io_u.c +++ b/io_u.c @@ -980,6 +980,9 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u) return 1; } + if (td->o.fdp) + fdp_fill_dspec_data(td, io_u); + if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", io_u, diff --git a/io_u.h b/io_u.h index 206e24fe..13b26d37 100644 --- a/io_u.h +++ b/io_u.h @@ -117,6 +117,9 @@ struct io_u { */ int (*end_io)(struct thread_data *, struct io_u **); + uint32_t dtype; + uint32_t dspec; + union { #ifdef CONFIG_LIBAIO struct iocb iocb; diff --git a/ioengines.h b/ioengines.h index 2cb9743e..4a9284c0 100644 --- a/ioengines.h +++ b/ioengines.h @@ -7,6 +7,7 @@ #include "flist.h" #include "io_u.h" #include "zbd_types.h" 
+#include "fdp.h" #define FIO_IOOPS_VERSION 31 @@ -63,6 +64,9 @@ struct ioengine_ops { unsigned int *); int (*finish_zone)(struct thread_data *, struct fio_file *, uint64_t, uint64_t); + int (*fdp_support)(struct thread_data *, struct fio_file *, bool *); + int (*fetch_ruhs)(struct thread_data *, struct fio_file *, + struct fio_ruhs_info *); int option_struct_size; struct fio_option *options; }; diff --git a/options.c b/options.c index 49612345..3e6dc3c6 100644 --- a/options.c +++ b/options.c @@ -251,6 +251,34 @@ int str_split_parse(struct thread_data *td, char *str, return ret; } +static int fio_fdp_cmp(const void *p1, const void *p2) +{ + const uint16_t *t1 = p1; + const uint16_t *t2 = p2; + + return *t1 - *t2; +} + +static int str_fdp_pli_cb(void *data, const char *input) +{ + struct thread_data *td = cb_data_to_td(data); + char *str, *p, *v; + int i = 0; + + p = str = strdup(input); + strip_blank_front(&str); + strip_blank_end(str); + + while ((v = strsep(&str, ",")) != NULL && i < FIO_MAX_PLIS) + td->o.plis[i++] = strtoll(v, NULL, 0); + free(p); + + td->o.nrpli = i; + qsort(td->o.plis, td->o.nrpli, sizeof(*td->o.plis), fio_fdp_cmp); + + return 0; +} + static int str_bssplit_cb(void *data, const char *input) { struct thread_data *td = cb_data_to_td(data); @@ -3649,6 +3677,27 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .category = FIO_OPT_C_IO, .group = FIO_OPT_G_ZONE, }, + { + .name = "fdp", + .lname = "Flexible data placement", + .type = FIO_OPT_BOOL, + .off1 = offsetof(struct thread_options, fdp), + .help = "Use Data placement directive (FDP)", + .def = "0", + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, + { + .name = "fdp_pli", + .lname = "FDP Placement ID indicies", + .type = FIO_OPT_STR, + .cb = str_fdp_pli_cb, + .off1 = offsetof(struct thread_options, plis), + .help = "Sets which placement ids to use (defaults to all)", + .hide = 1, + .category = FIO_OPT_C_IO, + .group = FIO_OPT_G_INVALID, + }, { .name = "lockmem", .lname = "Lock 
memory", diff --git a/thread_options.h b/thread_options.h index 74e7ea45..34eb4d3f 100644 --- a/thread_options.h +++ b/thread_options.h @@ -386,6 +386,12 @@ struct thread_options { fio_fp64_t zrt; fio_fp64_t zrf; + unsigned int fdp; + +#define FIO_MAX_PLIS 16 + unsigned int plis[FIO_MAX_PLIS]; + unsigned int nrpli; + unsigned int log_entries; unsigned int log_prio; }; @@ -698,6 +704,8 @@ struct thread_options_pack { uint32_t log_entries; uint32_t log_prio; + uint32_t fdp; + /* * verify_pattern followed by buffer_pattern from the unpacked struct */ -- 2.30.2