On 2/9/23 07:20, Keith Busch wrote: > From: Keith Busch <kbusch@xxxxxxxxxx> > > NVMe TP4146 creates a new feature called Flexible Data Placement. This > feature allows a host to tell the device how to group write data through > the use of "Placement Identifiers" in write commands. > > Add support for using placement identifiers in write commands. The user > can enable this with the new "fdp=1" parameter for fio's io_uring_cmd > ioengine. By default, the fio jobs will cycle through all the namespace's > available placement identifiers for write commands. The user can limit > which placement identifiers can be used with additional parameter, > "fdp_pli=<list,>", which can be used to separate write intensive jobs > from less intenstive ones. s/intenstive/intensive > > Setting up your namespace for FDP is outside the scope of 'fio', so this > assumes the namespace is already properly configured for the mode. > > Based-on-a-patch-by: Ankit Kumar <ankit.kumar@xxxxxxxxxxx> > Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx> > --- > Makefile | 2 +- > engines/io_uring.c | 31 ++++++++ > engines/nvme.c | 88 +++++++++++++++++++++- > engines/nvme.h | 148 +++++++++++++++++++++++++++++++++++++ > examples/uring-cmd-fdp.fio | 37 ++++++++++ > file.h | 3 + > filesetup.c | 7 ++ > io_u.c | 3 + > io_u.h | 3 + > ioengines.h | 4 + > options.c | 49 ++++++++++++ > thread_options.h | 8 ++ > 12 files changed, 381 insertions(+), 2 deletions(-) > create mode 100644 examples/uring-cmd-fdp.fio > > diff --git a/Makefile b/Makefile > index 5f4e6562..89205ebf 100644 > --- a/Makefile > +++ b/Makefile > @@ -62,7 +62,7 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \ > gettime-thread.c helpers.c json.c idletime.c td_error.c \ > profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \ > workqueue.c rate-submit.c optgroup.c helper_thread.c \ > - steadystate.c zone-dist.c zbd.c dedupe.c > + steadystate.c zone-dist.c zbd.c dedupe.c fdp.c > > ifdef CONFIG_LIBHDFS > HDFSFLAGS= 
-I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE) > diff --git a/engines/io_uring.c b/engines/io_uring.c > index a9abd11d..644b279f 100644 > --- a/engines/io_uring.c > +++ b/engines/io_uring.c > @@ -1262,6 +1262,34 @@ static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td, > return fio_nvme_get_max_open_zones(td, f, max_open_zones); > } > > +static int fio_ioring_cmd_fdp_support(struct thread_data *td, struct fio_file *f, > + bool *support) > +{ > + return fio_nvme_is_fdp(td, f, support); > +} > + > +static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f, > + struct fio_ruhs_info *fruhs_info) > +{ > + struct nvme_fdp_ruh_status *ruhs; > + int bytes, ret, i; > + > + bytes = sizeof(*ruhs) + 1024 * sizeof(struct nvme_fdp_ruh_status_desc); > + ruhs = malloc(bytes); > + memset(ruhs, 0, bytes); > + > + ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes); > + if (ret) > + goto free; > + > + fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd); > + for (i = 0; i < fruhs_info->nr_ruhs; i++) > + fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid); > +free: > + free(ruhs); > + return ret; > +} > + > static struct ioengine_ops ioengine_uring = { > .name = "io_uring", > .version = FIO_IOOPS_VERSION, > @@ -1307,6 +1335,9 @@ static struct ioengine_ops ioengine_uring_cmd = { > .get_max_open_zones = fio_ioring_cmd_get_max_open_zones, > .options = options, > .option_struct_size = sizeof(struct ioring_options), > + > + .fdp_support = fio_ioring_cmd_fdp_support, > + .fetch_ruhs = fio_ioring_cmd_fetch_ruhs, > }; > > static void fio_init fio_ioring_register(void) > diff --git a/engines/nvme.c b/engines/nvme.c > index 9ffc5303..af4be733 100644 > --- a/engines/nvme.c > +++ b/engines/nvme.c > @@ -28,7 +28,8 @@ int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u, > cmd->cdw10 = slba & 0xffffffff; > cmd->cdw11 = slba >> 32; > /* cdw12 represent number of lba's for read/write */ > - cmd->cdw12 = nlb; > + 
cmd->cdw12 = nlb | (io_u->dtype << 20); > + cmd->cdw13 = io_u->dspec << 16; > if (iov) { > iov->iov_base = io_u->xfer_buf; > iov->iov_len = io_u->xfer_buflen; > @@ -345,3 +346,88 @@ out: > close(fd); > return ret; > } > + > +static inline int nvme_dir_id(int fd, __u32 nsid, struct nvme_id_directives *data) > +{ > + __u32 data_len = sizeof(*data); > + > + struct nvme_passthru_cmd cmd = { > + .opcode = nvme_admin_directive_recv, > + .nsid = nsid, > + .cdw10 = (data_len >> 2) - 1, > + .cdw11 = 1, > + .data_len = data_len, > + .addr = (__u64)(uintptr_t)data, > + }; > + > + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); > +} > + > +int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp) > +{ > + struct nvme_data *data = FILE_ENG_DATA(f); > + struct nvme_id_directives dir; > + struct nvme_id_ctrl id; > + bool is_fdp = false; > + int fd, ret = 0; > + > + *fdp = false; > + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); > + if (fd < 0) > + return -errno; > + > + ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CTRL, 0, &id); > + if (ret) { > + log_err("%s: nvme_id_ctrl failed, err=%d\n", > + f->file_name, ret); > + goto out; > + } > + > + is_fdp = le32_to_cpu(id.ctratt) & (1 << 19); > + if (!is_fdp) > + goto out; > + > + ret = nvme_dir_id(fd, data->nsid, &dir); > + if (ret) { > + log_err("%s: nvme_dir_id failed, err=%d\n", > + f->file_name, ret); > + is_fdp = false; > + goto out; > + } > + > + is_fdp = dir.enabled[0] & (1 << 2); > +out: > + *fdp = is_fdp; > + close(fd); > + return ret; > +} > + > +static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid, > + __u32 data_len, void *data) > +{ > + struct nvme_passthru_cmd cmd = { > + .opcode = nvme_cmd_io_mgmt_recv, > + .nsid = nsid, > + .addr = (__u64)(uintptr_t)data, > + .data_len = data_len, > + .cdw10 = 1, > + .cdw11 = (data_len >> 2) - 1, > + }; > + > + return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd); > +} > + > +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, > + 
struct nvme_fdp_ruh_status *ruhs, __u32 bytes) > +{ > + struct nvme_data *data = FILE_ENG_DATA(f); > + int fd, ret = 0; > + > + fd = open(f->file_name, O_RDONLY | O_LARGEFILE); > + if (fd < 0) > + return -errno; > + > + ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs); > + close(fd); > + return ret; > +} > diff --git a/engines/nvme.h b/engines/nvme.h > index 70a89b74..574f16d8 100644 > --- a/engines/nvme.h > +++ b/engines/nvme.h > @@ -50,6 +50,7 @@ struct nvme_uring_cmd { > > enum nvme_identify_cns { > NVME_IDENTIFY_CNS_NS = 0x00, > + NVME_IDENTIFY_CNS_CTRL = 0x01, > NVME_IDENTIFY_CNS_CSI_NS = 0x05, > NVME_IDENTIFY_CNS_CSI_CTRL = 0x06, > }; > @@ -62,11 +63,13 @@ enum nvme_csi { > > enum nvme_admin_opcode { > nvme_admin_identify = 0x06, > + nvme_admin_directive_recv = 0x1a, > }; > > enum nvme_io_opcode { > nvme_cmd_write = 0x01, > nvme_cmd_read = 0x02, > + nvme_cmd_io_mgmt_recv = 0x12, > nvme_zns_cmd_mgmt_send = 0x79, > nvme_zns_cmd_mgmt_recv = 0x7a, > }; > @@ -86,6 +89,126 @@ struct nvme_data { > __u32 lba_shift; > }; > > +struct nvme_id_psd { > + __le16 mp; > + __u8 rsvd2; > + __u8 flags; > + __le32 enlat; > + __le32 exlat; > + __u8 rrt; > + __u8 rrl; > + __u8 rwt; > + __u8 rwl; > + __le16 idlp; > + __u8 ips; > + __u8 rsvd19; > + __le16 actp; > + __u8 apws; > + __u8 rsvd23[9]; > +}; > + > +struct nvme_id_ctrl { > + __le16 vid; > + __le16 ssvid; > + char sn[20]; > + char mn[40]; > + char fr[8]; > + __u8 rab; > + __u8 ieee[3]; > + __u8 cmic; > + __u8 mdts; > + __le16 cntlid; > + __le32 ver; > + __le32 rtd3r; > + __le32 rtd3e; > + __le32 oaes; > + __le32 ctratt; > + __le16 rrls; > + __u8 rsvd102[9]; > + __u8 cntrltype; > + __u8 fguid[16]; > + __le16 crdt1; > + __le16 crdt2; > + __le16 crdt3; > + __u8 rsvd134[119]; > + __u8 nvmsr; > + __u8 vwci; > + __u8 mec; > + __le16 oacs; > + __u8 acl; > + __u8 aerl; > + __u8 frmw; > + __u8 lpa; > + __u8 elpe; > + __u8 npss; > + __u8 avscc; > + __u8 apsta; > + __le16 wctemp; > + __le16 cctemp; > + __le16 
mtfa; > + __le32 hmpre; > + __le32 hmmin; > + __u8 tnvmcap[16]; > + __u8 unvmcap[16]; > + __le32 rpmbs; > + __le16 edstt; > + __u8 dsto; > + __u8 fwug; > + __le16 kas; > + __le16 hctma; > + __le16 mntmt; > + __le16 mxtmt; > + __le32 sanicap; > + __le32 hmminds; > + __le16 hmmaxd; > + __le16 nsetidmax; > + __le16 endgidmax; > + __u8 anatt; > + __u8 anacap; > + __le32 anagrpmax; > + __le32 nanagrpid; > + __le32 pels; > + __le16 domainid; > + __u8 rsvd358[10]; > + __u8 megcap[16]; > + __u8 rsvd384[128]; > + __u8 sqes; > + __u8 cqes; > + __le16 maxcmd; > + __le32 nn; > + __le16 oncs; > + __le16 fuses; > + __u8 fna; > + __u8 vwc; > + __le16 awun; > + __le16 awupf; > + __u8 icsvscc; > + __u8 nwpc; > + __le16 acwu; > + __le16 ocfs; > + __le32 sgls; > + __le32 mnan; > + __u8 maxdna[16]; > + __le32 maxcna; > + __u8 rsvd564[204]; > + char subnqn[256]; > + __u8 rsvd1024[768]; > + > + /* Fabrics Only */ > + __le32 ioccsz; > + __le32 iorcsz; > + __le16 icdoff; > + __u8 fcatt; > + __u8 msdbd; > + __le16 ofcs; > + __u8 dctype; > + __u8 rsvd1807[241]; > + > + struct nvme_id_psd psd[32]; > + __u8 vs[1024]; > +}; > + > + > struct nvme_lbaf { > __le16 ms; > __u8 ds; > @@ -192,6 +315,31 @@ struct nvme_zone_report { > struct nvme_zns_desc entries[]; > }; > > +struct nvme_id_directives { > + __u8 supported[32]; > + __u8 enabled[32]; > + __u8 rsvd64[4032]; > +}; > + > +struct nvme_fdp_ruh_status_desc { > + __u16 pid; > + __u16 ruhid; > + __u32 earutr; > + __u64 ruamw; > + __u8 rsvd16[16]; > +}; > + > +struct nvme_fdp_ruh_status { > + __u8 rsvd0[14]; > + __le16 nruhsd; > + struct nvme_fdp_ruh_status_desc ruhss[]; > +}; > + > +int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f, > + struct nvme_fdp_ruh_status *ruhs, __u32 bytes); > + > +int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp); > + > int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz, > __u64 *nlba); > > diff --git a/examples/uring-cmd-fdp.fio b/examples/uring-cmd-fdp.fio 
> new file mode 100644 > index 00000000..55d741d3 > --- /dev/null > +++ b/examples/uring-cmd-fdp.fio > @@ -0,0 +1,37 @@ > +# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled > +# This assumes the namespace is already configured with FDP support and has at > +# least 8 available reclaim units. > +# > +# Each job targets different ranges of LBAs with different placement > +# identifiers, and has different write intensity. > + > +[global] > +filename=/dev/ng0n1 > +ioengine=io_uring_cmd > +cmd_type=nvme > +iodepth=32 > +bs=4K > +fdp=1 > +time_based=1 > +runtime=1000 > + > +[write-heavy] > +rw=randrw > +rwmixwrite=90 > +fdp_pli=0,1,2,3 > +offset=0% > +size=30% > + > +[write-mid] > +rw=randrw > +rwmixwrite=30 > +fdp_pli=4,5 > +offset=30% > +size=30% > + > +[write-light] > +rw=randrw > +rwmixwrite=10 > +fdp_pli=6 > +offset=60% > +size=30% > diff --git a/file.h b/file.h > index da1b8947..deb36e02 100644 > --- a/file.h > +++ b/file.h > @@ -12,6 +12,7 @@ > > /* Forward declarations */ > struct zoned_block_device_info; > +struct fdp_ruh_info; > > /* > * The type of object we are working on > @@ -101,6 +102,8 @@ struct fio_file { > uint64_t file_offset; > uint64_t io_size; > > + struct fio_ruhs_info *ruhs_info; > + > /* > * Zoned block device information. See also zonemode=zbd. 
> */ > diff --git a/filesetup.c b/filesetup.c > index cb7047c5..c1f38858 100644 > --- a/filesetup.c > +++ b/filesetup.c > @@ -1417,6 +1417,12 @@ done: > > td_restore_runstate(td, old_state); > > + if (td->o.fdp) { > + err = fdp_init(td); > + if (err) > + goto err_out; > + } > + > return 0; > > err_offset: > @@ -1627,6 +1633,7 @@ void close_and_free_files(struct thread_data *td) > } > > zbd_close_file(f); > + fdp_free_ruhs_info(f); > fio_file_free(f); > } > > diff --git a/io_u.c b/io_u.c > index 8035f4b7..60be4f01 100644 > --- a/io_u.c > +++ b/io_u.c > @@ -980,6 +980,9 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u) > return 1; > } > > + if (td->o.fdp) > + fdp_fill_dspec_data(td, io_u); > + > if (io_u->offset + io_u->buflen > io_u->file->real_file_size) { > dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n", > io_u, > diff --git a/io_u.h b/io_u.h > index 206e24fe..13b26d37 100644 > --- a/io_u.h > +++ b/io_u.h > @@ -117,6 +117,9 @@ struct io_u { > */ > int (*end_io)(struct thread_data *, struct io_u **); > > + uint32_t dtype; > + uint32_t dspec; > + > union { > #ifdef CONFIG_LIBAIO > struct iocb iocb; > diff --git a/ioengines.h b/ioengines.h > index 2cb9743e..4a9284c0 100644 > --- a/ioengines.h > +++ b/ioengines.h > @@ -7,6 +7,7 @@ > #include "flist.h" > #include "io_u.h" > #include "zbd_types.h" > +#include "fdp.h" > > #define FIO_IOOPS_VERSION 31 > > @@ -63,6 +64,9 @@ struct ioengine_ops { > unsigned int *); > int (*finish_zone)(struct thread_data *, struct fio_file *, > uint64_t, uint64_t); > + int (*fdp_support)(struct thread_data *, struct fio_file *, bool *); > + int (*fetch_ruhs)(struct thread_data *, struct fio_file *, > + struct fio_ruhs_info *); > int option_struct_size; > struct fio_option *options; > }; > diff --git a/options.c b/options.c > index 49612345..3e6dc3c6 100644 > --- a/options.c > +++ b/options.c > @@ -251,6 +251,34 @@ int str_split_parse(struct thread_data *td, char *str, > return ret; > } > > 
+static int fio_fdp_cmp(const void *p1, const void *p2) > +{ > + const uint16_t *t1 = p1; > + const uint16_t *t2 = p2; > + > + return *t1 - *t2; > +} > + > +static int str_fdp_pli_cb(void *data, const char *input) > +{ > + struct thread_data *td = cb_data_to_td(data); > + char *str, *p, *v; > + int i = 0; > + > + p = str = strdup(input); > + strip_blank_front(&str); > + strip_blank_end(str); > + > + while ((v = strsep(&str, ",")) != NULL && i < FIO_MAX_PLIS) > + td->o.plis[i++] = strtoll(v, NULL, 0); > + free(p); > + > + td->o.nrpli = i; > + qsort(td->o.plis, td->o.nrpli, sizeof(*td->o.plis), fio_fdp_cmp); > + > + return 0; > +} > + > static int str_bssplit_cb(void *data, const char *input) > { > struct thread_data *td = cb_data_to_td(data); > @@ -3649,6 +3677,27 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { > .category = FIO_OPT_C_IO, > .group = FIO_OPT_G_ZONE, > }, > + { > + .name = "fdp", > + .lname = "Flexible data placement", > + .type = FIO_OPT_BOOL, > + .off1 = offsetof(struct thread_options, fdp), > + .help = "Use Data placement directive (FDP)", > + .def = "0", > + .category = FIO_OPT_C_IO, > + .group = FIO_OPT_G_INVALID, > + }, > + { > + .name = "fdp_pli", > + .lname = "FDP Placement ID indicies", > + .type = FIO_OPT_STR, > + .cb = str_fdp_pli_cb, > + .off1 = offsetof(struct thread_options, plis), > + .help = "Sets which placement ids to use (defaults to all)", > + .hide = 1, > + .category = FIO_OPT_C_IO, > + .group = FIO_OPT_G_INVALID, > + }, > { > .name = "lockmem", > .lname = "Lock memory", > diff --git a/thread_options.h b/thread_options.h > index 74e7ea45..34eb4d3f 100644 > --- a/thread_options.h > +++ b/thread_options.h > @@ -386,6 +386,12 @@ struct thread_options { > fio_fp64_t zrt; > fio_fp64_t zrf; > > + unsigned int fdp; > + > +#define FIO_MAX_PLIS 16 > + unsigned int plis[FIO_MAX_PLIS]; > + unsigned int nrpli; > + > unsigned int log_entries; > unsigned int log_prio; > }; > @@ -698,6 +704,8 @@ struct thread_options_pack { > uint32_t 
log_entries; > uint32_t log_prio; > > + uint32_t fdp; > + > /* > * verify_pattern followed by buffer_pattern from the unpacked struct > */ -- Damien Le Moal Western Digital Research