[PATCH] fio: add nvme fdp support for io_uring_cmd engine

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Keith Busch <kbusch@xxxxxxxxxx>

NVMe TP4146 creates a new feature called Flexible Data Placement. This
feature allows a host to tell the device how to group write data through
the use of "Placement Identifiers" in write commands.

Add support for using placement identifiers in write commands. The user
can enable this with the new "fdp=1" parameter for fio's io_uring_cmd
ioengine. By default, the fio jobs will cycle through all the namespace's
available placement identifiers for write commands. The user can limit
which placement identifiers can be used with an additional parameter,
"fdp_pli=<list,>", which can be used to separate write-intensive jobs
from less intensive ones.

Setting up your namespace for FDP is outside the scope of 'fio', so this
assumes the namespace is already properly configured for the mode.

Based-on-a-patch-by: Ankit Kumar <ankit.kumar@xxxxxxxxxxx>
Signed-off-by: Keith Busch <kbusch@xxxxxxxxxx>
---
 Makefile                   |   2 +-
 engines/io_uring.c         |  31 ++++++++
 engines/nvme.c             |  88 +++++++++++++++++++++-
 engines/nvme.h             | 148 +++++++++++++++++++++++++++++++++++++
 examples/uring-cmd-fdp.fio |  37 ++++++++++
 file.h                     |   3 +
 filesetup.c                |   7 ++
 io_u.c                     |   3 +
 io_u.h                     |   3 +
 ioengines.h                |   4 +
 options.c                  |  49 ++++++++++++
 thread_options.h           |   8 ++
 12 files changed, 381 insertions(+), 2 deletions(-)
 create mode 100644 examples/uring-cmd-fdp.fio

diff --git a/Makefile b/Makefile
index 5f4e6562..89205ebf 100644
--- a/Makefile
+++ b/Makefile
@@ -62,7 +62,7 @@ SOURCE :=	$(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
 		workqueue.c rate-submit.c optgroup.c helper_thread.c \
-		steadystate.c zone-dist.c zbd.c dedupe.c
+		steadystate.c zone-dist.c zbd.c dedupe.c fdp.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
diff --git a/engines/io_uring.c b/engines/io_uring.c
index a9abd11d..644b279f 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -1262,6 +1262,34 @@ static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
 	return fio_nvme_get_max_open_zones(td, f, max_open_zones);
 }
 
+static int fio_ioring_cmd_fdp_support(struct thread_data *td, struct fio_file *f,
+                                  bool *support)
+{
+	return fio_nvme_is_fdp(td, f, support);	/* ioengine hook: forwards to the NVMe helper, result lands in *support */
+}
+
+static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
+				     struct fio_ruhs_info *fruhs_info)
+{
+	struct nvme_fdp_ruh_status *ruhs;
+	int bytes, ret, i;
+
+	bytes = sizeof(*ruhs) + 1024 * sizeof(struct nvme_fdp_ruh_status_desc);
+	ruhs = calloc(1, bytes);	/* zeroed header plus room for 1024 descriptors */
+	if (!ruhs)
+		return -ENOMEM;	/* negative errno, same convention as the helper's -errno */
+	ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes);
+	if (ret)
+		goto free;
+	/* NOTE(review): nruhsd is not bounded by the 1024-descriptor buffer nor by the capacity of plis[] */
+	fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd);
+	for (i = 0; i < fruhs_info->nr_ruhs; i++)
+		fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
+free:
+	free(ruhs);
+	return ret;
+}
+
 static struct ioengine_ops ioengine_uring = {
 	.name			= "io_uring",
 	.version		= FIO_IOOPS_VERSION,
@@ -1307,6 +1335,9 @@ static struct ioengine_ops ioengine_uring_cmd = {
 	.get_max_open_zones	= fio_ioring_cmd_get_max_open_zones,
 	.options		= options,
 	.option_struct_size	= sizeof(struct ioring_options),
+
+        .fdp_support		= fio_ioring_cmd_fdp_support,
+        .fetch_ruhs		= fio_ioring_cmd_fetch_ruhs,
 };
 
 static void fio_init fio_ioring_register(void)
diff --git a/engines/nvme.c b/engines/nvme.c
index 9ffc5303..af4be733 100644
--- a/engines/nvme.c
+++ b/engines/nvme.c
@@ -28,7 +28,8 @@ int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
 	cmd->cdw10 = slba & 0xffffffff;
 	cmd->cdw11 = slba >> 32;
 	/* cdw12 represent number of lba's for read/write */
-	cmd->cdw12 = nlb;
+	cmd->cdw12 = nlb | (io_u->dtype << 20);
+	cmd->cdw13 = io_u->dspec << 16;
 	if (iov) {
 		iov->iov_base = io_u->xfer_buf;
 		iov->iov_len = io_u->xfer_buflen;
@@ -345,3 +346,88 @@ out:
 	close(fd);
 	return ret;
 }
+
+static inline int nvme_dir_id(int fd, __u32 nsid, struct nvme_id_directives *data)
+{
+	__u32 data_len = sizeof(*data);	/* the whole *data structure is the response buffer */
+
+        struct nvme_passthru_cmd cmd = {
+		.opcode         = nvme_admin_directive_recv,
+		.nsid           = nsid,
+		.cdw10          = (data_len >> 2) - 1,	/* buffer length in dwords, zero-based */
+		.cdw11          = 1,	/* NOTE(review): presumably "Return Parameters" of the Identify directive - confirm against the NVMe spec */
+		.data_len       = data_len,
+		.addr           = (__u64)(uintptr_t)data,
+        };
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);	/* submitted as an admin passthrough command; ioctl() result returned as-is */
+}
+
+int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_id_directives dir;
+	struct nvme_id_ctrl id;
+	bool is_fdp = false;
+	int fd, ret = 0;
+
+	*fdp = false;	/* covers the early return below, which never reaches "out" */
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CTRL, 0, &id);
+	if (ret) {
+		log_err("%s: nvme_id_ctrl failed, err=%d\n",
+			f->file_name, ret);
+		goto out;
+	}
+
+	is_fdp = le32_to_cpu(id.ctratt) & (1 << 19);	/* first gate: bit 19 of the controller attributes */
+	if (!is_fdp)
+		goto out;	/* bit clear: *fdp = false with ret == 0, i.e. not an error */
+
+	ret = nvme_dir_id(fd, data->nsid, &dir);
+	if (ret) {
+		log_err("%s: nvme_dir_id failed, err=%d\n",
+			f->file_name, ret);
+		is_fdp = false;
+		goto out;
+	}
+
+	is_fdp = dir.enabled[0] & (1 << 2);	/* second gate: bit 2 of the first "enabled" byte of the directive data */
+out:
+	*fdp = is_fdp;	/* true only when both gates passed */
+	close(fd);
+	return ret;
+}
+
+static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
+                        __u32 data_len, void *data)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode		= nvme_cmd_io_mgmt_recv,
+		.nsid		= nsid,
+		.addr		= (__u64)(uintptr_t)data,
+		.data_len 	= data_len,
+		.cdw10		= 1,	/* NOTE(review): presumably selects the reclaim unit handle status operation - confirm against the NVMe spec */
+		.cdw11          = (data_len >> 2) - 1,	/* buffer length in dwords, zero-based */
+	};
+
+	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);	/* submitted as an I/O passthrough command; ioctl() result returned as-is */
+}
+
+int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
+			 struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	int fd, ret = 0;
+
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);	/* descriptor opened and closed within this call */
+	if (fd < 0)
+		return -errno;
+
+	ret = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);	/* ruhs is the data buffer, bytes its length */
+	close(fd);
+	return ret;	/* helper's result passed through unchanged */
+}
diff --git a/engines/nvme.h b/engines/nvme.h
index 70a89b74..574f16d8 100644
--- a/engines/nvme.h
+++ b/engines/nvme.h
@@ -50,6 +50,7 @@ struct nvme_uring_cmd {
 
 enum nvme_identify_cns {
 	NVME_IDENTIFY_CNS_NS		= 0x00,
+	NVME_IDENTIFY_CNS_CTRL		= 0x01,
 	NVME_IDENTIFY_CNS_CSI_NS	= 0x05,
 	NVME_IDENTIFY_CNS_CSI_CTRL	= 0x06,
 };
@@ -62,11 +63,13 @@ enum nvme_csi {
 
 enum nvme_admin_opcode {
 	nvme_admin_identify		= 0x06,
+	nvme_admin_directive_recv	= 0x1a,
 };
 
 enum nvme_io_opcode {
 	nvme_cmd_write			= 0x01,
 	nvme_cmd_read			= 0x02,
+	nvme_cmd_io_mgmt_recv		= 0x12,
 	nvme_zns_cmd_mgmt_send		= 0x79,
 	nvme_zns_cmd_mgmt_recv		= 0x7a,
 };
@@ -86,6 +89,126 @@ struct nvme_data {
 	__u32 lba_shift;
 };
 
+struct nvme_id_psd {
+	__le16			mp;
+	__u8			rsvd2;
+	__u8			flags;
+	__le32                  enlat;
+	__le32                  exlat;
+	__u8			rrt;
+	__u8			rrl;
+	__u8			rwt;
+	__u8			rwl;
+	__le16			idlp;
+	__u8			ips;
+	__u8			rsvd19;
+	__le16			actp;
+	__u8			apws;
+	__u8			rsvd23[9];
+};
+
+struct nvme_id_ctrl {
+	__le16			vid;
+	__le16			ssvid;
+	char			sn[20];
+	char			mn[40];
+	char			fr[8];
+	__u8			rab;
+	__u8			ieee[3];
+	__u8			cmic;
+	__u8			mdts;
+	__le16			cntlid;
+	__le32			ver;
+	__le32			rtd3r;
+	__le32			rtd3e;
+	__le32			oaes;
+	__le32			ctratt;
+	__le16			rrls;
+	__u8			rsvd102[9];
+	__u8			cntrltype;
+	__u8			fguid[16];
+	__le16			crdt1;
+	__le16			crdt2;
+	__le16			crdt3;
+	__u8			rsvd134[119];
+	__u8			nvmsr;
+	__u8			vwci;
+	__u8			mec;
+	__le16			oacs;
+	__u8			acl;
+	__u8			aerl;
+	__u8			frmw;
+	__u8			lpa;
+	__u8			elpe;
+	__u8			npss;
+	__u8			avscc;
+	__u8			apsta;
+	__le16			wctemp;
+	__le16			cctemp;
+	__le16			mtfa;
+	__le32			hmpre;
+	__le32			hmmin;
+	__u8			tnvmcap[16];
+	__u8			unvmcap[16];
+	__le32			rpmbs;
+	__le16			edstt;
+	__u8			dsto;
+	__u8			fwug;
+	__le16			kas;
+	__le16			hctma;
+	__le16			mntmt;
+	__le16			mxtmt;
+	__le32			sanicap;
+	__le32			hmminds;
+	__le16			hmmaxd;
+	__le16			nsetidmax;
+	__le16			endgidmax;
+	__u8			anatt;
+	__u8			anacap;
+	__le32			anagrpmax;
+	__le32			nanagrpid;
+	__le32			pels;
+	__le16			domainid;
+	__u8			rsvd358[10];
+	__u8			megcap[16];
+	__u8			rsvd384[128];
+	__u8			sqes;
+	__u8			cqes;
+	__le16			maxcmd;
+	__le32			nn;
+	__le16			oncs;
+	__le16			fuses;
+	__u8			fna;
+	__u8			vwc;
+	__le16			awun;
+	__le16			awupf;
+	__u8			icsvscc;
+	__u8			nwpc;
+	__le16			acwu;
+	__le16			ocfs;
+	__le32			sgls;
+	__le32			mnan;
+	__u8			maxdna[16];
+	__le32			maxcna;
+	__u8			rsvd564[204];
+	char			subnqn[256];
+	__u8			rsvd1024[768];
+
+	/* Fabrics Only */
+	__le32			ioccsz;
+	__le32			iorcsz;
+	__le16			icdoff;
+	__u8			fcatt;
+	__u8			msdbd;
+	__le16			ofcs;
+	__u8			dctype;
+	__u8			rsvd1807[241];
+
+	struct nvme_id_psd	psd[32];
+	__u8			vs[1024];
+};
+
+
 struct nvme_lbaf {
 	__le16			ms;
 	__u8			ds;
@@ -192,6 +315,31 @@ struct nvme_zone_report {
 	struct nvme_zns_desc	entries[];
 };
 
+struct nvme_id_directives {
+	__u8	supported[32];
+	__u8	enabled[32];
+	__u8	rsvd64[4032];
+};
+
+struct nvme_fdp_ruh_status_desc {	/* one entry of nvme_fdp_ruh_status.ruhss[]; fields are little-endian */
+	__le16 pid;	/* read through le16_to_cpu() by the io_uring_cmd engine */
+	__le16 ruhid;
+	__le32 earutr;
+	__le64 ruamw;
+	__u8  rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {
+        __u8  rsvd0[14];
+        __le16 nruhsd;
+        struct nvme_fdp_ruh_status_desc ruhss[];
+};
+
+int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
+			 struct nvme_fdp_ruh_status *ruhs, __u32 bytes);
+
+int fio_nvme_is_fdp(struct thread_data *td, struct fio_file *f, bool *fdp);
+
 int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
 		      __u64 *nlba);
 
diff --git a/examples/uring-cmd-fdp.fio b/examples/uring-cmd-fdp.fio
new file mode 100644
index 00000000..55d741d3
--- /dev/null
+++ b/examples/uring-cmd-fdp.fio
@@ -0,0 +1,37 @@
+# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled
+# This assumes the namespace is already configured with FDP support and has at
+# least 8 available reclaim units.
+#
+# Each job targets different ranges of LBAs with different placement
+# identifiers, and has different write intensity.
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+iodepth=32
+bs=4K
+fdp=1
+time_based=1
+runtime=1000
+
+[write-heavy]
+rw=randrw
+rwmixwrite=90
+fdp_pli=0,1,2,3
+offset=0%
+size=30%
+
+[write-mid]
+rw=randrw
+rwmixwrite=30
+fdp_pli=4,5
+offset=30%
+size=30%
+
+[write-light]
+rw=randrw
+rwmixwrite=10
+fdp_pli=6
+offset=60%
+size=30%
diff --git a/file.h b/file.h
index da1b8947..deb36e02 100644
--- a/file.h
+++ b/file.h
@@ -12,6 +12,7 @@
 
 /* Forward declarations */
 struct zoned_block_device_info;
+struct fio_ruhs_info;
 
 /*
  * The type of object we are working on
@@ -101,6 +102,8 @@ struct fio_file {
 	uint64_t file_offset;
 	uint64_t io_size;
 
+	struct fio_ruhs_info *ruhs_info;
+
 	/*
 	 * Zoned block device information. See also zonemode=zbd.
 	 */
diff --git a/filesetup.c b/filesetup.c
index cb7047c5..c1f38858 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -1417,6 +1417,12 @@ done:
 
 	td_restore_runstate(td, old_state);
 
+	if (td->o.fdp) {
+		err = fdp_init(td);
+		if (err)
+			goto err_out;
+	}
+
 	return 0;
 
 err_offset:
@@ -1627,6 +1633,7 @@ void close_and_free_files(struct thread_data *td)
 		}
 
 		zbd_close_file(f);
+		fdp_free_ruhs_info(f);
 		fio_file_free(f);
 	}
 
diff --git a/io_u.c b/io_u.c
index 8035f4b7..60be4f01 100644
--- a/io_u.c
+++ b/io_u.c
@@ -980,6 +980,9 @@ static int fill_io_u(struct thread_data *td, struct io_u *io_u)
 			return 1;
 	}
 
+	if (td->o.fdp)
+		fdp_fill_dspec_data(td, io_u);
+
 	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
 		dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n",
 			io_u,
diff --git a/io_u.h b/io_u.h
index 206e24fe..13b26d37 100644
--- a/io_u.h
+++ b/io_u.h
@@ -117,6 +117,9 @@ struct io_u {
 	 */
 	int (*end_io)(struct thread_data *, struct io_u **);
 
+	uint32_t dtype;
+	uint32_t dspec;
+
 	union {
 #ifdef CONFIG_LIBAIO
 		struct iocb iocb;
diff --git a/ioengines.h b/ioengines.h
index 2cb9743e..4a9284c0 100644
--- a/ioengines.h
+++ b/ioengines.h
@@ -7,6 +7,7 @@
 #include "flist.h"
 #include "io_u.h"
 #include "zbd_types.h"
+#include "fdp.h"
 
 #define FIO_IOOPS_VERSION	31
 
@@ -63,6 +64,9 @@ struct ioengine_ops {
 				  unsigned int *);
 	int (*finish_zone)(struct thread_data *, struct fio_file *,
 			   uint64_t, uint64_t);
+        int (*fdp_support)(struct thread_data *, struct fio_file *, bool *);
+        int (*fetch_ruhs)(struct thread_data *, struct fio_file *,
+                          struct fio_ruhs_info *);
 	int option_struct_size;
 	struct fio_option *options;
 };
diff --git a/options.c b/options.c
index 49612345..3e6dc3c6 100644
--- a/options.c
+++ b/options.c
@@ -251,6 +251,34 @@ int str_split_parse(struct thread_data *td, char *str,
 	return ret;
 }
 
+static int fio_fdp_cmp(const void *p1, const void *p2)
+{
+	const unsigned int *t1 = p1;	/* element type must match thread_options.plis[] */
+	const unsigned int *t2 = p2;
+
+	return (*t1 > *t2) - (*t1 < *t2);	/* -1/0/1 without the wraparound of an unsigned subtraction */
+}
+
+static int str_fdp_pli_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	char *str, *p, *v;
+	int i = 0;
+
+	p = str = strdup(input);	/* p keeps the allocation start; str is advanced while parsing */
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	while ((v = strsep(&str, ",")) != NULL && i < FIO_MAX_PLIS)	/* comma-separated list; entries beyond FIO_MAX_PLIS are ignored */
+		td->o.plis[i++] = strtoll(v, NULL, 0);	/* base auto-detected; the parsed value is not validated */
+	free(p);
+
+	td->o.nrpli = i;	/* number of entries actually stored */
+	qsort(td->o.plis, td->o.nrpli, sizeof(*td->o.plis), fio_fdp_cmp);	/* order the stored entries with fio_fdp_cmp */
+
+	return 0;
+}
+
 static int str_bssplit_cb(void *data, const char *input)
 {
 	struct thread_data *td = cb_data_to_td(data);
@@ -3649,6 +3677,27 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_ZONE,
 	},
+	{
+		.name   = "fdp",
+		.lname  = "Flexible data placement",
+		.type   = FIO_OPT_BOOL,
+		.off1   = offsetof(struct thread_options, fdp),
+		.help   = "Use Data placement directive (FDP)",
+		.def    = "0",
+		.category = FIO_OPT_C_IO,
+		.group  = FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "fdp_pli",
+		.lname	= "FDP Placement ID indicies",
+		.type	= FIO_OPT_STR,
+		.cb	= str_fdp_pli_cb,
+		.off1	= offsetof(struct thread_options, plis),
+		.help	= "Sets which placement ids to use (defaults to all)",
+		.hide	= 1,
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_INVALID,
+	},
 	{
 		.name	= "lockmem",
 		.lname	= "Lock memory",
diff --git a/thread_options.h b/thread_options.h
index 74e7ea45..34eb4d3f 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -386,6 +386,12 @@ struct thread_options {
 	fio_fp64_t zrt;
 	fio_fp64_t zrf;
 
+	unsigned int fdp;
+
+#define FIO_MAX_PLIS 16
+	unsigned int plis[FIO_MAX_PLIS];
+	unsigned int nrpli;
+
 	unsigned int log_entries;
 	unsigned int log_prio;
 };
@@ -698,6 +704,8 @@ struct thread_options_pack {
 	uint32_t log_entries;
 	uint32_t log_prio;
 
+	uint32_t fdp;
+
 	/*
 	 * verify_pattern followed by buffer_pattern from the unpacked struct
 	 */
-- 
2.30.2





[Index of Archives]     [Linux Kernel]     [Linux SCSI]     [Linux IDE]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]

  Powered by Linux