Re: [PATCH 03/10] engines:io_uring: enable support for separate metadata buffer

On 8/9/23 07:13, Ankit Kumar wrote:
This patch enables support for a separate metadata buffer.
As the metadata size is not known at buffer allocation time, we provide
an option, md_per_io_size, which can be used to specify the metadata
size for a single I/O.

Signed-off-by: Ankit Kumar <ankit.kumar@xxxxxxxxxxx>
---
  HOWTO.rst          |  4 ++++
  engines/io_uring.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++
  engines/nvme.c     | 16 ++++++--------
  fio.1              |  3 +++
  4 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/HOWTO.rst b/HOWTO.rst
index ac8314f3..6e0677f2 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -2487,6 +2487,10 @@ with the caveat that when used on the command line, they must come after the
          want fio to use placement identifier only at indices 0, 2 and 5 specify
          ``fdp_pli=0,2,5``.
+.. option:: md_per_io_size=int : [io_uring_cmd]
+
+	Size in bytes for separate metadata buffer per IO. Default: 0.
+
  .. option:: cpuload=int : [cpuio]
Attempt to use the specified percentage of CPU cycles. This is a mandatory
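
It might be worth adding a small example job to the documentation as
well. Something along these lines (illustrative only, assuming the NVMe
generic character device /dev/ng0n1 and a namespace formatted with
4096-byte blocks plus 8 bytes of separate metadata, so that
md_per_io_size covers the (max_bs / lba_size) * ms bytes checked in
fio_ioring_cmd_open_file):

	[global]
	ioengine=io_uring_cmd
	cmd_type=nvme
	filename=/dev/ng0n1
	bs=4096
	md_per_io_size=8

	[write-with-metadata]
	rw=write
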
diff --git a/engines/io_uring.c b/engines/io_uring.c
index 2b707927..0dfd5f17 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -59,6 +59,7 @@ struct ioring_data {
  	int ring_fd;
struct io_u **io_u_index;
+	char *md_buf;
  	int *fds;
@@ -95,6 +96,7 @@ struct ioring_options {
  	unsigned int uncached;
  	unsigned int nowait;
  	unsigned int force_async;
+	unsigned int md_per_io_size;
  	enum uring_cmd_type cmd_type;
  };
@@ -217,6 +219,16 @@ static struct fio_option options[] = {
  		.group	= FIO_OPT_G_IOURING,
  	},
  	CMDPRIO_OPTIONS(struct ioring_options, FIO_OPT_G_IOURING),
+	{
+		.name	= "md_per_io_size",
+		.lname	= "Separate Metadata Buffer Size per I/O",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct ioring_options, md_per_io_size),
+		.def	= "0",
+		.help	= "Size of separate metadata buffer per I/O (Default: 0)",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_IOURING,
+	},
  	{
  		.name	= NULL,
  	},
@@ -631,6 +643,7 @@ static void fio_ioring_cleanup(struct thread_data *td)
fio_cmdprio_cleanup(&ld->cmdprio);
  		free(ld->io_u_index);
+		free(ld->md_buf);
  		free(ld->iovecs);
  		free(ld->fds);
  		free(ld->dsm);
@@ -1016,6 +1029,7 @@ static int fio_ioring_init(struct thread_data *td)
  {
  	struct ioring_options *o = td->eo;
  	struct ioring_data *ld;
+	unsigned long long md_size;
  	int ret;
/* sqthread submission requires registered files */
@@ -1036,6 +1050,28 @@ static int fio_ioring_init(struct thread_data *td)
/* io_u index */
  	ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
+
+	/*
+	 * metadata buffer for nvme command.
+	 * We are only supporting iomem=malloc / mem=malloc as of now.
+	 */
+	if (!strcmp(td->io_ops->name, "io_uring_cmd") &&
+	    (o->cmd_type == FIO_URING_CMD_NVME) && o->md_per_io_size) {
+		md_size = (unsigned long long) o->md_per_io_size
+				* (unsigned long long) td->o.iodepth;
+		md_size += page_mask + td->o.mem_align;
+		if (td->o.mem_align && td->o.mem_align > page_size)
+			md_size += td->o.mem_align - page_size;
+		if (td->o.mem_type == MEM_MALLOC) {
+			ld->md_buf = malloc(md_size);
+			if (!ld->md_buf)
+				return 1;
+		} else {
+			log_err("fio: bad mem type: %d\n", td->o.mem_type);

It would be more user friendly if the error message said something like, "Only iomem=malloc is supported."
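
Maybe something like this (untested, just to illustrate the suggestion):

			log_err("fio: only iomem=malloc / mem=malloc is supported with a separate metadata buffer\n");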

+			return 1;
+		}
+	}
+
  	ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));
td->io_ops_data = ld;
@@ -1062,8 +1098,17 @@ static int fio_ioring_init(struct thread_data *td)
  static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
  {
  	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	char *p;
ld->io_u_index[io_u->index] = io_u;
+
+	if (!strcmp(td->io_ops->name, "io_uring_cmd")) {
+		p = PTR_ALIGN(ld->md_buf, page_mask) + td->o.mem_align;
+		p += o->md_per_io_size * io_u->index;
+		io_u->mmap_data = p;
+	}
+
  	return 0;
  }
@@ -1116,6 +1161,13 @@ static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
  				td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
  				return 1;
  			}
+			if (data->ms && !data->lba_ext && ddir != DDIR_TRIM &&
+			    (o->md_per_io_size < ((td->o.max_bs[ddir] / data->lba_size) *
+						  data->ms))) {
+				log_err("md per io size should be sufficient for metadata payload\n");

It would be more user friendly if fio's error message included the md_per_io_size needed to run the job.
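
Something along these lines (untested), reusing the value the check already computes:

				log_err("fio: md_per_io_size=%u is too small, the job needs at least %llu bytes per I/O for the metadata payload\n",
					o->md_per_io_size,
					(unsigned long long) (td->o.max_bs[ddir] / data->lba_size) * data->ms);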

+				td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
+				return 1;
+			}
                  }
  	}
  	if (!ld || !o->registerfiles)
diff --git a/engines/nvme.c b/engines/nvme.c
index 7e891eed..65725e3c 100644
--- a/engines/nvme.c
+++ b/engines/nvme.c
@@ -79,6 +79,10 @@ int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
  		cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
  		cmd->data_len = io_u->xfer_buflen;
  	}
+	if (data->lba_shift && data->ms) {
+		cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data;
+		cmd->metadata_len = (nlb + 1) * data->ms;
+	}
  	cmd->nsid = data->nsid;
  	return 0;
  }
@@ -149,21 +153,13 @@ int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, struct nvme_data *data)
  		format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4);
data->lba_size = 1 << ns.lbaf[format_idx].ds;
+	data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);
/*
-	 * Only extended LBA can be supported.
  	 * Bit 4 for flbas indicates if metadata is transferred at the end of
  	 * logical block creating an extended LBA.
  	 */
-	data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);
-	if (data->ms && !((ns.flbas >> 4) & 0x1)) {
-		log_err("%s: only extended logical block can be supported\n",
-			f->file_name);
-		err = -ENOTSUP;
-		goto out;
-	}
-
-	if (data->ms)
+	if (data->ms && ((ns.flbas >> 4) & 0x1))
  		data->lba_ext = data->lba_size + data->ms;
  	else
  		data->lba_shift = ilog2(data->lba_size);
diff --git a/fio.1 b/fio.1
index f62617e7..6b49a747 100644
--- a/fio.1
+++ b/fio.1
@@ -2247,6 +2247,9 @@ By default, the job will cycle through all available Placement IDs, so use this
  to isolate these identifiers to specific jobs. If you want fio to use placement
  identifier only at indices 0, 2 and 5 specify, you would set `fdp_pli=0,2,5`.
  .TP
+.BI (io_uring_cmd)md_per_io_size \fR=\fPint
+Size in bytes for separate metadata buffer per IO. Default: 0.
+.TP
  .BI (cpuio)cpuload \fR=\fPint
  Attempt to use the specified percentage of CPU cycles. This is a mandatory
  option when using cpuio I/O engine.



