Re: [PATCH 2/2] t/io_uring: add support for async-passthru

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Aug 24, 2022 at 03:01:09PM +0530, Anuj Gupta wrote:
This patch adds support for async-passthru in t/io_uring. User needs to
specify -u1 option in the command

Example commandline:
t/io_uring -b512 -d128 -c32 -s32 -p0 -F1 -B0 -O0 -n1 -u1 /dev/ng0n1

Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx>
---
t/io_uring.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 138 insertions(+), 8 deletions(-)

diff --git a/t/io_uring.c b/t/io_uring.c
index a42abd46..78fedaf1 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -35,6 +35,7 @@
#include "../lib/rand.h"
#include "../minmax.h"
#include "../os/linux/io_uring.h"
+#include "../engines/nvme.h"

struct io_sq_ring {
	unsigned *head;
@@ -139,6 +140,9 @@ static int random_io = 1;	/* random or sequential IO */
static int register_ring = 1;	/* register ring */
static int use_sync = 0;	/* use preadv2 */
static int numa_placement = 0;	/* set to node of device */
+static int pt = 0;		/* passthrough I/O or not */
+static unsigned int nsid = 0;	/* nsid field required for nvme-passthrough */
+static unsigned int lbs = 0;	/* logical_block_size field required for nvme-passthrough */

static unsigned long tsc_rate;

@@ -161,6 +165,54 @@ struct io_uring_map_buffers {
};
#endif

+static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
+			 enum nvme_csi csi, void *data)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode         = nvme_admin_identify,
+		.nsid           = nsid,
+		.addr           = (__u64)(uintptr_t)data,
+		.data_len       = NVME_IDENTIFY_DATA_SIZE,
+		.cdw10          = cns,
+		.cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
+		.timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
+	};
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
+}
+
+static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba)
+{
+	struct nvme_id_ns ns;
+	int namespace_id;
+	int err;
+
+	namespace_id = ioctl(fd, NVME_IOCTL_ID);
+	if (namespace_id < 0) {
+		fprintf(stderr, "error failed to fetch namespace-id\n");
+		close(fd);
+		return -errno;
+	}
+
+	/*
+	 * Identify namespace to get namespace-id, namespace size in LBA's
+	 * and LBA data size.
+	 */
+	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
+				NVME_CSI_NVM, &ns);
+	if (err) {
+		fprintf(stderr, "error failed to fetch identify namespace\n");
+		close(fd);
+		return err;
+	}
+
+	*nsid = namespace_id;
+	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
+	*nlba = ns.nsze;
+
+	return 0;
+}
+
static unsigned long cycles_to_nsec(unsigned long cycles)
{
	uint64_t val;
@@ -457,12 +509,13 @@ static unsigned file_depth(struct submitter *s)

static void init_io(struct submitter *s, unsigned index)
{
-	struct io_uring_sqe *sqe = &s->sqes[index];
+	struct io_uring_sqe *sqe = NULL;
	unsigned long offset;
	struct file *f;
	long r;

	if (do_nop) {
+		sqe = &s->sqes[index];

Nit: this and previous change is not must as <see below>

		sqe->opcode = IORING_OP_NOP;
		return;
	}
@@ -490,6 +543,41 @@ static void init_io(struct submitter *s, unsigned index)
			f->cur_off = 0;
	}

+	if (pt) {
+		struct nvme_uring_cmd *cmd;
+		unsigned long long slba;
+		unsigned long long nlb;
+
+		sqe = &s->sqes[index<<1];

this is enough.

+		if (register_files) {
+			sqe->fd = f->fixed_fd;
+			sqe->flags = IOSQE_FIXED_FILE;
+		} else {
+			sqe->fd = f->real_fd;
+			sqe->flags = 0;
+		}
+		sqe->opcode = IORING_OP_URING_CMD;
+		sqe->user_data = (unsigned long) f->fileno;
+		if (stats)
+			sqe->user_data |= ((unsigned long)s->clock_index << 32);
+		sqe->cmd_op = NVME_URING_CMD_IO;
+		slba = offset / lbs;
+		nlb = bs/lbs - 1;
+		cmd = (struct nvme_uring_cmd *)&sqe->cmd;
+		memset(cmd, 0, sizeof(struct nvme_uring_cmd));
+		/* cdw10 and cdw11 represent starting slba*/
+		cmd->cdw10 = slba & 0xffffffff;
+		cmd->cdw11 = slba >> 32;
+		/* cdw12 represent number of lba to be read*/
+		cmd->cdw12 = nlb;
+		cmd->addr = (unsigned long) s->iovecs[index].iov_base;
+		cmd->data_len = bs;
+		cmd->nsid = nsid;
+		cmd->opcode = 2;
+		return;
+	}
+
+	sqe = &s->sqes[index];
	if (register_files) {
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->fd = f->fixed_fd;
@@ -549,7 +637,22 @@ static int get_file_size(struct file *f)

	if (fstat(f->real_fd, &st) < 0)
		return -1;
-	if (S_ISBLK(st.st_mode)) {
+	if (pt) {
+		__u64 nlba;
+		int ret;
+
+		if (!S_ISCHR(st.st_mode)) {
+			fprintf(stderr, "passthrough works with only nvme-ns "
+					"generic devices (/dev/ngXnY)\n");
+			return -1;
+		}
+		ret = nvme_get_info(f->real_fd, &nsid, &lbs, &nlba);
+		if (ret)
+			return -1;
+		f->max_blocks = nlba / bs;
+		f->max_size = nlba;
+		return 0;
+	} else if (S_ISBLK(st.st_mode)) {
		unsigned long long bytes;

		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
@@ -587,11 +690,14 @@ static int reap_events_uring(struct submitter *s)

			f = &s->files[fileno];
			f->pending_ios--;
-			if (cqe->res != bs) {
+			if (!pt && cqe->res != bs) {
				printf("io: unexpected ret=%d\n", cqe->res);
				if (polled && cqe->res == -EOPNOTSUPP)
					printf("Your filesystem/driver/kernel doesn't support polled IO\n");
				return -1;
+			} else if (pt && cqe->res != 0) {
+				printf("io: unexpected ret=%d\n", cqe->res);
+				return -1;

Missing inner check (polled one) for pt case?
How about something like this fragement -

               if (!do_nop) {
                       int fileno = cqe->user_data & 0xffffffff;
+                       bool error;

                       f = &s->files[fileno];
                       f->pending_ios--;
-                       if (!pt && cqe->res != bs) {
+                       if (!pt)
+                               error = (cqe->res != bs);
+                       else
+                               error = (cqe->res != 0);
+                       if (error) {
                               printf("io: unexpected ret=%d\n", cqe->res);
                               if (polled && cqe->res == -EOPNOTSUPP)
                                       printf("Your filesystem/driver/kernel doesn't support polled IO\n");
                               return -1;

[...]

@@ -1512,6 +1638,10 @@ int main(int argc, char *argv[])
			printf("failed getting size of device/file\n");
			return 1;
		}
+		if (pt && bs < lbs) {
+			printf("error: bs:%d is less than logical_block_size:%d\n", bs, lbs);
+			return 1;
+		}

would it be better to replace the check/print with "bs % lbs" instead?
since io size needs to be multiple of lbs to go through.






[Index of Archives]     [Linux Kernel]     [Linux SCSI]     [Linux IDE]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]

  Powered by Linux