On Wed, Aug 24, 2022 at 03:01:09PM +0530, Anuj Gupta wrote:
This patch adds support for async-passthru in t/io_uring. User needs to specify -u1 option in the command Example commandline: t/io_uring -b512 -d128 -c32 -s32 -p0 -F1 -B0 -O0 -n1 -u1 /dev/ng0n1 Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx> --- t/io_uring.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 8 deletions(-) diff --git a/t/io_uring.c b/t/io_uring.c index a42abd46..78fedaf1 100644 --- a/t/io_uring.c +++ b/t/io_uring.c @@ -35,6 +35,7 @@ #include "../lib/rand.h" #include "../minmax.h" #include "../os/linux/io_uring.h" +#include "../engines/nvme.h" struct io_sq_ring { unsigned *head; @@ -139,6 +140,9 @@ static int random_io = 1; /* random or sequential IO */ static int register_ring = 1; /* register ring */ static int use_sync = 0; /* use preadv2 */ static int numa_placement = 0; /* set to node of device */ +static int pt = 0; /* passthrough I/O or not */ +static unsigned int nsid = 0; /* nsid field required for nvme-passthrough */ +static unsigned int lbs = 0; /* logical_block_size field required for nvme-passthrough */ static unsigned long tsc_rate; @@ -161,6 +165,54 @@ struct io_uring_map_buffers { }; #endif +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns, + enum nvme_csi csi, void *data) +{ + struct nvme_passthru_cmd cmd = { + .opcode = nvme_admin_identify, + .nsid = nsid, + .addr = (__u64)(uintptr_t)data, + .data_len = NVME_IDENTIFY_DATA_SIZE, + .cdw10 = cns, + .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT, + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, + }; + + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); +} + +static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba) +{ + struct nvme_id_ns ns; + int namespace_id; + int err; + + namespace_id = ioctl(fd, NVME_IOCTL_ID); + if (namespace_id < 0) { + fprintf(stderr, "error failed to fetch namespace-id\n"); + close(fd); + return -errno; + } + + /* + * Identify namespace to get namespace-id, namespace size in LBA's + * and LBA data size. + */ + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS, + NVME_CSI_NVM, &ns); + if (err) { + fprintf(stderr, "error failed to fetch identify namespace\n"); + close(fd); + return err; + } + + *nsid = namespace_id; + *lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds; + *nlba = ns.nsze; + + return 0; +} + static unsigned long cycles_to_nsec(unsigned long cycles) { uint64_t val; @@ -457,12 +509,13 @@ static unsigned file_depth(struct submitter *s) static void init_io(struct submitter *s, unsigned index) { - struct io_uring_sqe *sqe = &s->sqes[index]; + struct io_uring_sqe *sqe = NULL; unsigned long offset; struct file *f; long r; if (do_nop) { + sqe = &s->sqes[index];
Nit: this and previous change is not must as <see below>
sqe->opcode = IORING_OP_NOP; return; } @@ -490,6 +543,41 @@ static void init_io(struct submitter *s, unsigned index) f->cur_off = 0; } + if (pt) { + struct nvme_uring_cmd *cmd; + unsigned long long slba; + unsigned long long nlb; + + sqe = &s->sqes[index<<1];
this is enough.
+ if (register_files) { + sqe->fd = f->fixed_fd; + sqe->flags = IOSQE_FIXED_FILE; + } else { + sqe->fd = f->real_fd; + sqe->flags = 0; + } + sqe->opcode = IORING_OP_URING_CMD; + sqe->user_data = (unsigned long) f->fileno; + if (stats) + sqe->user_data |= ((unsigned long)s->clock_index << 32); + sqe->cmd_op = NVME_URING_CMD_IO; + slba = offset / lbs; + nlb = bs/lbs - 1; + cmd = (struct nvme_uring_cmd *)&sqe->cmd; + memset(cmd, 0, sizeof(struct nvme_uring_cmd)); + /* cdw10 and cdw11 represent starting slba*/ + cmd->cdw10 = slba & 0xffffffff; + cmd->cdw11 = slba >> 32; + /* cdw12 represent number of lba to be read*/ + cmd->cdw12 = nlb; + cmd->addr = (unsigned long) s->iovecs[index].iov_base; + cmd->data_len = bs; + cmd->nsid = nsid; + cmd->opcode = 2; + return; + } + + sqe = &s->sqes[index]; if (register_files) { sqe->flags = IOSQE_FIXED_FILE; sqe->fd = f->fixed_fd; @@ -549,7 +637,22 @@ static int get_file_size(struct file *f) if (fstat(f->real_fd, &st) < 0) return -1; - if (S_ISBLK(st.st_mode)) { + if (pt) { + __u64 nlba; + int ret; + + if (!S_ISCHR(st.st_mode)) { + fprintf(stderr, "passthrough works with only nvme-ns " + "generic devices (/dev/ngXnY)\n"); + return -1; + } + ret = nvme_get_info(f->real_fd, &nsid, &lbs, &nlba); + if (ret) + return -1; + f->max_blocks = nlba / bs; + f->max_size = nlba; + return 0; + } else if (S_ISBLK(st.st_mode)) { unsigned long long bytes; if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) @@ -587,11 +690,14 @@ static int reap_events_uring(struct submitter *s) f = &s->files[fileno]; f->pending_ios--; - if (cqe->res != bs) { + if (!pt && cqe->res != bs) { printf("io: unexpected ret=%d\n", cqe->res); if (polled && cqe->res == -EOPNOTSUPP) printf("Your filesystem/driver/kernel doesn't support polled IO\n"); return -1; + } else if (pt && cqe->res != 0) { + printf("io: unexpected ret=%d\n", cqe->res); + return -1;
Missing inner check (polled one) for pt case? How about something like this fragement - if (!do_nop) { int fileno = cqe->user_data & 0xffffffff; + bool error; f = &s->files[fileno]; f->pending_ios--; - if (!pt && cqe->res != bs) { + if (!pt) + error = (cqe->res != bs); + else + error = (cqe->res != 0); + if (error) { printf("io: unexpected ret=%d\n", cqe->res); if (polled && cqe->res == -EOPNOTSUPP) printf("Your filesystem/driver/kernel doesn't support polled IO\n"); return -1; [...]
@@ -1512,6 +1638,10 @@ int main(int argc, char *argv[]) printf("failed getting size of device/file\n"); return 1; } + if (pt && bs < lbs) { + printf("error: bs:%d is less than logical_block_size:%d\n", bs, lbs); + return 1; + }
would it be better to replace the check/print with "bs % lbs" instead? since io size needs to be multiple of lbs to go through.