On Thu, Aug 25, 2022 at 01:54:44PM +0530, Kanchan Joshi wrote: > On Wed, Aug 24, 2022 at 03:01:09PM +0530, Anuj Gupta wrote: > > This patch adds support for async-passthru in t/io_uring. User needs to > > specify -u1 option in the command > > > > Example commandline: > > t/io_uring -b512 -d128 -c32 -s32 -p0 -F1 -B0 -O0 -n1 -u1 /dev/ng0n1 > > > > Signed-off-by: Anuj Gupta <anuj20.g@xxxxxxxxxxx> > > --- > > t/io_uring.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++--- > > 1 file changed, 138 insertions(+), 8 deletions(-) > > > > diff --git a/t/io_uring.c b/t/io_uring.c > > index a42abd46..78fedaf1 100644 > > --- a/t/io_uring.c > > +++ b/t/io_uring.c > > @@ -35,6 +35,7 @@ > > #include "../lib/rand.h" > > #include "../minmax.h" > > #include "../os/linux/io_uring.h" > > +#include "../engines/nvme.h" > > > > struct io_sq_ring { > > unsigned *head; > > @@ -139,6 +140,9 @@ static int random_io = 1; /* random or sequential IO */ > > static int register_ring = 1; /* register ring */ > > static int use_sync = 0; /* use preadv2 */ > > static int numa_placement = 0; /* set to node of device */ > > +static int pt = 0; /* passthrough I/O or not */ > > +static unsigned int nsid = 0; /* nsid field required for nvme-passthrough */ > > +static unsigned int lbs = 0; /* logical_block_size field required for nvme-passthrough */ > > > > static unsigned long tsc_rate; > > > > @@ -161,6 +165,54 @@ struct io_uring_map_buffers { > > }; > > #endif > > > > +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns, > > + enum nvme_csi csi, void *data) > > +{ > > + struct nvme_passthru_cmd cmd = { > > + .opcode = nvme_admin_identify, > > + .nsid = nsid, > > + .addr = (__u64)(uintptr_t)data, > > + .data_len = NVME_IDENTIFY_DATA_SIZE, > > + .cdw10 = cns, > > + .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT, > > + .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT, > > + }; > > + > > + return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd); > > +} > > + > > +static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba) > > +{ > > + struct nvme_id_ns ns; > > + int namespace_id; > > + int err; > > + > > + namespace_id = ioctl(fd, NVME_IOCTL_ID); > > + if (namespace_id < 0) { > > + fprintf(stderr, "error failed to fetch namespace-id\n"); > > + close(fd); > > + return -errno; > > + } > > + > > + /* > > + * Identify namespace to get namespace-id, namespace size in LBA's > > + * and LBA data size. > > + */ > > + err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS, > > + NVME_CSI_NVM, &ns); > > + if (err) { > > + fprintf(stderr, "error failed to fetch identify namespace\n"); > > + close(fd); > > + return err; > > + } > > + > > + *nsid = namespace_id; > > + *lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds; > > + *nlba = ns.nsze; > > + > > + return 0; > > +} > > + > > static unsigned long cycles_to_nsec(unsigned long cycles) > > { > > uint64_t val; > > @@ -457,12 +509,13 @@ static unsigned file_depth(struct submitter *s) > > > > static void init_io(struct submitter *s, unsigned index) > > { > > - struct io_uring_sqe *sqe = &s->sqes[index]; > > + struct io_uring_sqe *sqe = NULL; > > unsigned long offset; > > struct file *f; > > long r; > > > > if (do_nop) { > > + sqe = &s->sqes[index]; > > Nit: this and previous change is not must as <see below> > got it, will change this > > sqe->opcode = IORING_OP_NOP; > > return; > > } > > @@ -490,6 +543,41 @@ static void init_io(struct submitter *s, unsigned index) > > f->cur_off = 0; > > } > > > > + if (pt) { > > + struct nvme_uring_cmd *cmd; > > + unsigned long long slba; > > + unsigned long long nlb; > > + > > + sqe = &s->sqes[index<<1]; > > this is enough. > > > + if (register_files) { > > + sqe->fd = f->fixed_fd; > > + sqe->flags = IOSQE_FIXED_FILE; > > + } else { > > + sqe->fd = f->real_fd; > > + sqe->flags = 0; > > + } > > + sqe->opcode = IORING_OP_URING_CMD; > > + sqe->user_data = (unsigned long) f->fileno; > > + if (stats) > > + sqe->user_data |= ((unsigned long)s->clock_index << 32); > > + sqe->cmd_op = NVME_URING_CMD_IO; > > + slba = offset / lbs; > > + nlb = bs/lbs - 1; > > + cmd = (struct nvme_uring_cmd *)&sqe->cmd; > > + memset(cmd, 0, sizeof(struct nvme_uring_cmd)); > > + /* cdw10 and cdw11 represent starting slba*/ > > + cmd->cdw10 = slba & 0xffffffff; > > + cmd->cdw11 = slba >> 32; > > + /* cdw12 represent number of lba to be read*/ > > + cmd->cdw12 = nlb; > > + cmd->addr = (unsigned long) s->iovecs[index].iov_base; > > + cmd->data_len = bs; > > + cmd->nsid = nsid; > > + cmd->opcode = 2; > > + return; > > + } > > + > > + sqe = &s->sqes[index]; > > if (register_files) { > > sqe->flags = IOSQE_FIXED_FILE; > > sqe->fd = f->fixed_fd; > > @@ -549,7 +637,22 @@ static int get_file_size(struct file *f) > > > > if (fstat(f->real_fd, &st) < 0) > > return -1; > > - if (S_ISBLK(st.st_mode)) { > > + if (pt) { > > + __u64 nlba; > > + int ret; > > + > > + if (!S_ISCHR(st.st_mode)) { > > + fprintf(stderr, "passthrough works with only nvme-ns " > > + "generic devices (/dev/ngXnY)\n"); > > + return -1; > > + } > > + ret = nvme_get_info(f->real_fd, &nsid, &lbs, &nlba); > > + if (ret) > > + return -1; > > + f->max_blocks = nlba / bs; > > + f->max_size = nlba; > > + return 0; > > + } else if (S_ISBLK(st.st_mode)) { > > unsigned long long bytes; > > > > if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) > > @@ -587,11 +690,14 @@ static int reap_events_uring(struct submitter *s) > > > > f = &s->files[fileno]; > > f->pending_ios--; > > - if (cqe->res != bs) { > > + if (!pt && cqe->res != bs) { > > printf("io: unexpected ret=%d\n", cqe->res); > > if (polled && cqe->res == -EOPNOTSUPP) > > printf("Your filesystem/driver/kernel doesn't support polled IO\n"); > > return -1; > > + } else if (pt && cqe->res != 0) { > > + printf("io: unexpected ret=%d\n", cqe->res); > > + return -1; > > Missing inner check (polled one) for pt case? right, adding a check for polling makes sense. > How about something like this fragement - > > if (!do_nop) { > int fileno = cqe->user_data & 0xffffffff; > + bool error; > > f = &s->files[fileno]; > f->pending_ios--; > - if (!pt && cqe->res != bs) { > + if (!pt) > + error = (cqe->res != bs); > + else > + error = (cqe->res != 0); > + if (error) { > printf("io: unexpected ret=%d\n", cqe->res); > if (polled && cqe->res == -EOPNOTSUPP) > printf("Your filesystem/driver/kernel doesn't support polled IO\n"); > return -1; > > [...] > I think, the result is -EINVAL rather than -EOPNOTSUPP, if the user enables polling for passthru i/o. With that we can check for polled-io for passthru > > @@ -1512,6 +1638,10 @@ int main(int argc, char *argv[]) > > printf("failed getting size of device/file\n"); > > return 1; > > } > > + if (pt && bs < lbs) { > > + printf("error: bs:%d is less than logical_block_size:%d\n", bs, lbs); > > + return 1; > > + } > > would it be better to replace the check/print with "bs % lbs" instead? > since io size needs to be multiple of lbs to go through. > right, will modify the check accordingly -- Anuj Gupta