.TP
.B IORING_SETUP_SQPOLL
When this flag is specified, a kernel thread is created to perform
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 20bc570..d16364c 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL (1U << 17)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
diff --git a/test/Makefile b/test/Makefile
index dfbbcbe..ea9452c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -116,6 +116,7 @@ test_srcs := \
iopoll.c \
iopoll-leak.c \
iopoll-overflow.c \
+ iopoll-hybridpoll.c \
io_uring_enter.c \
io_uring_passthrough.c \
io_uring_register.c \
diff --git a/test/iopoll-hybridpoll.c b/test/iopoll-hybridpoll.c
new file mode 100644
index 0000000..d7c08ae
--- /dev/null
+++ b/test/iopoll-hybridpoll.c
@@ -0,0 +1,544 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: basic read/write tests with
+ * hybrid polled IO, include iopoll and io_uring
+ * passthrough.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/resource.h>
+#include "helpers.h"
+#include "liburing.h"
+#include "../src/syscall.h"
+#include "nvme.h"
+
+#define FILE_SIZE (128 * 1024)
+#define BS 4096
+#define BUFFERS (FILE_SIZE / BS)
+
+static struct iovec *vecs;
+static int no_pt, no_iopoll;
+
+static int fill_pattern(int tc)
+{
+ unsigned int val, *ptr;
+ int i, j;
+ int u_in_buf = BS / sizeof(val);
+
+ val = (tc / 2) * FILE_SIZE;
+ for (i = 0; i < BUFFERS; i++) {
+ ptr = vecs[i].iov_base;
+ for (j = 0; j < u_in_buf; j++) {
+ *ptr = val;
+ val++;
+ ptr++;
+ }
+ }
+
+ return 0;
+}
+
+static int verify_buf(int tc, void *buf, off_t off)
+{
+ int i, u_in_buf = BS / sizeof(unsigned int);
+ unsigned int *ptr;
+
+ off /= sizeof(unsigned int);
+ off += (tc / 2) * FILE_SIZE;
+ ptr = buf;
+ for (i = 0; i < u_in_buf; i++) {
+ if (off != *ptr) {
+ fprintf(stderr, "Found %u, wanted %llu\n", *ptr,
+ (unsigned long long) off);
+ return 1;
+ }
+ ptr++;
+ off++;
+ }
+
+ return 0;
+}
+
+static int __test_io_uring_passthrough_io(const char *file, struct io_uring *ring, int tc, int write,
+ int sqthread, int fixed, int nonvec)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ struct nvme_uring_cmd *cmd;
+ int open_flags;
+ int do_fixed;
+ int i, ret, fd = -1;
+ off_t offset;
+ __u64 slba;
+ __u32 nlb;
+
+ if (write)
+ open_flags = O_WRONLY;
+ else
+ open_flags = O_RDONLY;
+
+ if (fixed) {
+ ret = t_register_buffers(ring, vecs, BUFFERS);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "buffer reg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+ fd = open(file, open_flags);
+ if (fd < 0) {
+ if (errno == EACCES || errno == EPERM)
+ return T_EXIT_SKIP;
+ perror("file open");
+ goto err;
+ }
+
+ if (sqthread) {
+ ret = io_uring_register_files(ring, &fd, 1);
+ if (ret) {
+ fprintf(stderr, "file reg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+ if (write)
+ fill_pattern(tc);
+
+ offset = 0;
+ for (i = 0; i < BUFFERS; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "sqe get failed\n");
+ goto err;
+ }
+ if (write) {
+ int use_fd = fd;
+
+ do_fixed = fixed;
+
+ if (sqthread)
+ use_fd = 0;
+ if (fixed && (i & 1))
+ do_fixed = 0;
+ if (do_fixed) {
+ io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len,
+ offset, i);
+ sqe->cmd_op = NVME_URING_CMD_IO;
+ } else if (nonvec) {
+ io_uring_prep_write(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len, offset);
+ sqe->cmd_op = NVME_URING_CMD_IO;
+ } else {
+ io_uring_prep_writev(sqe, use_fd, &vecs[i], 1,
+ offset);
+ sqe->cmd_op = NVME_URING_CMD_IO_VEC;
+ }
+ } else {
+ int use_fd = fd;
+
+ do_fixed = fixed;
+
+ if (sqthread)
+ use_fd = 0;
+ if (fixed && (i & 1))
+ do_fixed = 0;
+ if (do_fixed) {
+ io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len,
+ offset, i);
+ sqe->cmd_op = NVME_URING_CMD_IO;
+ } else if (nonvec) {
+ io_uring_prep_read(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len, offset);
+ sqe->cmd_op = NVME_URING_CMD_IO;
+ } else {
+ io_uring_prep_readv(sqe, use_fd, &vecs[i], 1,
+ offset);
+ sqe->cmd_op = NVME_URING_CMD_IO_VEC;
+ }
+ }
+ sqe->opcode = IORING_OP_URING_CMD;
+ if (do_fixed)
+ sqe->uring_cmd_flags |= IORING_URING_CMD_FIXED;
+ sqe->user_data = ((uint64_t)offset << 32) | i;
+ if (sqthread)
+ sqe->flags |= IOSQE_FIXED_FILE;
+
+ cmd = (struct nvme_uring_cmd *)sqe->cmd;
+ memset(cmd, 0, sizeof(struct nvme_uring_cmd));
+
+ cmd->opcode = write ? nvme_cmd_write : nvme_cmd_read;
+
+ slba = offset >> lba_shift;
+ nlb = (BS >> lba_shift) - 1;
+
+ /* cdw10 and cdw11 represent starting lba */
+ cmd->cdw10 = slba & 0xffffffff;
+ cmd->cdw11 = slba >> 32;
+ /* cdw12 represent number of lba's for read/write */
+ cmd->cdw12 = nlb;
+ if (do_fixed || nonvec) {
+ cmd->addr = (__u64)(uintptr_t)vecs[i].iov_base;
+ cmd->data_len = vecs[i].iov_len;
+ } else {
+ cmd->addr = (__u64)(uintptr_t)&vecs[i];
+ cmd->data_len = 1;
+ }
+ cmd->nsid = nsid;
+
+ offset += BS;
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != BUFFERS) {
+ fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS);
+ goto err;
+ }
+
+ for (i = 0; i < BUFFERS; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe=%d\n", ret);
+ goto err;
+ }
+ if (cqe->res != 0) {
+ if (!no_pt) {
+ no_pt = 1;
+ goto skip;
+ }
+ fprintf(stderr, "cqe res %d, wanted 0\n", cqe->res);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ if (!write) {
+ int index = cqe->user_data & 0xffffffff;
+ void *buf = vecs[index].iov_base;
+ off_t voff = cqe->user_data >> 32;
+
+ if (verify_buf(tc, buf, voff))
+ goto err;
+ }
+ }
+
+ if (fixed) {
+ ret = io_uring_unregister_buffers(ring);
+ if (ret) {
+ fprintf(stderr, "buffer unreg failed: %d\n", ret);
+ goto err;
+ }
+ }
+ if (sqthread) {
+ ret = io_uring_unregister_files(ring);
+ if (ret) {
+ fprintf(stderr, "file unreg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+skip:
+ close(fd);
+ return 0;
+err:
+ if (fd != -1)
+ close(fd);
+ return 1;
+}
+
+static int test_io_uring_passthrough(const char *file, int tc, int write, int sqthread,
+ int fixed, int nonvec)
+{
+ struct io_uring ring;
+ int ret, ring_flags = 0;
+
+ ring_flags |= IORING_SETUP_SQE128;
+ ring_flags |= IORING_SETUP_CQE32;
+ ring_flags |= IORING_SETUP_HYBRID_IOPOLL;
+
+ if (sqthread)
+ ring_flags |= IORING_SETUP_SQPOLL;
+
+ ret = t_create_ring(64, &ring, ring_flags);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
+ if (ret == -EINVAL) {
+ no_pt = 1;
+ return T_SETUP_SKIP;
+ }
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return 1;
+ }
+
+ ret = __test_io_uring_passthrough_io(file, &ring, tc, write, sqthread, fixed, nonvec);
+ io_uring_queue_exit(&ring);
+
+ return ret;
+}
+
+static int provide_buffers(struct io_uring *ring)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int ret, i;
+
+ for (i = 0; i < BUFFERS; i++) {
+ sqe = io_uring_get_sqe(ring);
+ io_uring_prep_provide_buffers(sqe, vecs[i].iov_base,
+ vecs[i].iov_len, 1, 1, i);
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != BUFFERS) {
+ fprintf(stderr, "submit: %d\n", ret);
+ return 1;
+ }
+
+ for (i = 0; i < BUFFERS; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (cqe->res < 0) {
+ fprintf(stderr, "cqe->res=%d\n", cqe->res);
+ return 1;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ return 0;
+}
+
+static int __test_iopoll_io(const char *file, struct io_uring *ring, int write, int sqthread,
+ int fixed, int buf_select)
+{
+ struct io_uring_sqe *sqe;
+ struct io_uring_cqe *cqe;
+ int open_flags;
+ int i, fd = -1, ret;
+ off_t offset;
+
+ if (buf_select) {
+ write = 0;
+ fixed = 0;
+ }
+ if (buf_select && provide_buffers(ring))
+ return 1;
+
+ if (write)
+ open_flags = O_WRONLY;
+ else
+ open_flags = O_RDONLY;
+ open_flags |= O_DIRECT;
+
+ if (fixed) {
+ ret = t_register_buffers(ring, vecs, BUFFERS);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "buffer reg failed: %d\n", ret);
+ goto err;
+ }
+ }
+ fd = open(file, open_flags);
+ if (fd < 0) {
+ if (errno == EINVAL || errno == EPERM || errno == EACCES)
+ return 0;
+ perror("file open");
+ goto err;
+ }
+ if (sqthread) {
+ ret = io_uring_register_files(ring, &fd, 1);
+ if (ret) {
+ fprintf(stderr, "file reg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+ offset = 0;
+ for (i = 0; i < BUFFERS; i++) {
+ sqe = io_uring_get_sqe(ring);
+ if (!sqe) {
+ fprintf(stderr, "sqe get failed\n");
+ goto err;
+ }
+ offset = BS * (rand() % BUFFERS);
+ if (write) {
+ int do_fixed = fixed;
+ int use_fd = fd;
+
+ if (sqthread)
+ use_fd = 0;
+ if (fixed && (i & 1))
+ do_fixed = 0;
+ if (do_fixed) {
+ io_uring_prep_write_fixed(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len,
+ offset, i);
+ } else {
+ io_uring_prep_writev(sqe, use_fd, &vecs[i], 1,
+ offset);
+ }
+ } else {
+ int do_fixed = fixed;
+ int use_fd = fd;
+
+ if (sqthread)
+ use_fd = 0;
+ if (fixed && (i & 1))
+ do_fixed = 0;
+ if (do_fixed) {
+ io_uring_prep_read_fixed(sqe, use_fd, vecs[i].iov_base,
+ vecs[i].iov_len,
+ offset, i);
+ } else {
+ io_uring_prep_readv(sqe, use_fd, &vecs[i], 1,
+ offset);
+ }
+
+ }
+ if (sqthread)
+ sqe->flags |= IOSQE_FIXED_FILE;
+ if (buf_select) {
+ sqe->flags |= IOSQE_BUFFER_SELECT;
+ sqe->buf_group = buf_select;
+ sqe->user_data = i;
+ }
+ }
+
+ ret = io_uring_submit(ring);
+ if (ret != BUFFERS) {
+ ret = io_uring_peek_cqe(ring, &cqe);
+ if (!ret && cqe->res == -EOPNOTSUPP) {
+ no_iopoll = 1;
+ io_uring_cqe_seen(ring, cqe);
+ goto out;
+ }
+ fprintf(stderr, "submit got %d, wanted %d\n", ret, BUFFERS);
+ goto err;
+ }
+
+ for (i = 0; i < BUFFERS; i++) {
+ ret = io_uring_wait_cqe(ring, &cqe);
+ if (ret) {
+ fprintf(stderr, "wait_cqe=%d\n", ret);
+ goto err;
+ } else if (cqe->res == -EOPNOTSUPP) {
+ fprintf(stdout, "File/device/fs doesn't support polled IO\n");
+ no_iopoll = 1;
+ goto out;
+ } else if (cqe->res != BS) {
+ fprintf(stderr, "cqe res %d, wanted %d\n", cqe->res, BS);
+ goto err;
+ }
+ io_uring_cqe_seen(ring, cqe);
+ }
+
+ if (fixed) {
+ ret = io_uring_unregister_buffers(ring);
+ if (ret) {
+ fprintf(stderr, "buffer unreg failed: %d\n", ret);
+ goto err;
+ }
+ }
+ if (sqthread) {
+ ret = io_uring_unregister_files(ring);
+ if (ret) {
+ fprintf(stderr, "file unreg failed: %d\n", ret);
+ goto err;
+ }
+ }
+
+out:
+ close(fd);
+ return 0;
+err:
+ if (fd != -1)
+ close(fd);
+ return 1;
+}
+
+static int test_iopoll(const char *fname, int write, int sqthread, int fixed,
+ int buf_select, int defer)
+{
+ struct io_uring ring;
+ int ret, ring_flags = IORING_SETUP_IOPOLL | IORING_SETUP_HYBRID_IOPOLL;
+
+ if (no_iopoll)
+ return 0;
+
+ if (defer)
+ ring_flags |= IORING_SETUP_SINGLE_ISSUER |
+ IORING_SETUP_DEFER_TASKRUN;
+
+ ret = t_create_ring(64, &ring, ring_flags);
+ if (ret == T_SETUP_SKIP)
+ return 0;
+ if (ret != T_SETUP_OK) {
+ fprintf(stderr, "ring create failed: %d\n", ret);
+ return 1;
+ }
+ ret = __test_iopoll_io(fname, &ring, write, sqthread, fixed, buf_select);
+ io_uring_queue_exit(&ring);
+ return ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, ret;
+ char buf[256];
+ char *fname;
+
+ if (argc > 1) {
+ fname = argv[1];
+ } else {
+ srand((unsigned)time(NULL));
+ snprintf(buf, sizeof(buf), ".basic-rw-%u-%u",
+ (unsigned)rand(), (unsigned)getpid());
+ fname = buf;
+ t_create_file(fname, FILE_SIZE);
+ }
+
+ vecs = t_create_buffers(BUFFERS, BS);
+
+ for (i = 0; i < 16; i++) {
+ int write = (i & 1) != 0;
+ int sqthread = (i & 2) != 0;
+ int fixed = (i & 4) != 0;
+ int buf_select = (i & 8) != 0;
+ int defer = (i & 16) != 0;
+ int nonvec = buf_select;
+
+ ret = test_iopoll(fname, write, sqthread, fixed, buf_select, defer);
+ if (ret) {
+ fprintf(stderr, "test_iopoll_io failed %d/%d/%d/%d/%d\n",
+ write, sqthread, fixed, buf_select, defer);
+ goto err;
+ }
+ if (no_iopoll)
+ break;
+
+ ret = test_io_uring_passthrough(fname, i, write, sqthread, fixed, nonvec);
+ if (no_pt)
+ break;
+ if (ret) {
+ fprintf(stderr, "test_io_uring_passthrough_io failed %d/%d/%d/%d\n",
+ write, sqthread, fixed, nonvec);
+ goto err;
+ }
+ }
+
+ if (fname != argv[1])
+ unlink(fname);
+ return ret;
+err:
+ if (fname != argv[1])
+ unlink(fname);
+ return T_EXIT_FAIL;
+}