On 24/08/2020 02:49, Jiufei Xue wrote: > ping... > > On 2020/8/4 下午5:28, Jiufei Xue wrote: >> Now users who want to get woken when waiting for events should submit a >> timeout command first. It is not safe for applications that split SQ and >> CQ handling between two threads, such as mysql. Users should synchronize >> the two threads explicitly to protect SQ and that will impact the >> performance. >> >> This patch adds support for timeout to existing io_uring_enter(). To >> avoid overloading arguments, it introduces a new parameter structure >> which contains sigmask and timeout. >> >> I have tested the workloads with one thread submitting nop requests >> while the other reaping the cqe with timeout. It shows 1.8~2x faster >> when the iodepth is 16. What happened with this? I thought there were enough people wanting such a thing. >> >> Signed-off-by: Jiufei Xue <jiufei.xue@xxxxxxxxxxxxxxxxx> >> --- >> fs/io_uring.c | 45 +++++++++++++++++++++++++++++++++++++------ >> include/uapi/linux/io_uring.h | 7 +++++++ >> 2 files changed, 46 insertions(+), 6 deletions(-) >> >> diff --git a/fs/io_uring.c b/fs/io_uring.c >> index 2a3af95..cdd89e4 100644 >> --- a/fs/io_uring.c >> +++ b/fs/io_uring.c >> @@ -6514,7 +6514,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, >> * application must reap them itself, as they reside on the shared cq ring. 
>> */ >> static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, >> - const sigset_t __user *sig, size_t sigsz) >> + const sigset_t __user *sig, size_t sigsz, >> + struct __kernel_timespec __user *uts) >> { >> struct io_wait_queue iowq = { >> .wq = { >> @@ -6526,6 +6527,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, >> .to_wait = min_events, >> }; >> struct io_rings *rings = ctx->rings; >> + struct timespec64 ts; >> + signed long timeout = 0; >> int ret = 0; >> >> do { >> @@ -6548,6 +6551,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, >> return ret; >> } >> >> + if (uts) { >> + if (get_timespec64(&ts, uts)) >> + return -EFAULT; >> + timeout = timespec64_to_jiffies(&ts); >> + } >> + >> iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); >> trace_io_uring_cqring_wait(ctx, min_events); >> do { >> @@ -6569,7 +6578,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, >> } >> if (io_should_wake(&iowq, false)) >> break; >> - schedule(); >> + if (uts) { >> + if ((timeout = schedule_timeout(timeout)) == 0) { >> + ret = -ETIME; >> + break; >> + } >> + } else { >> + schedule(); >> + } >> } while (1); >> finish_wait(&ctx->wait, &iowq.wq); >> >> @@ -7993,19 +8009,36 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, >> #endif /* !CONFIG_MMU */ >> >> SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, >> - u32, min_complete, u32, flags, const sigset_t __user *, sig, >> + u32, min_complete, u32, flags, const void __user *, argp, >> size_t, sigsz) >> { >> struct io_ring_ctx *ctx; >> long ret = -EBADF; >> int submitted = 0; >> struct fd f; >> + const sigset_t __user *sig; >> + struct __kernel_timespec __user *ts; >> + struct io_uring_getevents_arg arg; >> >> io_run_task_work(); >> >> - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) >> + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | >> + IORING_ENTER_GETEVENTS_TIMEOUT)) >> return 
-EINVAL; >> >> + /* deal with IORING_ENTER_GETEVENTS_TIMEOUT */ >> + if (flags & IORING_ENTER_GETEVENTS_TIMEOUT) { >> + if (!(flags & IORING_ENTER_GETEVENTS)) >> + return -EINVAL; >> + if (copy_from_user(&arg, argp, sizeof(arg))) >> + return -EFAULT; >> + sig = arg.sigmask; >> + ts = arg.ts; >> + } else { >> + sig = (const sigset_t __user *)argp; >> + ts = NULL; >> + } >> + >> f = fdget(fd); >> if (!f.file) >> return -EBADF; >> @@ -8052,7 +8085,7 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, >> !(ctx->flags & IORING_SETUP_SQPOLL)) { >> ret = io_iopoll_check(ctx, min_complete); >> } else { >> - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); >> + ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts); >> } >> } >> >> @@ -8346,7 +8379,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, >> p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | >> IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | >> IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | >> - IORING_FEAT_POLL_32BITS; >> + IORING_FEAT_POLL_32BITS | IORING_FEAT_GETEVENTS_TIMEOUT; >> >> if (copy_to_user(params, p, sizeof(*p))) { >> ret = -EFAULT; >> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h >> index d65fde7..70764d2 100644 >> --- a/include/uapi/linux/io_uring.h >> +++ b/include/uapi/linux/io_uring.h >> @@ -224,6 +224,7 @@ struct io_cqring_offsets { >> */ >> #define IORING_ENTER_GETEVENTS (1U << 0) >> #define IORING_ENTER_SQ_WAKEUP (1U << 1) >> +#define IORING_ENTER_GETEVENTS_TIMEOUT (1U << 2) >> >> /* >> * Passed in for io_uring_setup(2). 
Copied back with updated info on success >> @@ -251,6 +252,7 @@ struct io_uring_params { >> #define IORING_FEAT_CUR_PERSONALITY (1U << 4) >> #define IORING_FEAT_FAST_POLL (1U << 5) >> #define IORING_FEAT_POLL_32BITS (1U << 6) >> +#define IORING_FEAT_GETEVENTS_TIMEOUT (1U << 7) >> >> /* >> * io_uring_register(2) opcodes and arguments >> @@ -290,4 +292,9 @@ struct io_uring_probe { >> struct io_uring_probe_op ops[0]; >> }; >> >> +struct io_uring_getevents_arg { >> + sigset_t *sigmask; >> + struct __kernel_timespec *ts; >> +}; >> + >> #endif >> -- Pavel Begunkov