Hi Jens, Could you please review this patch? Thanks, Jiufei On 2020/8/4 下午5:28, Jiufei Xue wrote: > Now users who want to get woken when waiting for events should submit a > timeout command first. It is not safe for applications that split SQ and > CQ handling between two threads, such as mysql. Users should synchronize > the two threads explicitly to protect SQ and that will impact the > performance. > > This patch adds support for timeout to existing io_uring_enter(). To > avoid overloading arguments, it introduces a new parameter structure > which contains sigmask and timeout. > > I have tested the workloads with one thread submitting nop requests > while the other reaps the cqe with timeout. It shows 1.8~2x faster > when the iodepth is 16. > > Signed-off-by: Jiufei Xue <jiufei.xue@xxxxxxxxxxxxxxxxx> > --- > fs/io_uring.c | 45 +++++++++++++++++++++++++++++++++++++------ > include/uapi/linux/io_uring.h | 7 +++++++ > 2 files changed, 46 insertions(+), 6 deletions(-) > > diff --git a/fs/io_uring.c b/fs/io_uring.c > index 2a3af95..cdd89e4 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -6514,7 +6514,8 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, > * application must reap them itself, as they reside on the shared cq ring. 
> */ > static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, > - const sigset_t __user *sig, size_t sigsz) > + const sigset_t __user *sig, size_t sigsz, > + struct __kernel_timespec __user *uts) > { > struct io_wait_queue iowq = { > .wq = { > @@ -6526,6 +6527,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, > .to_wait = min_events, > }; > struct io_rings *rings = ctx->rings; > + struct timespec64 ts; > + signed long timeout = 0; > int ret = 0; > > do { > @@ -6548,6 +6551,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, > return ret; > } > > + if (uts) { > + if (get_timespec64(&ts, uts)) > + return -EFAULT; > + timeout = timespec64_to_jiffies(&ts); > + } > + > iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); > trace_io_uring_cqring_wait(ctx, min_events); > do { > @@ -6569,7 +6578,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, > } > if (io_should_wake(&iowq, false)) > break; > - schedule(); > + if (uts) { > + if ((timeout = schedule_timeout(timeout)) == 0) { > + ret = -ETIME; > + break; > + } > + } else { > + schedule(); > + } > } while (1); > finish_wait(&ctx->wait, &iowq.wq); > > @@ -7993,19 +8009,36 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, > #endif /* !CONFIG_MMU */ > > SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, > - u32, min_complete, u32, flags, const sigset_t __user *, sig, > + u32, min_complete, u32, flags, const void __user *, argp, > size_t, sigsz) > { > struct io_ring_ctx *ctx; > long ret = -EBADF; > int submitted = 0; > struct fd f; > + const sigset_t __user *sig; > + struct __kernel_timespec __user *ts; > + struct io_uring_getevents_arg arg; > > io_run_task_work(); > > - if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) > + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | > + IORING_ENTER_GETEVENTS_TIMEOUT)) > return -EINVAL; > > + /* deal with IORING_ENTER_GETEVENTS_TIMEOUT */ > + if 
(flags & IORING_ENTER_GETEVENTS_TIMEOUT) { > + if (!(flags & IORING_ENTER_GETEVENTS)) > + return -EINVAL; > + if (copy_from_user(&arg, argp, sizeof(arg))) > + return -EFAULT; > + sig = arg.sigmask; > + ts = arg.ts; > + } else { > + sig = (const sigset_t __user *)argp; > + ts = NULL; > + } > + > f = fdget(fd); > if (!f.file) > return -EBADF; > @@ -8052,7 +8085,7 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, > !(ctx->flags & IORING_SETUP_SQPOLL)) { > ret = io_iopoll_check(ctx, min_complete); > } else { > - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); > + ret = io_cqring_wait(ctx, min_complete, sig, sigsz, ts); > } > } > > @@ -8346,7 +8379,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, > p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | > IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | > IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | > - IORING_FEAT_POLL_32BITS; > + IORING_FEAT_POLL_32BITS | IORING_FEAT_GETEVENTS_TIMEOUT; > > if (copy_to_user(params, p, sizeof(*p))) { > ret = -EFAULT; > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index d65fde7..70764d2 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -224,6 +224,7 @@ struct io_cqring_offsets { > */ > #define IORING_ENTER_GETEVENTS (1U << 0) > #define IORING_ENTER_SQ_WAKEUP (1U << 1) > +#define IORING_ENTER_GETEVENTS_TIMEOUT (1U << 2) > > /* > * Passed in for io_uring_setup(2). 
Copied back with updated info on success > @@ -251,6 +252,7 @@ struct io_uring_params { > #define IORING_FEAT_CUR_PERSONALITY (1U << 4) > #define IORING_FEAT_FAST_POLL (1U << 5) > #define IORING_FEAT_POLL_32BITS (1U << 6) > +#define IORING_FEAT_GETEVENTS_TIMEOUT (1U << 7) > > /* > * io_uring_register(2) opcodes and arguments > @@ -290,4 +292,9 @@ struct io_uring_probe { > struct io_uring_probe_op ops[0]; > }; > > +struct io_uring_getevents_arg { > + sigset_t *sigmask; > + struct __kernel_timespec *ts; > +}; > + > #endif >