On 9/20/19 3:26 PM, Jens Axboe wrote: > But sounds like we are in violent agreement. I'll post a new patch for > this soonish. How about this? You pass in number of events in sqe->off. If that amount of events happen before the timer expires, then the timer is deleted and the completion posted. The timeout cqe->res will be -ETIME if the timer expired, and 0 if it got removed due to hitting the number of events. Lightly tested, works for me. diff --git a/fs/io_uring.c b/fs/io_uring.c index 05a299e80159..88d4584f12cd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -200,6 +200,7 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -216,6 +217,7 @@ struct io_ring_ctx { struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -283,6 +285,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + unsigned count; + struct hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -294,6 +302,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -344,6 +353,8 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -400,6 +411,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } @@ -460,10 +472,40 @@ static inline void io_queue_async_work(struct io_ring_ctx *ctx, queue_work(ctx->sqo_wq[rw], &req->work); } +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + if (!list_empty(&ctx->timeout_list)) { + struct io_kiocb *tmp; + + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + if (!--req->timeout.count) + io_kill_timeout(req); + } + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { @@ -1765,6 +1807,60 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + struct timespec ts; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len != 1) + return -EINVAL; + if (copy_from_user(&ts, (void __user *) sqe->addr, sizeof(ts))) + return -EFAULT; + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. + */ + req->timeout.count = READ_ONCE(sqe->off); + if (!req->timeout.count) + req->timeout.count = 1; + + spin_lock_irq(&ctx->completion_lock); + list_add_tail(&req->list, &ctx->timeout_list); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1842,6 +1938,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2599,6 +2698,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_rings *rings = ctx->rings; + unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2617,7 +2717,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); + nr_timeouts = atomic_read(&ctx->cq_timeouts); + /* + * Return if we have enough events, or if a timeout occured since + * we started waiting. For timeouts, we always want to return to + * userspace. + */ + ret = wait_event_interruptible(ctx->wait, + io_cqring_events(rings) >= min_events || + atomic_read(&ctx->cq_timeouts) != nr_timeouts); restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -3288,6 +3396,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 96ee9d94b73e..cf3101dc6b1e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -61,6 +61,7 @@ struct io_uring_sqe { #define IORING_OP_SYNC_FILE_RANGE 8 #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 +#define IORING_OP_TIMEOUT 11 /* * sqe->fsync_flags -- Jens Axboe