The user might not care about getting the results of certain requests, but their completions will still wake up the task (via task_work) and cause the waiting loop to terminate. IOSQE_SET_F_HINT_SILENT attempts to de-prioritise such completions. The completion will eventually be posted; however, execution of the request can, and likely will, be delayed so that it can be batched with other requests.

It's an incomplete prototype: it only works with DEFER_TASKRUN, it fails to apply the optimisation to task_works queued before the waiting loop starts, and its interaction with IOSQE_SET_F_HINT_IGNORE_INLINE is likely broken.

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/io_uring.c           | 43 +++++++++++++++++++++++------------
 io_uring/register.c           |  3 ++-
 3 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e6d10fba8ae2..6dff0ee4e20c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -901,6 +901,7 @@ struct io_uring_recvmsg_out {
 
 enum {
 	IOSQE_SET_F_HINT_IGNORE_INLINE	= 1,
+	IOSQE_SET_F_HINT_SILENT		= 2,
 };
 
 struct io_uring_ioset_reg {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6e89435c243d..2e1af10fd4f2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1270,6 +1270,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
 {
 	unsigned nr_wait, nr_tw, nr_tw_prev;
 	struct llist_node *head;
+	bool ignore = req->ioset->flags & IOSQE_SET_F_HINT_SILENT;
 
 	/* See comment above IO_CQ_WAKE_INIT */
 	BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES);
@@ -1297,13 +1298,17 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
 		nr_tw_prev = READ_ONCE(first_req->nr_tw);
 	}
 
-	/*
-	 * Theoretically, it can overflow, but that's fine as one of
-	 * previous adds should've tried to wake the task.
-	 */
-	nr_tw = nr_tw_prev + 1;
-	if (!(flags & IOU_F_TWQ_LAZY_WAKE))
-		nr_tw = IO_CQ_WAKE_FORCE;
+	nr_tw = nr_tw_prev;
+
+	if (!ignore) {
+		/*
+		 * Theoretically, it can overflow, but that's fine as
+		 * one of previous adds should've tried to wake the task.
+		 */
+		nr_tw += 1;
+		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
+			nr_tw = IO_CQ_WAKE_FORCE;
+	}
 
 	req->nr_tw = nr_tw;
 	req->io_task_work.node.next = head;
@@ -1325,6 +1330,9 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
 		io_eventfd_signal(ctx);
 	}
 
+	if (ignore)
+		return;
+
 	nr_wait = atomic_read(&ctx->cq_wait_nr);
 	/* not enough or no one is waiting */
 	if (nr_tw < nr_wait)
@@ -1405,7 +1413,7 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
 }
 
 static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
-			       int min_events)
+			       int min_events, struct io_wait_queue *waitq)
 {
 	struct llist_node *node;
 	unsigned int loops = 0;
@@ -1425,6 +1433,10 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
 		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);
+
+		if (req->ioset->flags & IOSQE_SET_F_HINT_SILENT)
+			waitq->cq_tail++;
+
 		INDIRECT_CALL_2(req->io_task_work.func,
 				io_poll_task_func, io_req_rw_complete,
 				req, ts);
@@ -1450,16 +1462,17 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
 
 	if (llist_empty(&ctx->work_llist))
 		return 0;
-	return __io_run_local_work(ctx, &ts, min_events);
+	return __io_run_local_work(ctx, &ts, min_events, NULL);
 }
 
-static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
+static int io_run_local_work(struct io_ring_ctx *ctx, int min_events,
+			     struct io_wait_queue *waitq)
 {
 	struct io_tw_state ts = {};
 	int ret;
 
 	mutex_lock(&ctx->uring_lock);
-	ret = __io_run_local_work(ctx, &ts, min_events);
+	ret = __io_run_local_work(ctx, &ts, min_events, waitq);
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
@@ -2643,7 +2656,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx)
 {
 	if (!llist_empty(&ctx->work_llist)) {
 		__set_current_state(TASK_RUNNING);
-		if (io_run_local_work(ctx, INT_MAX) > 0)
+		if (io_run_local_work(ctx, INT_MAX, NULL) > 0)
 			return 0;
 	}
 	if (io_run_task_work() > 0)
@@ -2806,7 +2819,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	if (!io_allowed_run_tw(ctx))
 		return -EEXIST;
 	if (!llist_empty(&ctx->work_llist))
-		io_run_local_work(ctx, min_events);
+		io_run_local_work(ctx, min_events, NULL);
 	io_run_task_work();
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
@@ -2877,7 +2890,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		 * now rather than let the caller do another wait loop.
		 */
 		if (!llist_empty(&ctx->work_llist))
-			io_run_local_work(ctx, nr_wait);
+			io_run_local_work(ctx, nr_wait, &iowq);
 		io_run_task_work();
 
 		/*
@@ -3389,7 +3402,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    io_allowed_defer_tw_run(ctx))
-		ret |= io_run_local_work(ctx, INT_MAX) > 0;
+		ret |= io_run_local_work(ctx, INT_MAX, NULL) > 0;
 	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
 	mutex_lock(&ctx->uring_lock);
 	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/register.c b/io_uring/register.c
index f87ec7b773bd..5462c49bebd3 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -92,7 +92,8 @@ static int io_update_ioset(struct io_ring_ctx *ctx,
 {
 	if (!(ctx->flags & IORING_SETUP_IOSET))
 		return -EINVAL;
-	if (reg->flags & ~IOSQE_SET_F_HINT_IGNORE_INLINE)
+	if (reg->flags & ~(IOSQE_SET_F_HINT_IGNORE_INLINE |
+			   IOSQE_SET_F_HINT_SILENT))
 		return -EINVAL;
 	if (reg->__resv[0] || reg->__resv[1] || reg->__resv[2])
 		return -EINVAL;
-- 
2.46.0
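
For illustration only, a rough sketch of how userspace might mark an ioset as silent once this series lands. The register opcode name (IORING_REGISTER_IOSET_UPDATE) and the full io_uring_ioset_reg layout are assumptions, not part of this patch; the diff above only shows that the struct has a flags field and three reserved fields that must be zero.

/*
 * Hypothetical example, not part of the patch: ask the kernel to
 * de-prioritise wakeups for this ioset's completions. CQEs are still
 * posted, just without eagerly waking a waiting task.
 * IORING_REGISTER_IOSET_UPDATE is a made-up opcode name standing in
 * for whatever the rest of the IOSQE_SET series defines.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int ioset_set_silent(int ring_fd)
{
	struct io_uring_ioset_reg reg;

	memset(&reg, 0, sizeof(reg));	/* __resv[] must stay zero */
	reg.flags = IOSQE_SET_F_HINT_SILENT;

	/* the ring must have been created with IORING_SETUP_IOSET */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOSET_UPDATE, &reg, 1);
}

With the flag set, completions from the set still reach the CQ, but io_req_local_work_add() no longer bumps nr_tw for them, so a waiter sleeping on cq_wait_nr is only woken by an unflagged completion (or a forced wake via IO_CQ_WAKE_FORCE).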