When SQPOLL is in use, SQEs are submitted by the sqthread, which adds IO latency whenever the sqthread first has to be woken up. To eliminate that latency, submit a limited number of SQEs (at most 8 per call) directly in the original task context when io_uring_enter() is called with the new IORING_ENTER_SQ_DEPUTY flag together with IORING_ENTER_SQ_WAKEUP on a non-IOPOLL ring. Test results below: 99th percentile latency: iops\idle 10us 60us 110us 160us 210us 260us 310us 360us 410us 460us 510us with this patch: 2k 13 13 12 13 13 12 12 11 11 10.304 11.84 without this patch: 2k 15 14 15 15 15 14 15 14 14 13 11.84 fio config: ./run_fio.sh fio \ --ioengine=io_uring --sqthread_poll=1 --hipri=1 --thread=1 --bs=4k \ --direct=1 --rw=randread --time_based=1 --runtime=300 \ --group_reporting=1 --filename=/dev/nvme1n1 --sqthread_poll_cpu=30 \ --randrepeat=0 --cpus_allowed=35 --iodepth=128 --rate_iops=${1} \ --io_sq_thread_idle=${2} Signed-off-by: Hao Xu <haoxu@xxxxxxxxxxxxxxxxx> --- fs/io_uring.c | 29 +++++++++++++++++++++++------ include/uapi/linux/io_uring.h | 1 + 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1871fad48412..f0a01232671e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1252,7 +1252,12 @@ static void io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = NULL; + + if (ctx->sq_data && ctx->sq_data->thread) + tctx = ctx->sq_data->thread->io_uring; + else + tctx = req->task->io_uring; BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -9063,9 +9068,10 @@ static void io_uring_try_cancel(struct files_struct *files) xa_for_each(&tctx->xa, index, node) { struct io_ring_ctx *ctx = node->ctx; - /* sqpoll task will cancel all its requests */ - if (!ctx->sq_data) - io_uring_try_cancel_requests(ctx, current, files); + /* + * for sqpoll ctx, there may be requests in task_works etc.
+ */ + io_uring_try_cancel_requests(ctx, current, files); } } @@ -9271,7 +9277,8 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz io_run_task_work(); if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) + IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | + IORING_ENTER_SQ_DEPUTY))) return -EINVAL; f = fdget(fd); @@ -9304,8 +9311,18 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz if (unlikely(ctx->sq_data->thread == NULL)) { goto out; } - if (flags & IORING_ENTER_SQ_WAKEUP) + if (flags & IORING_ENTER_SQ_WAKEUP) { wake_up(&ctx->sq_data->wait); + if ((flags & IORING_ENTER_SQ_DEPUTY) && + !(ctx->flags & IORING_SETUP_IOPOLL)) { + ret = io_uring_add_task_file(ctx); + if (unlikely(ret)) + goto out; + mutex_lock(&ctx->uring_lock); + io_submit_sqes(ctx, min(to_submit, 8U)); + mutex_unlock(&ctx->uring_lock); + } + } if (flags & IORING_ENTER_SQ_WAIT) { ret = io_sqpoll_wait_sq(ctx); if (ret) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 311532ff6ce3..b1130fec2b7d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -251,6 +251,7 @@ struct io_cqring_offsets { #define IORING_ENTER_SQ_WAKEUP (1U << 1) #define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_EXT_ARG (1U << 3) +#define IORING_ENTER_SQ_DEPUTY (1U << 4) /* * Passed in for io_uring_setup(2). Copied back with updated info on success -- 1.8.3.1