io_uring use polling mode could improve the IO performence, but it will spend 100% of CPU resources to do polling. This set a signal "IORING_SETUP_HY_POLL" to application, aim to provide a interface for user to enable a new hybrid polling at io_uring level. A new hybrid poll is implemented on the io_uring layer. Once IO issued, it will not polling immediately, but block first and re-run before IO complete, then poll to reap IO. This poll function could keep polling high performance and free up some CPU resources. we considered about complex situations, such as multi-concurrency, different processing speed of multi-disk, etc. Test results: set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM per CPU utilization: read(128k, QD64, 1Job) 53% write(128k, QD64, 1Job) 45% randread(4k, QD64, 16Job) 70% randwrite(4k, QD64, 16Job) 16% performance reduction: read 0.92% write 0.92% randread 1.61% randwrite 0% -- changes since v4: - Rewrote the commit - Update the test results - Reorganized the code basd on 6.11 changes since v3: - Simplified the commit - Add some comments on code changes since v2: - Modified some formatting errors - Move judgement to poll path changes since v1: - Extend hybrid poll to async polled io Signed-off-by: hexue <xue01.he@xxxxxxxxxxx> --- include/linux/io_uring_types.h | 14 ++++ include/uapi/linux/io_uring.h | 1 + io_uring/io_uring.c | 4 +- io_uring/io_uring.h | 3 + io_uring/rw.c | 115 ++++++++++++++++++++++++++++++++- 5 files changed, 135 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 91224bbcfa73..8eab99c4122e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -226,6 +226,11 @@ struct io_alloc_cache { size_t elem_size; }; +struct iopoll_info { + long last_runtime; + long last_irqtime; +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -428,6 +433,7 @@ struct io_ring_ctx { unsigned short n_sqe_pages; struct page **ring_pages; struct page **sqe_pages; + struct xarray poll_array; }; struct io_tw_state { @@ -591,6 +597,12 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz) ) #define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) +struct hy_poll_time { + int poll_state; + struct timespec64 iopoll_start; + struct timespec64 iopoll_end; +}; + struct io_kiocb { union { /* @@ -665,6 +677,8 @@ struct io_kiocb { u64 extra1; u64 extra2; } big_cqe; + /* for hybrid iopoll */ + struct hy_poll_time *hy_poll; }; struct io_overflow_cqe { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 994bf7af0efe..ef32ec319d1f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -199,6 +199,7 @@ enum io_uring_sqe_flags_bit { * Removes indirection through the SQ index array. */ #define IORING_SETUP_NO_SQARRAY (1U << 16) +#define IORING_SETUP_HY_POLL (1U << 17) enum io_uring_op { IORING_OP_NOP, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 816e93e7f949..a1015ce6dde7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -299,6 +299,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + xa_init(&ctx->poll_array); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); @@ -2679,6 +2680,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); xa_destroy(&ctx->io_bl_xa); + xa_destroy(&ctx->poll_array); kfree(ctx); } @@ -3637,7 +3639,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50..665093c048ba 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -148,6 +148,9 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } +/* if sleep time less than 1us, then do not do the schedule op */ +#define MIN_SCHETIME 1000 + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a2128459cb4..48b162becfa2 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -772,6 +772,46 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } +static void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + /* + * In multiple concurrency, a thread may operate several files + * under different file systems, the inode numbers may be + * duplicated. Each device has a different IO command processing + * capability, so using device number to record the running time + * of device + */ + u32 index = req->file->f_inode->i_rdev; + struct iopoll_info *entry = xa_load(&ctx->poll_array, index); + struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL); + + /* if alloc fail, go to regular poll */ + if (!hpt) { + ctx->flags &= ~IORING_SETUP_HY_POLL; + return; + } + hpt->poll_state = 0; + req->hy_poll = hpt; + + if (!entry) { + entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL); + if (!entry) { + ctx->flags &= ~IORING_SETUP_HY_POLL; + return; + } + entry->last_runtime = 0; + entry->last_irqtime = 0; + xa_store(&ctx->poll_array, index, entry, GFP_KERNEL); + } + + /* + * Here we need nanosecond timestamps, some ways of reading + * timestamps directly are only accurate to microseconds, so + * there's no better alternative here for now + */ + ktime_get_ts64(&hpt->iopoll_start); +} + static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); @@ -809,6 +849,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HY_POLL) + init_hybrid_poll(ctx, req); } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1106,6 +1148,75 @@ void io_rw_fail(struct io_kiocb *req) io_req_set_res(req, res, req->cqe.flags); } +static void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry) +{ + struct hrtimer_sleeper timer; + struct timespec64 tc, oldtc; + enum hrtimer_mode mode; + ktime_t kt; + long sleep_ti; + + if (hpt->poll_state == 1) + return; + + if (entry->last_runtime <= entry->last_irqtime) + return; + + /* + * Avoid excessive scheduling time affecting performance + * by using only 25 per cent of the remaining time + */ + sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4; + + /* + * If the time available for sleep is too short, i.e. the + * totle running time and the context switching loss time + * are very close to each other, the scheduling operation + * is not performed to avoid increasing latency + */ + if (sleep_ti < MIN_SCHETIME) + return; + + ktime_get_ts64(&oldtc); + kt = ktime_set(0, sleep_ti); + hpt->poll_state = 1; + + mode = HRTIMER_MODE_REL; + hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&timer.timer, kt); + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_sleeper_start_expires(&timer, mode); + + if (timer.task) + io_schedule(); + + hrtimer_cancel(&timer.timer); + mode = HRTIMER_MODE_ABS; + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&timer.timer); + + ktime_get_ts64(&tc); + entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti; +} + +static int io_uring_hybrid_poll(struct io_kiocb *req, + struct io_comp_batch *iob, unsigned int poll_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_ring_ctx *ctx = req->ctx; + struct hy_poll_time *hpt = req->hy_poll; + u32 index = req->file->f_inode->i_rdev; + struct iopoll_info *entry = xa_load(&ctx->poll_array, index); + int ret; + + io_delay(hpt, entry); + ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags); + + ktime_get_ts64(&hpt->iopoll_end); + entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec; + return ret; +} + int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; @@ -1133,7 +1244,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - if (req->opcode == IORING_OP_URING_CMD) { + if (ctx->flags & IORING_SETUP_HY_POLL) { + ret = io_uring_hybrid_poll(req, &iob, poll_flags); + } else if (req->opcode == IORING_OP_URING_CMD) { struct io_uring_cmd *ioucmd; ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); -- 2.40.1