Re: [PATCH v5] Subject: io_uring: releasing CPU resources when polling

Pavel Begunkov <asml.silence@xxxxxxxxx> · Wed, 19 Jun 2024 16:22:17 +0100

On 6/19/24 08:18, hexue wrote:
io_uring use polling mode could improve the IO performence, but it will
spend 100% of CPU resources to do polling.

This set a signal "IORING_SETUP_HY_POLL" to application, aim to provide
a interface for user to enable a new hybrid polling at io_uring level.

A new hybrid poll is implemented on the io_uring layer. Once IO issued,
it will not polling immediately, but block first and re-run before IO
complete, then poll to reap IO. This poll function could keep polling
high performance and free up some CPU resources.

we considered about complex situations, such as multi-concurrency,
different processing speed of multi-disk, etc.

Test results:
set 8 poll queues, fio-3.35, Gen5 SSD, 8 CPU VM

per CPU utilization:
     read(128k, QD64, 1Job)     53%   write(128k, QD64, 1Job)     45%
     randread(4k, QD64, 16Job)  70%   randwrite(4k, QD64, 16Job)  16%
performance reduction:
     read  0.92%     write  0.92%    randread  1.61%    randwrite  0%

What are numbers for normal / non-IOPOLL runs?



--

changes since v4:
- Rewrote the commit
- Update the test results
- Reorganized the code basd on 6.11

changes since v3:
- Simplified the commit
- Add some comments on code

changes since v2:
- Modified some formatting errors
- Move judgement to poll path

changes since v1:
- Extend hybrid poll to async polled io

Signed-off-by: hexue <xue01.he@xxxxxxxxxxx>
---
  include/linux/io_uring_types.h |  14 ++++
  include/uapi/linux/io_uring.h  |   1 +
  io_uring/io_uring.c            |   4 +-
  io_uring/io_uring.h            |   3 +
  io_uring/rw.c                  | 115 ++++++++++++++++++++++++++++++++-
  5 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 91224bbcfa73..8eab99c4122e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -226,6 +226,11 @@ struct io_alloc_cache {
  	size_t			elem_size;
  };
  
+struct iopoll_info {
+	long		last_runtime;
+	long		last_irqtime;
+};

Please follow the naming convention for all types you
add: "io_...".

+
  struct io_ring_ctx {
  	/* const or read-mostly hot data */
  	struct {
@@ -428,6 +433,7 @@ struct io_ring_ctx {
  	unsigned short			n_sqe_pages;
  	struct page			**ring_pages;
  	struct page			**sqe_pages;
+	struct xarray		poll_array;
  };
  
  struct io_tw_state {
@@ -591,6 +597,12 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz)
  )
  #define cmd_to_io_kiocb(ptr)	((struct io_kiocb *) ptr)
  
+struct hy_poll_time {
+	int		poll_state;
+	struct timespec64		iopoll_start;
+	struct timespec64		iopoll_end;

why not u64 and ktime_get_ns()?

+};
+
  struct io_kiocb {
  	union {
  		/*
@@ -665,6 +677,8 @@ struct io_kiocb {
  		u64			extra1;
  		u64			extra2;
  	} big_cqe;
+    /* for hybrid iopoll */
+	struct hy_poll_time		*hy_poll;
  };
  
  struct io_overflow_cqe {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 994bf7af0efe..ef32ec319d1f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -199,6 +199,7 @@ enum io_uring_sqe_flags_bit {
   * Removes indirection through the SQ index array.
   */
  #define IORING_SETUP_NO_SQARRAY		(1U << 16)
+#define IORING_SETUP_HY_POLL	(1U << 17)
  
  enum io_uring_op {
  	IORING_OP_NOP,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 816e93e7f949..a1015ce6dde7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -299,6 +299,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
  		goto err;
  
  	ctx->flags = p->flags;
+	xa_init(&ctx->poll_array);
  	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
  	init_waitqueue_head(&ctx->sqo_sq_wait);
  	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -2679,6 +2680,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
  	kfree(ctx->cancel_table.hbs);
  	kfree(ctx->cancel_table_locked.hbs);
  	xa_destroy(&ctx->io_bl_xa);
+	xa_destroy(&ctx->poll_array);
  	kfree(ctx);
  }
  
@@ -3637,7 +3639,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
  			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
  			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
  			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HY_POLL))
  		return -EINVAL;
  
  	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 624ca9076a50..665093c048ba 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -148,6 +148,9 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
  		__io_submit_flush_completions(ctx);
  }
  
+/* if sleep time less than 1us, then do not do the schedule op */
+#define MIN_SCHETIME 1000

Add prefix IO_, and name should be less ambiguous.

e.g. IO_HYBRID_POLL_MIN_SLEEP

+
  #define io_for_each_link(pos, head) \
  	for (pos = (head); pos; pos = pos->link)
  
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a2128459cb4..48b162becfa2 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -772,6 +772,46 @@ static bool need_complete_io(struct io_kiocb *req)
  		S_ISBLK(file_inode(req->file)->i_mode);
  }
  
+static void init_hybrid_poll(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	/*
+	 * In multiple concurrency, a thread may operate several files
+	 * under different file systems, the inode numbers may be
+	 * duplicated. Each device has a different IO command processing
+	 * capability, so using device number to record the running time
+	 * of device
+	 */
+	u32 index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);
+	struct hy_poll_time *hpt = kmalloc(sizeof(struct hy_poll_time), GFP_KERNEL);

Just a note, it's a performance hazard for anything running
fast enough. Per IO allocation + xarray + ktime...

+
+	/* if alloc fail, go to regular poll */
+	if (!hpt) {
+		ctx->flags &= ~IORING_SETUP_HY_POLL;

Please don't modify ctx->flags, if you want to fall back
to normal polling do it per IO.

+		return;
+	}
+	hpt->poll_state = 0;

What's 0/1? Just turn it into a bool with the right name
or add named constants for states.

+	req->hy_poll = hpt;
+
+	if (!entry) {
+		entry = kmalloc(sizeof(struct iopoll_info), GFP_KERNEL);
+		if (!entry) {
+			ctx->flags &= ~IORING_SETUP_HY_POLL;
+			return;
+		}
+		entry->last_runtime = 0;
+		entry->last_irqtime = 0;
+		xa_store(&ctx->poll_array, index, entry, GFP_KERNEL);
+	}
+
+	/*
+	 * Here we need nanosecond timestamps, some ways of reading
+	 * timestamps directly are only accurate to microseconds, so
+	 * there's no better alternative here for now
+	 */
+	ktime_get_ts64(&hpt->iopoll_start);
+}
+
  static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
  {
  	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
@@ -809,6 +849,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
  		kiocb->ki_flags |= IOCB_HIPRI;
  		kiocb->ki_complete = io_complete_rw_iopoll;
  		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HY_POLL)
+			init_hybrid_poll(ctx, req);
  	} else {
  		if (kiocb->ki_flags & IOCB_HIPRI)
  			return -EINVAL;
@@ -1106,6 +1148,75 @@ void io_rw_fail(struct io_kiocb *req)
  	io_req_set_res(req, res, req->cqe.flags);
  }
  
+static void io_delay(struct hy_poll_time *hpt, struct iopoll_info *entry)
+{
+	struct hrtimer_sleeper timer;
+	struct timespec64 tc, oldtc;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	long sleep_ti;
+
+	if (hpt->poll_state == 1)
+		return;
+
+	if (entry->last_runtime <= entry->last_irqtime)
+		return;
+
+	/*
+	 * Avoid excessive scheduling time affecting performance
+	 * by using only 25 per cent of the remaining time
+	 */
+	sleep_ti = (entry->last_runtime - entry->last_irqtime) / 4;

Let's say you've got a new entry, which sets both to 0. Then
user does sleep(8 min) and iopolls it once, which will update
last_runtime=8 min. Next time it tries to iopoll,
(8 min - 0) / 4 = 2 min. It'll try to sleep for 2 minutes,
with UNINTERRUBTIBLE (below) at that.

Extreme, but shows there can be all kinds of fun with this scheme.

+
+	/*
+	 * If the time available for sleep is too short, i.e. the
+	 * totle running time and the context switching loss time
+	 * are very close to each other, the scheduling operation
+	 * is not performed to avoid increasing latency
+	 */
+	if (sleep_ti < MIN_SCHETIME)
+		return;
+
+	ktime_get_ts64(&oldtc);
+	kt = ktime_set(0, sleep_ti);
+	hpt->poll_state = 1;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_UNINTERRUPTIBLE);

TASK_INTERRUPTIBLE, we don't want it to make userspace
unresponsive.

+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	mode = HRTIMER_MODE_ABS;

What's this for? It's a local variable.

+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+
+	ktime_get_ts64(&tc);
+	entry->last_irqtime = tc.tv_nsec - oldtc.tv_nsec - sleep_ti
Where did seconds go? And can also get negative values and all
kinds of inconsistent timings.

Why is it even called irq time? Anyone would think it's an interrupt
from the IO, but it's rather from the timer.

+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct hy_poll_time *hpt = req->hy_poll;
+	u32 index = req->file->f_inode->i_rdev;
+	struct iopoll_info *entry = xa_load(&ctx->poll_array, index);

Why is it per file/device/etc instead of globally per ring?
It'd make some sense if you caluculate an aggregate, e.g.
minimum sleep time for all of them, but otherwise not much.
Take one slow device, one fast device, while sleeping for the
slow one, you can get lots of completions from the fast one.

+	int ret;

Why it's even per inode but not per ring? Take one very
slow device,

+
+	io_delay(hpt, entry);
+	ret = req->file->f_op->iopoll(&rw->kiocb, iob, poll_flags);

IORING_OP_URING_CMD uses uring_cmd_iopoll

+
+	ktime_get_ts64(&hpt->iopoll_end);
+	entry->last_runtime = hpt->iopoll_end.tv_nsec - hpt->iopoll_start.tv_nsec;
+	return ret;
+}
+
  int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
  {
  	struct io_wq_work_node *pos, *start, *prev;
@@ -1133,7 +1244,9 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
  		if (READ_ONCE(req->iopoll_completed))
  			break;
  
-		if (req->opcode == IORING_OP_URING_CMD) {
+		if (ctx->flags & IORING_SETUP_HY_POLL) {
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		} else if (req->opcode == IORING_OP_URING_CMD) {
  			struct io_uring_cmd *ioucmd;
  
  			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);

--
Pavel Begunkov