It may be that certain (all?) network functions, like recv, should not
be accounted as iowait. However, I don't think the onus should be on
applications to tell the kernel about that -- the kernel should just
figure that out on its own.

Am I alone in these opinions?

Cheers,
Jeff

>
> Signed-off-by: David Wei <dw@xxxxxxxxxxx>
> ---
> v2:
>  - squash patches into one
>  - move no_iowait in struct io_wait_queue to the end
>  - always set iowq.no_iowait
>
> ---
>  include/uapi/linux/io_uring.h | 2 ++
>  io_uring/io_uring.c           | 7 ++++---
>  io_uring/io_uring.h           | 1 +
>  3 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 48c440edf674..3a94afa8665e 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -508,6 +508,7 @@ struct io_cqring_offsets {
>  #define IORING_ENTER_EXT_ARG		(1U << 3)
>  #define IORING_ENTER_REGISTERED_RING	(1U << 4)
>  #define IORING_ENTER_ABS_TIMER		(1U << 5)
> +#define IORING_ENTER_NO_IOWAIT		(1U << 6)
>
>  /*
>   * Passed in for io_uring_setup(2). Copied back with updated info on success
> @@ -543,6 +544,7 @@ struct io_uring_params {
>  #define IORING_FEAT_LINKED_FILE		(1U << 12)
>  #define IORING_FEAT_REG_REG_RING	(1U << 13)
>  #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
> +#define IORING_FEAT_IOWAIT_TOGGLE	(1U << 15)
>
>  /*
>   * io_uring_register(2) opcodes and arguments
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 20229e72b65c..5e75672525df 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -2372,7 +2372,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
>  	 * can take into account that the task is waiting for IO - turns out
>  	 * to be important for low QD IO.
>  	 */
> -	if (current_pending_io())
> +	if (!iowq->no_iowait && current_pending_io())
>  		current->in_iowait = 1;
>  	ret = 0;
>  	if (iowq->timeout == KTIME_MAX)
> @@ -2414,6 +2414,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
>  	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
>  	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
>  	iowq.timeout = KTIME_MAX;
> +	iowq.no_iowait = flags & IORING_ENTER_NO_IOWAIT;
>
>  	if (uts) {
>  		struct timespec64 ts;
> @@ -3155,7 +3156,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
>  	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
>  			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
>  			       IORING_ENTER_REGISTERED_RING |
> -			       IORING_ENTER_ABS_TIMER)))
> +			       IORING_ENTER_ABS_TIMER | IORING_ENTER_NO_IOWAIT)))
>  		return -EINVAL;
>
>  	/*
> @@ -3539,7 +3540,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
>  			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
>  			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
>  			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
> -			IORING_FEAT_RECVSEND_BUNDLE;
> +			IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_IOWAIT_TOGGLE;
>
>  	if (copy_to_user(params, p, sizeof(*p))) {
>  		ret = -EFAULT;
> diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
> index 9935819f12b7..426079a966ac 100644
> --- a/io_uring/io_uring.h
> +++ b/io_uring/io_uring.h
> @@ -46,6 +46,7 @@ struct io_wait_queue {
>  	ktime_t napi_busy_poll_dt;
>  	bool napi_prefer_busy_poll;
>  #endif
> +	bool no_iowait;
>  };
>
>  static inline bool io_should_wake(struct io_wait_queue *iowq)