By default, io_uring marks a waiting task as being in iowait if it's sleeping while waiting on events and there are pending requests. This isn't always useful, and may be confusing on non-storage setups where iowait isn't expected. It can also cause extra power usage by preventing the CPU from entering lower sleep states. Add a sysctl knob to control this, /proc/sys/kernel/io_uring_iowait. It defaults to '1', which is the current behavior, and can be set to '0' if iowait accounting and boosting aren't deemed suitable on that system. It is implemented as an int proc variable rather than a bool, in case there's a need to expand this in the future to distinguish between iowait accounting and cpufreq boosting. Bool proc entries allow values > 1 without erroring, so use an int limited to 0 and 1, and reserve larger values for when we may actually use them. In the future, enter flags may be added to control this as well. For now, a system-wide knob is enough. Signed-off-by: Jens Axboe <axboe@xxxxxxxxx> --- Changes since v2: - Make it a sysctl knob instead, leaving per-ring enter flags as a possible future addition.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 58003fa6b327..2866ab55a739 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -156,9 +156,10 @@ static struct workqueue_struct *iou_wq __ro_after_init; static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; +static int __read_mostly sysctl_io_uring_iowait = 1; #ifdef CONFIG_SYSCTL -static const struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_sysctl_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, @@ -175,6 +176,15 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "io_uring_iowait", + .data = &sysctl_io_uring_iowait, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; #endif @@ -2496,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ - if (current_pending_io()) + if (sysctl_io_uring_iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); @@ -3959,7 +3969,7 @@ static int __init io_uring_init(void) BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL - register_sysctl_init("kernel", kernel_io_uring_disabled_table); + register_sysctl_init("kernel", kernel_io_uring_sysctl_table); #endif return 0; -- Jens Axboe