Re: [PATCH v1 1/4] io_uring: only account cqring wait time as iowait if enabled for a ring

On 2/25/24 9:43 AM, Jens Axboe wrote:
> If you are motivated, please dig into it. If not, I guess I will take a
> look this week. 

The straightforward approach: add a nr_short_wait counter and ->in_short_wait
flag, and ensure that the idle governor factors those in. Not sure how
palatable it is; it would be nice to fold iowait under this, but that doesn't
really work with how we pass back the previous state.
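
For context, the wait being reclassified is the cqring wait a task enters
through io_uring_enter(2). A minimal liburing sketch of that path (not part
of the patch; the NOP request and queue depth are illustrative only):

/*
 * Hedged sketch (not from this patch): a task blocking for a CQE.
 * io_uring_wait_cqe() ends up in io_cqring_wait_schedule() in the
 * kernel; with this change, a waiter with pending IO is flagged via
 * ->in_short_wait rather than ->in_iowait. A NOP completes at once,
 * so a real read/write would be needed to actually observe the wait.
 */
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_submit(&ring);

	/* Blocks here until a completion is posted. */
	if (!io_uring_wait_cqe(&ring, &cqe))
		io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}

On the governor side the effect mirrors what iowait gives today: a single
short-waiter on the CPU makes performance_multiplier() return 11, so
predicted_ns is divided by 11 when clamping latency_req, biasing the menu
governor away from deep C-states.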

diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index b96e3da0fedd..39f05d6062e1 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -119,7 +119,7 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns, unsigned int nr_short_waiters)
 {
 	int bucket = 0;
 
@@ -129,7 +129,7 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
 	 * This allows us to calculate
 	 * E(duration)|iowait
 	 */
-	if (nr_iowaiters)
+	if (nr_short_waiters)
 		bucket = BUCKETS/2;
 
 	if (duration_ns < 10ULL * NSEC_PER_USEC)
@@ -152,10 +152,10 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
  * to be, the higher this multiplier, and thus the higher
  * the barrier to go to an expensive C state.
  */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
+static inline int performance_multiplier(unsigned int nr_short_waiters)
 {
 	/* for IO wait tasks (per cpu!) we add 10x each */
-	return 1 + 10 * nr_iowaiters;
+	return 1 + 10 * nr_short_waiters;
 }
 
 static DEFINE_PER_CPU(struct menu_device, menu_devices);
@@ -266,7 +266,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
 	u64 predicted_ns;
 	u64 interactivity_req;
-	unsigned int nr_iowaiters;
+	unsigned int nr_short_waiters;
 	ktime_t delta, delta_tick;
 	int i, idx;
 
@@ -275,7 +275,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		data->needs_update = 0;
 	}
 
-	nr_iowaiters = nr_iowait_cpu(dev->cpu);
+	nr_short_waiters = nr_iowait_cpu(dev->cpu) + nr_short_wait_cpu(dev->cpu);
 
 	/* Find the shortest expected idle interval. */
 	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
@@ -290,7 +290,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		}
 
 		data->next_timer_ns = delta;
-		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+		data->bucket = which_bucket(data->next_timer_ns, nr_short_waiters);
 
 		/* Round up the result for half microseconds. */
 		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -308,7 +308,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 */
 		data->next_timer_ns = KTIME_MAX;
 		delta_tick = TICK_NSEC / 2;
-		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+		data->bucket = which_bucket(KTIME_MAX, nr_short_waiters);
 	}
 
 	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -341,7 +341,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 		 * latency_req to determine the maximum exit latency.
 		 */
 		interactivity_req = div64_u64(predicted_ns,
-					      performance_multiplier(nr_iowaiters));
+					      performance_multiplier(nr_short_waiters));
 		if (latency_req > interactivity_req)
 			latency_req = interactivity_req;
 	}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..b04c08cadf1f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -923,6 +923,7 @@ struct task_struct {
 	/* Bit to tell TOMOYO we're in execve(): */
 	unsigned			in_execve:1;
 	unsigned			in_iowait:1;
+	unsigned			in_short_wait:1;
 #ifndef TIF_RESTORE_SIGMASK
 	unsigned			restore_sigmask:1;
 #endif
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 0108a38bb64d..12f5795c4c32 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -21,6 +21,7 @@ extern unsigned int nr_running(void);
 extern bool single_task_running(void);
 extern unsigned int nr_iowait(void);
 extern unsigned int nr_iowait_cpu(int cpu);
+unsigned int nr_short_wait_cpu(int cpu);
 
 static inline int sched_info_on(void)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f6332fc56bed..024af44ead12 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2519,7 +2519,7 @@ static bool current_pending_io(void)
 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 					  struct io_wait_queue *iowq)
 {
-	int io_wait, ret;
+	int short_wait, ret;
 
 	if (unlikely(READ_ONCE(ctx->check_cq)))
 		return 1;
@@ -2537,15 +2537,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 	 * can take into account that the task is waiting for IO - turns out
 	 * to be important for low QD IO.
 	 */
-	io_wait = current->in_iowait;
+	short_wait = current->in_short_wait;
 	if (current_pending_io())
-		current->in_iowait = 1;
+		current->in_short_wait = 1;
 	ret = 0;
 	if (iowq->timeout == KTIME_MAX)
 		schedule();
 	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
 		ret = -ETIME;
-	current->in_iowait = io_wait;
+	current->in_short_wait = short_wait;
 	return ret;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..dc7a08db8921 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3790,6 +3790,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
 		atomic_dec(&task_rq(p)->nr_iowait);
+	} else if (p->in_short_wait) {
+		atomic_dec(&task_rq(p)->nr_short_wait);
 	}
 
 	activate_task(rq, p, en_flags);
@@ -4356,6 +4358,8 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 			if (p->in_iowait) {
 				delayacct_blkio_end(p);
 				atomic_dec(&task_rq(p)->nr_iowait);
+			} else if (p->in_short_wait) {
+				atomic_dec(&task_rq(p)->nr_short_wait);
 			}
 
 			wake_flags |= WF_MIGRATED;
@@ -5506,6 +5510,11 @@ unsigned int nr_iowait(void)
 	return sum;
 }
 
+unsigned int nr_short_wait_cpu(int cpu)
+{
+	return atomic_read(&cpu_rq(cpu)->nr_short_wait);
+}
+
 #ifdef CONFIG_SMP
 
 /*
@@ -6683,6 +6692,8 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			if (prev->in_iowait) {
 				atomic_inc(&rq->nr_iowait);
 				delayacct_blkio_start();
+			} else if (prev->in_short_wait) {
+				atomic_inc(&rq->nr_short_wait);
 			}
 		}
 		switch_count = &prev->nvcsw;
@@ -10030,6 +10041,7 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
+		atomic_set(&rq->nr_short_wait, 0);
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..9d4fc0b9de26 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6721,7 +6721,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
 	 */
-	if (p->in_iowait)
+	if (p->in_iowait || p->in_short_wait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
 	for_each_sched_entity(se) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..0530e9e97ecb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1050,6 +1050,7 @@ struct rq {
 #endif
 
 	atomic_t		nr_iowait;
+	atomic_t		nr_short_wait;
 
 #ifdef CONFIG_SCHED_DEBUG
 	u64 last_seen_need_resched_ns;

-- 
Jens Axboe




