As the normal aio wait path(read_events() -> wait_event_interruptible_hrtimeout()) doesn't account iowait time, so use this patch to make it to account iowait time, which can truely reflect the system io situation when using a tool like 'top'. The test result as below. Test environment: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 32 On-line CPU(s) list: 0-31 Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 2 NUMA node(s): 2 Vendor ID: GenuineIntel CPU family: 6 Model: 85 Model name: Intel(R) Xeon(R) Silver 4108 CPU @ 1.80GHz Stepping: 4 CPU MHz: 801.660 AIO test command: fio -ioengine=libaio -bs=8k -direct=1 -numjobs 32 -rw=read -size=10G -filename=/dev/sda3 -name="Max throughput" -iodepth=128 -runtime=60 Before test, set nr_requests to 512(default is 256), aim to to make the backend device busy to handle io request, and make io_getevents() have more chances to wait for io completion: echo 512 > /sys/block/sda/queue/nr_requests Fio test result with the AIO iowait time accounting patch showed as below, almost all fio threads are in 'D' state to waiting for io completion, and the iowait time is accounted as 48.0%: top - 19:19:23 up 29 min, 2 users, load average: 14.60, 9.45, 9.45 Tasks: 456 total, 4 running, 247 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.4 us, 1.0 sy, 0.0 ni, 50.6 id, 48.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem : 19668915+total, 19515264+free, 866476 used, 670028 buff/cache KiB Swap: 4194300 total, 4194300 free, 0 used. 19449948+avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 16135 root 20 0 294092 63724 63060 S 1.7 0.0 0:03.31 fio 16173 root 20 0 272352 3540 1792 D 1.7 0.0 0:03.85 fio 16175 root 20 0 272360 3544 1796 D 1.7 0.0 0:03.85 fio 16185 root 20 0 272400 3556 1808 D 1.7 0.0 0:03.84 fio 16187 root 20 0 272408 3552 1804 D 1.7 0.0 0:03.82 fio 16190 root 20 0 272420 3500 1804 R 1.7 0.0 0:03.88 fio 16169 root 20 0 272336 3444 1740 D 1.3 0.0 0:03.75 fio 16170 root 20 0 272340 3504 1804 R 1.3 0.0 0:03.80 fio 16172 root 20 0 272348 3500 1800 D 1.3 0.0 0:03.86 fio 16174 root 20 0 272356 3544 1796 D 1.3 0.0 0:03.77 fio 16179 root 20 0 272376 3528 1780 D 1.3 0.0 0:03.79 fio 16180 root 20 0 272380 3500 1800 D 1.3 0.0 0:03.85 fio 16181 root 20 0 272384 3552 1804 D 1.3 0.0 0:03.87 fio 16182 root 20 0 272388 3520 1772 D 1.3 0.0 0:03.80 fio 16183 root 20 0 272392 3552 1804 D 1.3 0.0 0:03.77 fio 16186 root 20 0 272404 3500 1804 D 1.3 0.0 0:03.88 fio 16188 root 20 0 272412 3500 1800 D 1.3 0.0 0:03.89 fio 16191 root 20 0 272424 3500 1800 D 1.3 0.0 0:03.92 fio 16192 root 20 0 272428 3500 1800 D 1.3 0.0 0:03.87 fio 16194 root 20 0 272436 3500 1804 D 1.3 0.0 0:03.82 fio 16195 root 20 0 272440 3500 1800 R 1.3 0.0 0:03.82 fio 16196 root 20 0 272444 3552 1804 D 1.3 0.0 0:03.84 fio 16198 root 20 0 272452 3500 1804 D 1.3 0.0 0:03.89 fio 16199 root 20 0 272456 3504 1800 D 1.3 0.0 0:03.84 fio 16200 root 20 0 272460 3552 1804 D 1.3 0.0 0:03.85 fio 16171 root 20 0 272344 3504 1800 D 1.0 0.0 0:03.84 fio 16176 root 20 0 272364 3520 1772 D 1.0 0.0 0:03.76 fio 16177 root 20 0 272368 3556 1808 D 1.0 0.0 0:03.74 fio 16178 root 20 0 272372 3500 1804 D 1.0 0.0 0:03.90 fio 16184 root 20 0 272396 3500 1800 D 1.0 0.0 0:03.83 fio 16189 root 20 0 272416 3500 1804 D 1.0 0.0 0:03.86 fio 16193 root 20 0 272432 3500 1804 D 1.0 0.0 0:03.85 fio 16197 root 20 0 272448 3556 1808 D 1.0 0.0 0:03.75 fio Fio test result without the AIO iowait time accounting patch showed as below, almost all fio threads are in 'S' state to waiting for io completion, and the iowait time is not accounted, iowait is 0.0%. top - 19:20:44 up 31 min, 2 users, load average: 12.50, 10.15, 9.72 Tasks: 458 total, 2 running, 249 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.4 us, 0.9 sy, 0.0 ni, 98.6 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem : 19668915+total, 19513945+free, 879652 used, 670040 buff/cache KiB Swap: 4194300 total, 4194300 free, 0 used. 19448636+avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 16243 root 20 0 294092 63736 63068 S 1.7 0.0 0:03.06 fio 16277 root 20 0 272336 3568 1868 S 1.7 0.0 0:03.59 fio 16287 root 20 0 272376 3564 1864 S 1.7 0.0 0:03.64 fio 16291 root 20 0 272392 3620 1868 S 1.7 0.0 0:03.63 fio 16298 root 20 0 272420 3564 1868 S 1.7 0.0 0:03.61 fio 16302 root 20 0 272436 3560 1868 S 1.7 0.0 0:03.61 fio 16303 root 20 0 272440 3552 1800 S 1.7 0.0 0:03.62 fio 16308 root 20 0 272460 3568 1864 S 1.7 0.0 0:03.60 fio 16278 root 20 0 272340 3568 1868 S 1.3 0.0 0:03.59 fio 16279 root 20 0 272344 3508 1800 S 1.3 0.0 0:03.60 fio 16280 root 20 0 272348 3564 1864 S 1.3 0.0 0:03.60 fio 16281 root 20 0 272352 3624 1872 S 1.3 0.0 0:03.57 fio 16283 root 20 0 272360 3612 1860 S 1.3 0.0 0:03.60 fio 16285 root 20 0 272368 3592 1840 S 1.3 0.0 0:03.62 fio 16286 root 20 0 272372 3580 1828 S 1.3 0.0 0:03.61 fio 16288 root 20 0 272380 3620 1868 S 1.3 0.0 0:03.55 fio 16289 root 20 0 272384 3564 1868 S 1.3 0.0 0:03.59 fio 16292 root 20 0 272396 3536 1836 S 1.3 0.0 0:03.62 fio 16293 root 20 0 272400 3624 1872 S 1.3 0.0 0:03.63 fio 16295 root 20 0 272408 3620 1868 S 1.3 0.0 0:03.61 fio 16297 root 20 0 272416 3568 1868 S 1.3 0.0 0:03.62 fio 16300 root 20 0 272428 3564 1864 R 1.3 0.0 0:03.61 fio 16304 root 20 0 272444 3564 1864 S 1.3 0.0 0:03.59 fio 16305 root 20 0 272448 3456 1760 S 1.3 0.0 0:03.65 fio 16307 root 20 0 272456 3568 1864 S 1.3 0.0 0:03.64 fio 16282 root 20 0 272356 3556 1860 S 1.0 0.0 0:03.55 fio 16284 root 20 0 272364 3612 1860 S 1.0 0.0 0:03.57 fio 16290 root 20 0 272388 3616 1864 S 1.0 0.0 0:03.54 fio 16294 root 20 0 272404 3624 1872 S 1.0 0.0 0:03.60 fio 16296 root 20 0 272412 3564 1864 S 1.0 0.0 0:03.60 fio 16299 root 20 0 272424 3540 1840 S 1.0 0.0 0:03.62 fio 16301 root 20 0 272432 3568 1868 S 1.0 0.0 0:03.63 fio 16306 root 20 0 272452 3624 1872 S 1.0 0.0 0:03.60 fio Signed-off-by: Xianting Tian <tian.xianting@xxxxxxx> --- fs/aio.c | 2 +- include/linux/wait.h | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 5736bff48..8d00548e0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1290,7 +1290,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, if (until == 0) aio_read_events(ctx, min_nr, nr, event, &ret); else - wait_event_interruptible_hrtimeout(ctx->wait, + io_wait_event_hrtimeout(ctx->wait, aio_read_events(ctx, min_nr, nr, event, &ret), until); return ret; diff --git a/include/linux/wait.h b/include/linux/wait.h index 898c890fc..fb5902a25 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -312,6 +312,13 @@ do { \ (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ io_schedule()) +#define __io_wait_event_hrtimeout(wq_head, condition, timeout) \ +({ \ + int __ret = 0; \ + __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ + TASK_UNINTERRUPTIBLE, io_schedule()); \ +}) + /* * io_wait_event() -- like wait_event() but with io_schedule() */ @@ -323,6 +330,15 @@ do { \ __io_wait_event(wq_head, condition); \ } while (0) + +#define io_wait_event_hrtimeout(wq_head, condition, timeout) \ +do { \ + might_sleep(); \ + if (condition) \ + break; \ + __io_wait_event_hrtimeout(wq_head, condition, timeout); \ +} while (0) + #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ freezable_schedule()) @@ -500,7 +516,7 @@ do { \ __ret; \ }) -#define __wait_event_hrtimeout(wq_head, condition, timeout, state) \ +#define __wait_event_hrtimeout(wq_head, condition, timeout, state, cmd) \ ({ \ int __ret = 0; \ struct hrtimer_sleeper __t; \ @@ -517,7 +533,7 @@ do { \ __ret = -ETIME; \ break; \ } \ - schedule()); \ + cmd); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ @@ -546,7 +562,8 @@ do { \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq_head, condition, timeout, \ - TASK_UNINTERRUPTIBLE); \ + TASK_UNINTERRUPTIBLE, \ + schedule()); \ __ret; \ }) @@ -572,7 +589,8 @@ do { \ might_sleep(); \ if (!(condition)) \ __ret = __wait_event_hrtimeout(wq, condition, timeout, \ - TASK_INTERRUPTIBLE); \ + TASK_INTERRUPTIBLE, \ + schedule()); \ __ret; \ }) -- 2.17.1