hi, This issue was found when I tested IORING_FEAT_FAST_POLL feature, with the newest upstream codes, indeed I find that io_uring's performace improvement is not obvious compared to epoll in my test environment, most of the time they are similar. Test cases basically comes from: https://github.com/frevib/io_uring-echo-server/blob/io-uring-feat-fast-poll/benchmarks/benchmarks.md. In above url, the author's test results shows that io_uring will get a big performace improvement compared to epoll. I'm still looking into why I don't get the big improvement, currently don't know why, but I find some obvious regression issue. I wrote a simple tool based io_uring nop operation to evaluate io_uring framework in v5.1 and 5.7.0-rc4+(jens's io_uring-5.7 branch), I see a obvious performace regression: v5.1 kernel: $sudo taskset -c 60 ./io_uring_nop_stress -r 300 # run 300 seconds total ios: 1832524960 IOPS: 6108416 5.7.0-rc4+ $sudo taskset -c 60 ./io_uring_nop_stress -r 300 total ios: 1597672304 IOPS: 5325574 it's about 12% performance regression. Using perf can see many performance bottlenecks, for example, io_submit_sqes is one. For now, I did't make many analysis yet, just have a look at io_submit_sqes(), there are many assignment operations in io_init_req(), but I'm not sure whether they are all needed when req is not needed to be punt to io-wq, for example, INIT_IO_WORK(&req->work, io_wq_submit_work); # a whole struct assignment from perf annotate tool, it's an expensive operation, I think reqs that use fast poll feature use task-work function, so the INIT_IO_WORK maybe not necessary. Above is just one issue, what I worry is that whether io_uring is becoming more bloated gradually, and will not that better to aio. In https://kernel.dk/io_uring.pdf, it says that io_uring will eliminate 104 bytes copy compared to aio, but see currenct io_init_req(), io_uring maybe copy more, introducing more overhead? Or does we need to carefully re-design struct io_kiocb, to reduce overhead as soon as possible. Regards, Xiaoguang Wang
/* * Description: run various io_uring nop tests * xiaoguang.wang@xxxxxxxxxxxxxxxxx * */ #include <errno.h> #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <string.h> #include <fcntl.h> #include "liburing.h" static char *myprog; static int batch_count = 16; static int force_async = 0; static int runtime = 30; static unsigned long long ios; static volatile int stop = 0; static int test_nop(struct io_uring *ring) { struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; int i, ret; for (i = 0; i < batch_count; i++) { sqe = io_uring_get_sqe(ring); if (!sqe) { fprintf(stderr, "get sqe failed\n"); goto err; } io_uring_prep_nop(sqe); if (force_async) sqe->flags |= IOSQE_ASYNC; } ret = io_uring_submit(ring); if (ret <= 0) { fprintf(stderr, "sqe submit failed: %d\n", ret); goto err; } for (i = 0; i < batch_count; i++) { ret = io_uring_wait_cqe(ring, &cqe); if (ret < 0) { fprintf(stderr, "wait completion %d\n", ret); goto err; } io_uring_cqe_seen(ring, cqe); } ios += batch_count; return 0; err: return 1; } static void usage(void) { printf("Usage: %s -H or\n", myprog); printf(" %s [-b batch][-a][-r runtime]\n", myprog); printf(" -b batch submission batch count, default 16\n"); printf(" -r runtime run time, default 30\n"); printf(" -a force asynchronous submission\n"); printf(" -H prints usage and exits\n"); } static void alarm_handler(int signum) { (void)signum; stop = 1; } int main(int argc, char *argv[]) { struct io_uring ring; struct sigaction sa; int ret, c; const char *opts = "b:ar:"; myprog = argv[0]; while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { case 'b': batch_count = atoi(optarg); break; case 'a': force_async = 1; break; case 'r': runtime = atoi(optarg); break; case 'H': usage(); exit(1); } } if (!batch_count) { fprintf(stderr, "batch count should be greater than 0\n"); exit(1); } if (!runtime) { printf("run time is zero, are you sure?\n"); return 0; } memset(&sa, 0, sizeof(sa)); sa.sa_handler = alarm_handler; sigemptyset(&sa.sa_mask); ret = sigaction(SIGALRM, &sa, NULL); if (ret < 0) { fprintf(stderr, "sigaction failed: %s", strerror(errno)); exit(1); } alarm(runtime); ret = io_uring_queue_init(batch_count, &ring, 0); if (ret) { fprintf(stderr, "ring setup failed: %d\n", ret); return 1; } while (!stop) { ret = test_nop(&ring); if (ret) { fprintf(stderr, "test_nop failed\n"); return ret; } } printf("total ios: %llu\n", ios); printf("IOPS: %llu\n", ios / runtime); return 0; }