On 3/1/20 9:18 PM, Xiaoguang Wang wrote: > When user applis programming mode, like sumbit one sqe and wait its > completion event, __io_uring_get_cqe() will result in many unnecessary > syscalls, see below test program: > > int main(int argc, char *argv[]) > { > struct io_uring ring; > int fd, ret; > struct io_uring_sqe *sqe; > struct io_uring_cqe *cqe; > struct iovec iov; > off_t offset, filesize = 0; > void *buf; > > if (argc < 2) { > printf("%s: file\n", argv[0]); > return 1; > } > > ret = io_uring_queue_init(4, &ring, 0); > if (ret < 0) { > fprintf(stderr, "queue_init: %s\n", strerror(-ret)); > return 1; > } > > fd = open(argv[1], O_RDONLY | O_DIRECT); > if (fd < 0) { > perror("open"); > return 1; > } > > if (posix_memalign(&buf, 4096, 4096)) > return 1; > iov.iov_base = buf; > iov.iov_len = 4096; > > offset = 0; > do { > sqe = io_uring_get_sqe(&ring); > if (!sqe) { > printf("here\n"); > break; > } > io_uring_prep_readv(sqe, fd, &iov, 1, offset); > > ret = io_uring_submit(&ring); > if (ret < 0) { > fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); > return 1; > } > > ret = io_uring_wait_cqe(&ring, &cqe); > if (ret < 0) { > fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret)); > return 1; > } > > if (cqe->res <= 0) { > if (cqe->res < 0) { > fprintf(stderr, "got eror: %d\n", cqe->res); > ret = 1; > } > io_uring_cqe_seen(&ring, cqe); > break; > } > offset += cqe->res; > filesize += cqe->res; > io_uring_cqe_seen(&ring, cqe); > } while (1); > > printf("filesize: %ld\n", filesize); > close(fd); > io_uring_queue_exit(&ring); > return 0; > } > > dd if=/dev/zero of=testfile bs=4096 count=16 > ./test testfile > and use bpftrace to trace io_uring_enter syscalls, in original codes, > [lege@localhost ~]$ sudo bpftrace -e "tracepoint:syscalls:sys_enter_io_uring_enter {@c[tid] = count();}" > Attaching 1 probe... > @c[11184]: 49 > Above test issues 49 syscalls, it's counterintuitive. After looking > into the codes, it's because __io_uring_get_cqe issue one more syscall, > indded when __io_uring_get_cqe issues the first syscall, one cqe should > already be ready, we don't need to wait again. > > To fix this issue, after the first syscall, set wait_nr to be zero, with > tihs patch, bpftrace shows the number of io_uring_enter syscall is 33. Thanks, that's a nice fix, we definitely don't want to be doing 50% more system calls than we have to... -- Jens Axboe