Dominique MARTINET wrote on Wed, Jun 29, 2022 at 09:35:44AM +0900:
> I also agree writing a simple program like the io_uring test in the
> above commit that'd sort of do it like qemu and compare contents would
> be ideal.
> I'll have a stab at this today.

Okay, after half a day failing to reproduce I had a closer look at qemu
and... it's a qemu bug.

Well, there probably are two bugs, but one should be benign:

- qemu short read handling was... rather disappointing.
The patch should appear here[1] eventually, but as the list seems to be
moderated I'm reposting it here:
-----
diff --git a/block/io_uring.c b/block/io_uring.c
index d48e472e74cb..d58aff9615ce 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -103,7 +103,7 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
                       remaining);
 
     /* Update sqe */
-    luringcb->sqeq.off = nread;
+    luringcb->sqeq.off += nread;
     luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
     luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
 
-----
(basically "just" a typo, but that must have never been tested!)
[1] https://lore.kernel.org/qemu-devel/20220629044957.1998430-1-dominique.martinet@xxxxxxxxxxxxxxxxx

- comments there also say short reads should never happen on newer
kernels (assuming local filesystems?) -- how true is that? If we're
doing our best kernel side to avoid short reads I guess we probably
ought to have a look at this.

It can easily be reproduced with a simple io_uring program -- see the
attached example, which eventually fails with the following error on btrfs:
bad read result for io 8, offset 792227840: 266240 should be 1466368

but doesn't fail on tmpfs or without O_DIRECT.
Feel free to butcher it; it's already a quickly hacked, cut-down version
of my original test, which had hash computation etc., so the flow might
feel a bit weird.
Just compile it with `gcc -o shortreads uring_shortreads.c -luring` and run
it with the file to read as its argument.

Thanks!
--
Dominique
/* Get O_DIRECT */ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <fcntl.h> #include <errno.h> #include <string.h> #include <liburing.h> #include <sys/random.h> #include <sys/stat.h> long pagesize; size_t n_blocks; #define QUEUE_SIZE 10 char *buffers[QUEUE_SIZE]; int bufsize[QUEUE_SIZE]; struct iovec iovec[QUEUE_SIZE]; long int offsets[QUEUE_SIZE]; void breakme(void) { } int submit_read(struct io_uring *ring, int fd, int i) { struct io_uring_sqe *sqe; int ret; sqe = io_uring_get_sqe(ring); if (!sqe) { fprintf(stderr, "Failed to get io_uring sqe\n"); return 1; } if (i == 0 || rand() % 2 == 0 || offsets[i-1] > n_blocks - bufsize[i]) { offsets[i] = rand() % (n_blocks - bufsize[i] + 1); } else { offsets[i] = offsets[i - 1]; } io_uring_prep_readv(sqe, fd, iovec + i, 1, offsets[i] * pagesize); io_uring_sqe_set_data(sqe, (void*)(uintptr_t)i); ret = io_uring_submit(ring); if (ret != 1) { fprintf(stderr, "submit failed\n"); return 1; } return 0; } int getsize(int fd) { struct stat sb; if (fstat(fd, &sb)) { fprintf(stderr, "fstat: %m\n"); return 1; } n_blocks = sb.st_size / pagesize; return 0; } int main(int argc, char *argv[]) { char *file, *mapfile; unsigned int seed; struct io_uring ring; struct io_uring_cqe *cqe; int fd, i; ssize_t ret; size_t total = 0; if (argc < 2 || argc > 3) { fprintf(stderr, "Use: %s <file> [<seed>]\n", argv[0]); return 1; } file = argv[1]; if (argc == 3) { seed = atol(argv[2]); } else { getrandom(&seed, sizeof(seed), 0); } printf("random seed %u\n", seed); srand(seed); pagesize = sysconf(_SC_PAGE_SIZE); if (asprintf(&mapfile, "%s.map", file) < 0) { fprintf(stderr, "asprintf map %d\n", errno); return 1; } fd = open(file, O_RDONLY | O_DIRECT); if (fd == -1) { fprintf(stderr, "Failed to open file '%s': %s (errno %d)\n", file, strerror(errno), errno); return 1; } if (getsize(fd)) return 1; for (i = 0 ; i < QUEUE_SIZE; i++) { bufsize[i] = (rand() % 1024) + 1; ret = 
posix_memalign((void**)&buffers[i], pagesize, bufsize[i] * pagesize); if (ret) { fprintf(stderr, "Failed to allocate read buffer\n"); return 1; } } printf("Starting io_uring reads...\n"); ret = io_uring_queue_init(QUEUE_SIZE, &ring, 0); if (ret != 0) { fprintf(stderr, "Failed to create io_uring queue\n"); return 1; } for (i = 0 ; i < QUEUE_SIZE; i++) { iovec[i].iov_base = buffers[i]; iovec[i].iov_len = bufsize[i] * pagesize; if (submit_read(&ring, fd, i)) return 1; } while (total++ < 10000000) { if (total % 1000 == 0) printf("%zd\n", total); ret = io_uring_wait_cqe(&ring, &cqe); if (ret < 0) { fprintf(stderr, "Failed at io_uring_wait_cqe()\n"); return 1; } i = (intptr_t)io_uring_cqe_get_data(cqe); if (cqe->res < 0) { fprintf(stderr, "bad read result for io %d, offset %zd: %d\n", i, offsets[i] * pagesize, cqe->res); breakme(); return 1; } if (cqe->res != bufsize[i] * pagesize) { fprintf(stderr, "bad read result for io %d, offset %zd: %d should be %zd\n", i, offsets[i] * pagesize, cqe->res, bufsize[i] * pagesize); breakme(); return 1; } io_uring_cqe_seen(&ring, cqe); // resubmit if (submit_read(&ring, fd, i)) return 1; } io_uring_queue_exit(&ring); return 0; }