Re: read corruption with qemu master io_uring engine / linux master / btrfs(?)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Dominique MARTINET wrote on Wed, Jun 29, 2022 at 09:35:44AM +0900:
> I also agree writing a simple program like the io_uring test in the
> above commit that'd sort of do it like qemu and compare contents would
> be ideal.
> I'll have a stab at this today.

Okay, after half a day failing to reproduce I had a closer look at qemu
and... it's a qemu bug.

Well, there probably are two bugs, but one should be benign:

 - qemu short read handling was... rather disappointing.
Patch should appear here[1] eventually, but as it seems moderated?
I'm reposting it here:
-----
diff --git a/block/io_uring.c b/block/io_uring.c
index d48e472e74cb..d58aff9615ce 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -103,7 +103,7 @@ static void luring_resubmit_short_read(LuringState *s, LuringAIOCB *luringcb,
                       remaining);
 
     /* Update sqe */
-    luringcb->sqeq.off = nread;
+    luringcb->sqeq.off += nread;
     luringcb->sqeq.addr = (__u64)(uintptr_t)luringcb->resubmit_qiov.iov;
     luringcb->sqeq.len = luringcb->resubmit_qiov.niov;
 
-----
 (basically "just" a typo, but that must have never been tested!)
[1] https://lore.kernel.org/qemu-devel/20220629044957.1998430-1-dominique.martinet@xxxxxxxxxxxxxxxxx


 - comments there also say short reads should never happen on newer
kernels (assuming local filesystems?) -- how true is that? If we're
doing our best kernel side to avoid short reads I guess we probably
ought to have a look at this.

It can easily be reproduced with a simple io_uring program -- see
example attached that eventually fails with the following error on
btrfs:
bad read result for io 8, offset 792227840: 266240 should be 1466368

but doesn't fail on tmpfs or without O_DIRECT.
feel free to butcher it, it's already a quickly hacked downversion of my
original test that had hash computation etc so the flow might feel a bit
weird.
Just compile with `gcc -o shortreads uring_shortreads.c -luring` and run
with file to read in argument.


Thanks!
-- 
Dominique
/* Get O_DIRECT */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <liburing.h>
#include <sys/random.h>
#include <sys/stat.h>

long pagesize;
size_t n_blocks;
#define QUEUE_SIZE 10
char *buffers[QUEUE_SIZE];
int bufsize[QUEUE_SIZE];
struct iovec iovec[QUEUE_SIZE];
long int offsets[QUEUE_SIZE];

void breakme(void) {
}

int submit_read(struct io_uring *ring, int fd, int i) {
	struct io_uring_sqe *sqe;
	int ret;

	sqe = io_uring_get_sqe(ring);
	if (!sqe) {
		fprintf(stderr, "Failed to get io_uring sqe\n");
		return 1;
	}
	if (i == 0 || rand() % 2 == 0 || offsets[i-1] > n_blocks - bufsize[i]) {
		offsets[i] = rand() % (n_blocks - bufsize[i] + 1);
	} else {
		offsets[i] = offsets[i - 1];
	}
	io_uring_prep_readv(sqe, fd, iovec + i, 1, offsets[i] * pagesize);
	io_uring_sqe_set_data(sqe, (void*)(uintptr_t)i);
	ret = io_uring_submit(ring);
	if (ret != 1) {
		fprintf(stderr,	"submit failed\n");
		return 1;
	}
	return 0;
}

int getsize(int fd) {
	struct stat sb;
	if (fstat(fd, &sb)) {
		fprintf(stderr, "fstat: %m\n");
		return 1;
	}
	n_blocks = sb.st_size / pagesize;
	return 0;
}

int main(int argc, char *argv[])
{
	char *file, *mapfile;
	unsigned int seed;
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	int fd, i;
	ssize_t ret;
	size_t total = 0;

	if (argc < 2 || argc > 3) {
		fprintf(stderr, "Use: %s <file> [<seed>]\n", argv[0]);
		return 1;
	}
	file = argv[1];
	if (argc == 3) {
		seed = atol(argv[2]);
	} else {
		getrandom(&seed, sizeof(seed), 0);
	}
	printf("random seed %u\n", seed);
	srand(seed);
	pagesize = sysconf(_SC_PAGE_SIZE);
	if (asprintf(&mapfile, "%s.map", file) < 0) {
		fprintf(stderr, "asprintf map %d\n", errno);
		return 1;
	}

	fd = open(file, O_RDONLY | O_DIRECT);
	if (fd == -1) {
		fprintf(stderr,
				"Failed to open file '%s': %s (errno %d)\n",
				file, strerror(errno), errno);
		return 1;
	}
	if (getsize(fd))
		return 1;

	for (i = 0 ; i < QUEUE_SIZE; i++) {
		bufsize[i] = (rand() % 1024) + 1;
		ret = posix_memalign((void**)&buffers[i], pagesize, bufsize[i] * pagesize);
		if (ret) {
			fprintf(stderr, "Failed to allocate read buffer\n");
			return 1;
		}
	}


	printf("Starting io_uring reads...\n");


	ret = io_uring_queue_init(QUEUE_SIZE, &ring, 0);
	if (ret != 0) {
		fprintf(stderr, "Failed to create io_uring queue\n");
		return 1;
	}


	for (i = 0 ; i < QUEUE_SIZE; i++) {
		iovec[i].iov_base = buffers[i];
		iovec[i].iov_len = bufsize[i] * pagesize;
		if (submit_read(&ring, fd, i))
			return 1;
	}

	while (total++ < 10000000) {
		if (total % 1000 == 0)
			printf("%zd\n", total);
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0) {
			fprintf(stderr, "Failed at io_uring_wait_cqe()\n");
			return 1;
		}
		i = (intptr_t)io_uring_cqe_get_data(cqe);
		if (cqe->res < 0) {
			fprintf(stderr, "bad read result for io %d, offset %zd: %d\n",
				i, offsets[i] * pagesize, cqe->res);
			breakme();
			return 1;
		}
		if (cqe->res != bufsize[i] * pagesize) {
			fprintf(stderr, "bad read result for io %d, offset %zd: %d should be %zd\n",
				i, offsets[i] * pagesize, cqe->res, bufsize[i] * pagesize);
			breakme();
			return 1;
		}
		io_uring_cqe_seen(&ring, cqe);

		// resubmit
		if (submit_read(&ring, fd, i))
			return 1;
	}
	io_uring_queue_exit(&ring);

	return 0;
}

[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux