Hello, all. I am experiencing a data corruption issue on Linux 6.1.24 when
calling fallocate with FALLOC_FL_PUNCH_HOLE to punch out pages that have
just been spliced into a pipe. It appears that the fallocate call can zero
out the pages that are sitting in the pipe buffer, before those pages are
read from the pipe.
Simplified code excerpt (eliding error checking):
int fd = /* open file descriptor referring to some disk file */;
for (off_t consumed = 0;;) {
ssize_t n = splice(fd, NULL, STDOUT_FILENO, NULL, SIZE_MAX, 0);
if (n <= 0) break;
consumed += n;
fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, consumed);
}
Expected behavior:
Punching holes in a file after splicing pages out of that file into a pipe
should not corrupt the spliced-out pages in the pipe buffer.
Observed behavior:
Some of the pages that have been spliced into the pipe get zeroed out by
the subsequent fallocate call before they can be consumed from the read
side of the pipe.
Steps to reproduce:
1. Save the attached ones.c, dontneed.c, and consume.c.
2. gcc -o ones ones.c
gcc -o dontneed dontneed.c
gcc -o consume consume.c
3. Fill a file with 32 MiB of 0xFF:
./ones | head -c$((1<<25)) >testfile
4. Evict the pages of the file from the page cache:
sync testfile && ./dontneed testfile
5. Splice the file into a pipe, punching out batches of pages after
splicing them:
./consume testfile | hexdump -C
The expected output from hexdump should show 32 MiB of 0xFF. Indeed, on my
system, if I omit the POSIX_FADV_DONTNEED advice, then I do get the
expected output. However, if the pages of the file are not already present
in the page cache (i.e., if the splice call faults them in from disk), then
the hexdump output shows some pages full of 0xFF and some pages full of
0x00.
Note #1: I am running a fairly antiquated x86_64 system. You may need to
use a file larger than 32 MiB to reproduce the misbehavior on a more modern
system. In particular, even I cannot reproduce the problem when I use 16
MiB. Conversely, when I use a very large file (too large to fit entirely in
the page cache), then I don't need the "dontneed" call to reproduce the
problem.
Note #2: I am siting my test file on an XFS file system running on a
hardware RAID volume at /dev/sda1. I'm not sure if that's relevant.#include <string.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>
int main() {
char buf[4096];
memset(buf, 0xFF, sizeof buf);
for (;;)
if (write(STDOUT_FILENO, buf, sizeof buf) < 0)
error(EX_IOERR, errno, "write");
return EX_OK;
}
#include <stdio.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>
int main(int argc, char *argv[]) {
if (argc < 2) {
printf("usage: %s <file> [...]\n", argc > 0 ? *argv : "dontneed");
return EX_USAGE;
}
int ret = EX_OK;
while (++argv, --argc > 0) {
int fd = open(*argv, O_RDONLY);
if (fd < 0) {
error(0, errno, "%s", *argv);
ret = ret ?: EX_NOINPUT;
continue;
}
if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) < 0) {
error(0, errno, "%s: posix_fadvise", *argv);
ret = EX_OSERR;
}
if (close(fd) < 0)
error(EX_OSERR, errno, "%s: close", *argv);
}
return ret;
}
#define _GNU_SOURCE
#include <stdint.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <sysexits.h>
#include <unistd.h>
int main(int argc, char *argv[]) {
++argv, --argc;
do {
const char *filename = "<stdin>";
if (argc > 0) {
close(STDIN_FILENO);
if (open(filename = *argv, O_RDWR) != STDIN_FILENO)
error(EX_NOINPUT, errno, "%s", filename);
}
for (off_t consumed = 0;;) {
ssize_t n = splice(STDIN_FILENO, NULL, STDOUT_FILENO, NULL, SIZE_MAX, 0);
if (n <= 0) {
if (n < 0)
error(EX_IOERR, errno, "%s: splice", filename);
break;
}
if (fallocate(STDIN_FILENO, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, consumed += n) < 0)
error(EX_OSERR, errno, "%s: fallocate", filename);
}
if (argc > 0 && unlink(filename) < 0)
error(EX_OSERR, errno, "%s: unlink", filename);
} while (++argv, --argc > 0);
return EX_OK;
}