On Tuesday, 27 June 2023 01:47:57 EDT, Dave Chinner wrote:
On Mon, Jun 26, 2023 at 09:12:52PM -0400, Matt Whitlock wrote:
Hello, all. I am experiencing a data corruption issue on Linux 6.1.24 when
calling fallocate with FALLOC_FL_PUNCH_HOLE to punch out pages that have
just been spliced into a pipe. It appears that the fallocate call can zero
out the pages that are sitting in the pipe buffer, before those pages are
read from the pipe.
Simplified code excerpt (eliding error checking):
int fd = /* open file descriptor referring to some disk file */;
for (off_t consumed = 0;;) {
ssize_t n = splice(fd, NULL, STDOUT_FILENO, NULL, SIZE_MAX, 0);
if (n <= 0) break;
consumed += n;
fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, consumed);
}
Huh. Never seen that pattern before - what are you trying to
implement with this?
It's part of a tool I wrote that implements an indefinitely expandable
user-space pipe buffer backed by an unlinked-on-creation disk file. It's
very useful as a shim in a pipeline between a process that produces a large
but finite amount of data quickly and a process that consumes data slowly.
My canonical use case is in my nightly backup cronjob, where I have 'tar
-c' piped into 'xz' piped into a tool that uploads its stdin to an Amazon
S3-compatible data store. I insert my diskbuffer tool between tar and xz so
that the tar process can complete as quickly as possible (thus achieving a
snapshot as close to atomic as practically possible without freezing the
file system) and the xz process can take its sweet time crunching the
tarball down as small as possible, yet the disk space consumed by the
temporary (uncompressed) tarball can be released progressively as xz
consumes the tarball, and all of the disk space will be released
automatically if the pipeline dies, such as if the system is rebooted
during a backup run.
Conceptually:
tar -c /home | diskbuffer /var/tmp | xz -9e | s3upload ...
The 'diskbuffer' tool creates an unlinked temporary file (using
O_TMPFILE|O_EXCL) in the specified directory and then enters a loop,
splicing bytes from stdin into the file and splicing bytes from the file
into stdout, incrementally punching out the file as it successfully splices
to stdout.
Source code for the tool is attached. (It has a bit more functionality than
I have described here.)
#define _GNU_SOURCE
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <poll.h>
#include <sysexits.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/types.h>
static inline size_t splice_len(off_t len) {
return (size_t) (len > INT_MAX ? INT_MAX : len);
}
int main(int argc, char *argv[]) {
bool defer = false, keep = false;
if (argc > 1 && strcmp(argv[1], "--defer") == 0) {
defer = true;
argv[1] = argv[0];
--argc, ++argv;
}
if (argc > 1 && strcmp(argv[1], "--keep") == 0) {
keep = true;
argv[1] = argv[0];
--argc, ++argv;
}
if (argc > 2 || keep && argc < 2) {
fprintf(stderr, "usage: %s [--defer] { [<tmpdir>] | --keep <file> }\n", argv[0]);
return EX_USAGE;
}
const char *path;
int fd;
if (keep) {
if ((fd = open(argv[1], O_RDWR | O_CREAT | O_EXCL, 0666)) < 0) {
error(EX_NOINPUT, errno, "%s", argv[1]);
}
}
else {
if (argc > 1) {
path = argv[1];
}
else if (!(path = getenv("TMPDIR"))) {
path = "/tmp";
}
if ((fd = open(path, O_RDWR | O_TMPFILE | O_EXCL, 0000)) < 0) {
error(EX_NOINPUT, errno, "%s", path);
}
}
bool eof = false, punch = !keep;
off_t n_in = 0, n_out = 0, max = 0, mask = 0;
for (;;) {
off_t n_buf = n_in - n_out;
if (n_buf == max) {
struct statfs f;
if (fstatfs(fd, &f) < 0) {
error(EX_NOINPUT, errno, "%s", path);
}
max = f.f_bavail / 2 * f.f_bsize;
mask = ~((off_t) f.f_bsize - 1);
}
struct pollfd pfds[2] = { };
nfds_t nfds = 0;
if (!eof && n_buf < max) {
pfds[nfds].fd = STDIN_FILENO;
pfds[nfds].events = POLLIN;
++nfds;
}
if (n_buf > 0 && (!defer || eof)) {
pfds[nfds].fd = STDOUT_FILENO;
pfds[nfds].events = POLLOUT;
++nfds;
}
if (nfds == 0) {
if (eof) {
break;
}
sleep(1);
continue;
}
if (poll(pfds, nfds, -1) > 0) {
for (nfds_t i = 0; i < nfds; ++i) {
if (pfds[i].revents) {
if (pfds[i].fd == STDIN_FILENO) {
ssize_t n = splice(STDIN_FILENO, NULL, fd, &n_in, splice_len(max - n_buf), SPLICE_F_MOVE | SPLICE_F_NONBLOCK | SPLICE_F_MORE);
if (n <= 0) {
if (n == 0) {
eof = true;
}
else if (errno != EAGAIN) {
error(EX_IOERR, errno, "%s: splice", "<stdin>");
}
}
else {
n_buf += n;
}
}
else if (pfds[i].fd == STDOUT_FILENO) {
ssize_t n = splice(fd, NULL, STDOUT_FILENO, NULL, splice_len(n_buf), SPLICE_F_MOVE | SPLICE_F_NONBLOCK | SPLICE_F_MORE);
if (n <= 0) {
if (n < 0 && errno != EAGAIN) {
error(EX_IOERR, errno, "%s: splice", "<stdout>");
}
}
else if (((n_out += n) & mask) && punch && fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, n_out & mask) < 0) {
if (errno != EOPNOTSUPP && errno != ENOSYS) {
error(EX_IOERR, errno, "%s: fallocate", path);
}
fprintf(stderr, "WARNING: temp file in %s does not support FALLOC_FL_PUNCH_HOLE\n", path);
punch = false;
}
}
}
}
}
else if (errno != EINTR) {
error(EX_OSERR, errno, "poll");
}
}
return EX_OK;
}