On Mon, Nov 13, 2023 at 09:08:39AM -0800, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > XFS has a rather slow reflink operation. While a reflink operation is > running, other programs cannot read the contents of the source file, > which is causing latency spikes. Catherine Hoang wrote a patch to > permit reads, since the source file contents do not change. This is a > functionality test for that patch. > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > Reviewed-by: Christoph Hellwig <hch@xxxxxx> > Reviewed-by: Catherine Hoang <catherine.hoang@xxxxxxxxxx> > --- Thanks Darrick, this version looks good to me. Reviewed-by: Zorro Lang <zlang@xxxxxxxxxx> > configure.ac | 1 > include/builddefs.in | 1 > m4/package_libcdev.m4 | 13 ++ > src/Makefile | 4 + > src/t_reflink_read_race.c | 339 +++++++++++++++++++++++++++++++++++++++++++++ > tests/generic/1953 | 75 ++++++++++ > tests/generic/1953.out | 6 + > 7 files changed, 439 insertions(+) > create mode 100644 src/t_reflink_read_race.c > create mode 100755 tests/generic/1953 > create mode 100644 tests/generic/1953.out > > > diff --git a/configure.ac b/configure.ac > index 4687d8a3c0..7333045330 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -71,6 +71,7 @@ AC_HAVE_BMV_OF_SHARED > AC_HAVE_NFTW > AC_HAVE_RLIMIT_NOFILE > AC_HAVE_FIEXCHANGE > +AC_HAVE_FICLONE > > AC_CHECK_FUNCS([renameat2]) > AC_CHECK_FUNCS([reallocarray]) > diff --git a/include/builddefs.in b/include/builddefs.in > index 969acf0da2..446350d5fc 100644 > --- a/include/builddefs.in > +++ b/include/builddefs.in > @@ -73,6 +73,7 @@ HAVE_NFTW = @have_nftw@ > HAVE_BMV_OF_SHARED = @have_bmv_of_shared@ > HAVE_RLIMIT_NOFILE = @have_rlimit_nofile@ > HAVE_FIEXCHANGE = @have_fiexchange@ > +HAVE_FICLONE = @have_ficlone@ > > GCCFLAGS = -funsigned-char -fno-strict-aliasing -Wall > > diff --git a/m4/package_libcdev.m4 b/m4/package_libcdev.m4 > index 7f73104405..0f4b8063f3 100644 > --- a/m4/package_libcdev.m4 > +++ b/m4/package_libcdev.m4 > @@ -174,3 +174,16 @@ AC_DEFUN([AC_HAVE_FIEXCHANGE], > AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)]) > AC_SUBST(have_fiexchange) > ]) > + > +# Check if we have FICLONE > +AC_DEFUN([AC_HAVE_FICLONE], > + [ AC_MSG_CHECKING([for FICLONE]) > + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ > +#include <sys/ioctl.h> > +#include <linux/fs.h> > + ]], [[ > + ioctl(-1, FICLONE, -1); > + ]])],[have_ficlone=yes > + AC_MSG_RESULT(yes)],[AC_MSG_RESULT(no)]) > + AC_SUBST(have_ficlone) > + ]) > diff --git a/src/Makefile b/src/Makefile > index 2815f919b2..49dd2f6c1e 100644 > --- a/src/Makefile > +++ b/src/Makefile > @@ -94,6 +94,10 @@ ifeq ($(HAVE_SEEK_DATA), yes) > endif > endif > > +ifeq ($(HAVE_FICLONE),yes) > + TARGETS += t_reflink_read_race > +endif > + > CFILES = $(TARGETS:=.c) > LDIRT = $(TARGETS) fssum > > diff --git a/src/t_reflink_read_race.c b/src/t_reflink_read_race.c > new file mode 100644 > index 0000000000..00c19d7c05 > --- /dev/null > +++ b/src/t_reflink_read_race.c > @@ -0,0 +1,339 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Race reads with reflink to see if reads continue while we're cloning. > + * Copyright 2023 Oracle. All rights reserved. > + */ > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <sys/ioctl.h> > +#include <linux/fs.h> > +#include <time.h> > +#include <stdlib.h> > +#include <unistd.h> > +#include <signal.h> > +#include <fcntl.h> > +#include <pthread.h> > +#include <stdio.h> > +#include <errno.h> > +#include <string.h> > + > +static pid_t mypid; > + > +static FILE *outcome_fp; > + > +/* Significant timestamps. Initialized to zero */ > +static double program_start, clone_start, clone_end; > + > +/* Coordinates access to timestamps */ > +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; > + > +struct readinfo { > + int fd; > + unsigned int blocksize; > + char *descr; > +}; > + > +struct cloneinfo { > + int src_fd, dst_fd; > +}; > + > +#define MSEC_PER_SEC 1000 > +#define NSEC_PER_MSEC 1000000 > + > +/* > + * Assume that it shouldn't take longer than 100ms for the FICLONE ioctl to > + * enter the kernel and take locks on an uncontended file. This is also the > + * required CLOCK_MONOTONIC granularity. > + */ > +#define EARLIEST_READ_MS (MSEC_PER_SEC / 10) > + > +/* > + * We want to be reasonably sure that the FICLONE takes long enough that any > + * read initiated after clone operation locked the source file could have > + * completed a disk IO before the clone finishes. Therefore, we require that > + * the clone operation must take at least 4x the maximum setup time. > + */ > +#define MINIMUM_CLONE_MS (EARLIEST_READ_MS * 5) > + > +static double timespec_to_msec(const struct timespec *ts) > +{ > + return (ts->tv_sec * (double)MSEC_PER_SEC) + > + (ts->tv_nsec / (double)NSEC_PER_MSEC); > +} > + > +static void sleep_ms(unsigned int len) > +{ > + struct timespec time = { > + .tv_nsec = len * NSEC_PER_MSEC, > + }; > + > + nanosleep(&time, NULL); > +} > + > +static void outcome(const char *str) > +{ > + fprintf(outcome_fp, "%s\n", str); > + fflush(outcome_fp); > +} > + > +static void *reader_fn(void *arg) > +{ > + struct timespec now; > + struct readinfo *ri = arg; > + char *buf = malloc(ri->blocksize); > + loff_t pos = 0; > + ssize_t copied; > + int ret; > + > + if (!buf) { > + perror("alloc buffer"); > + goto kill_error; > + } > + > + /* Wait for the FICLONE to start */ > + pthread_mutex_lock(&lock); > + while (clone_start < program_start) { > + pthread_mutex_unlock(&lock); > +#ifdef DEBUG > + printf("%s read waiting for clone to start; cs=%.2f ps=%.2f\n", > + ri->descr, clone_start, program_start); > + fflush(stdout); > +#endif > + sleep_ms(1); > + pthread_mutex_lock(&lock); > + } > + pthread_mutex_unlock(&lock); > + sleep_ms(EARLIEST_READ_MS); > + > + /* Read from the file... */ > + while ((copied = read(ri->fd, buf, ri->blocksize)) > 0) { > + double read_completion; > + > + ret = clock_gettime(CLOCK_MONOTONIC, &now); > + if (ret) { > + perror("clock_gettime"); > + goto kill_error; > + } > + read_completion = timespec_to_msec(&now); > + > + /* > + * If clone_end is still zero, the FICLONE is still running. > + * If the read completion occurred a safe duration after the > + * start of the ioctl, then report that as an early read > + * completion. > + */ > + pthread_mutex_lock(&lock); > + if (clone_end < program_start && > + read_completion > clone_start + EARLIEST_READ_MS) { > + pthread_mutex_unlock(&lock); > + > + /* clone still running... */ > + printf("finished %s read early at %.2fms\n", > + ri->descr, > + read_completion - program_start); > + fflush(stdout); > + outcome("finished read early"); > + goto kill_done; > + } > + pthread_mutex_unlock(&lock); > + > + sleep_ms(1); > + pos += copied; > + } > + if (copied < 0) { > + perror("read"); > + goto kill_error; > + } > + > + return NULL; > +kill_error: > + outcome("reader error"); > +kill_done: > + kill(mypid, SIGTERM); > + return NULL; > +} > + > +static void *clone_fn(void *arg) > +{ > + struct timespec now; > + struct cloneinfo *ci = arg; > + int ret; > + > + /* Record the approximate start time of this thread */ > + ret = clock_gettime(CLOCK_MONOTONIC, &now); > + if (ret) { > + perror("clock_gettime clone start"); > + goto kill_error; > + } > + pthread_mutex_lock(&lock); > + clone_start = timespec_to_msec(&now); > + pthread_mutex_unlock(&lock); > + > + printf("started clone at %.2fms\n", clone_start - program_start); > + fflush(stdout); > + > + /* Kernel call, only killable with a fatal signal */ > + ret = ioctl(ci->dst_fd, FICLONE, ci->src_fd); > + if (ret) { > + perror("FICLONE"); > + goto kill_error; > + } > + > + /* If the ioctl completes, note the completion time */ > + ret = clock_gettime(CLOCK_MONOTONIC, &now); > + if (ret) { > + perror("clock_gettime clone end"); > + goto kill_error; > + } > + > + pthread_mutex_lock(&lock); > + clone_end = timespec_to_msec(&now); > + pthread_mutex_unlock(&lock); > + > + printf("finished clone at %.2fms\n", > + clone_end - program_start); > + fflush(stdout); > + > + /* Complain if we didn't take long enough to clone. */ > + if (clone_end < clone_start + MINIMUM_CLONE_MS) { > + printf("clone did not take enough time (%.2fms)\n", > + clone_end - clone_start); > + fflush(stdout); > + outcome("clone too fast"); > + goto kill_done; > + } > + > + outcome("clone finished before any reads"); > + goto kill_done; > + > +kill_error: > + outcome("clone error"); > +kill_done: > + kill(mypid, SIGTERM); > + return NULL; > +} > + > +int main(int argc, char *argv[]) > +{ > + struct cloneinfo ci; > + struct readinfo bufio = { .descr = "buffered" }; > + struct readinfo directio = { .descr = "directio" }; > + struct timespec now; > + pthread_t clone_thread, bufio_thread, directio_thread; > + double clockres; > + int ret; > + > + if (argc != 4) { > + printf("Usage: %s src_file dst_file outcome_file\n", argv[0]); > + return 1; > + } > + > + mypid = getpid(); > + > + /* Check for sufficient clock precision. */ > + ret = clock_getres(CLOCK_MONOTONIC, &now); > + if (ret) { > + perror("clock_getres MONOTONIC"); > + return 2; > + } > + printf("CLOCK_MONOTONIC resolution: %llus %luns\n", > + (unsigned long long)now.tv_sec, > + (unsigned long)now.tv_nsec); > + fflush(stdout); > + > + clockres = timespec_to_msec(&now); > + if (clockres > EARLIEST_READ_MS) { > + fprintf(stderr, "insufficient CLOCK_MONOTONIC resolution\n"); > + return 2; > + } > + > + /* > + * Measure program start time since the MONOTONIC time base is not > + * all that well defined. > + */ > + ret = clock_gettime(CLOCK_MONOTONIC, &now); > + if (ret) { > + perror("clock_gettime MONOTONIC"); > + return 2; > + } > + if (now.tv_sec == 0 && now.tv_nsec == 0) { > + fprintf(stderr, "Uhoh, start time is zero?!\n"); > + return 2; > + } > + program_start = timespec_to_msec(&now); > + > + outcome_fp = fopen(argv[3], "w"); > + if (!outcome_fp) { > + perror(argv[3]); > + return 2; > + } > + > + /* Open separate fds for each thread */ > + ci.src_fd = open(argv[1], O_RDONLY); > + if (ci.src_fd < 0) { > + perror(argv[1]); > + return 2; > + } > + > + ci.dst_fd = open(argv[2], O_RDWR | O_CREAT, 0600); > + if (ci.dst_fd < 0) { > + perror(argv[2]); > + return 2; > + } > + > + bufio.fd = open(argv[1], O_RDONLY); > + if (bufio.fd < 0) { > + perror(argv[1]); > + return 2; > + } > + bufio.blocksize = 37; > + > + directio.fd = open(argv[1], O_RDONLY | O_DIRECT); > + if (directio.fd < 0) { > + perror(argv[1]); > + return 2; > + } > + directio.blocksize = 512; > + > + /* Start threads */ > + ret = pthread_create(&clone_thread, NULL, clone_fn, &ci); > + if (ret) { > + fprintf(stderr, "create clone thread: %s\n", strerror(ret)); > + return 2; > + } > + > + ret = pthread_create(&bufio_thread, NULL, reader_fn, &bufio); > + if (ret) { > + fprintf(stderr, "create bufio thread: %s\n", strerror(ret)); > + return 2; > + } > + > + ret = pthread_create(&directio_thread, NULL, reader_fn, &directio); > + if (ret) { > + fprintf(stderr, "create directio thread: %s\n", strerror(ret)); > + return 2; > + } > + > + /* Wait for threads */ > + ret = pthread_join(clone_thread, NULL); > + if (ret) { > + fprintf(stderr, "join clone thread: %s\n", strerror(ret)); > + return 2; > + } > + > + ret = pthread_join(bufio_thread, NULL); > + if (ret) { > + fprintf(stderr, "join bufio thread: %s\n", strerror(ret)); > + return 2; > + } > + > + ret = pthread_join(directio_thread, NULL); > + if (ret) { > + fprintf(stderr, "join directio thread: %s\n", strerror(ret)); > + return 2; > + } > + > + printf("Program should have killed itself?\n"); > + outcome("program failed to end early"); > + return 0; > +} > diff --git a/tests/generic/1953 b/tests/generic/1953 > new file mode 100755 > index 0000000000..c14de7060d > --- /dev/null > +++ b/tests/generic/1953 > @@ -0,0 +1,75 @@ > +#! /bin/bash > +# SPDX-License-Identifier: GPL-2.0 > +# Copyright (c) 2023, Oracle and/or its affiliates. All Rights Reserved. > +# > +# FS QA Test No. 1953 > +# > +# Race file reads with a very slow reflink operation to see if the reads > +# actually complete while the reflink is ongoing. This is a functionality > +# test for XFS commit f3ba4762fa56 "xfs: allow read IO and FICLONE to run > +# concurrently". > +# > +. ./common/preamble > +_begin_fstest auto clone punch > + > +# Import common functions. > +. ./common/filter > +. ./common/attr > +. ./common/reflink > + > +# real QA test starts here > +_require_scratch_reflink > +_require_cp_reflink > +_require_xfs_io_command "fpunch" > +_require_test_program "punch-alternating" > +_require_test_program "t_reflink_read_race" > +_require_command "$TIMEOUT_PROG" timeout > + > +rm -f "$seqres.full" > + > +echo "Format and mount" > +_scratch_mkfs > "$seqres.full" 2>&1 > +_scratch_mount >> "$seqres.full" 2>&1 > + > +testdir="$SCRATCH_MNT/test-$seq" > +mkdir "$testdir" > + > +calc_space() { > + blocks_needed=$(( 2 ** (fnr + 1) )) > + space_needed=$((blocks_needed * blksz * 5 / 4)) > +} > + > +# Figure out the number of blocks that we need to get the reflink runtime above > +# 1 seconds > +echo "Create a many-block file" > +for ((fnr = 1; fnr < 40; fnr++)); do > + free_blocks=$(stat -f -c '%a' "$testdir") > + blksz=$(_get_file_block_size "$testdir") > + space_avail=$((free_blocks * blksz)) > + calc_space > + test $space_needed -gt $space_avail && \ > + _notrun "Insufficient space for stress test; would only create $blocks_needed extents." > + > + off=$(( (2 ** fnr) * blksz)) > + $XFS_IO_PROG -f -c "pwrite -S 0x61 -b 4194304 $off $off" "$testdir/file1" >> "$seqres.full" > + "$here/src/punch-alternating" "$testdir/file1" >> "$seqres.full" > + > + $TIMEOUT_PROG 1s cp --reflink=always "$testdir/file1" "$testdir/garbage" || break > +done > +echo "fnr=$fnr" >> $seqres.full > + > +echo "Reflink the big file" > +$here/src/t_reflink_read_race "$testdir/file1" "$testdir/file2" \ > + "$testdir/outcome" &>> $seqres.full > + > +if [ ! -e "$testdir/outcome" ]; then > + echo "Could not set up program" > +elif grep -q "finished read early" "$testdir/outcome"; then > + echo "test completed successfully" > +else > + cat "$testdir/outcome" > +fi > + > +# success, all done > +status=0 > +exit > diff --git a/tests/generic/1953.out b/tests/generic/1953.out > new file mode 100644 > index 0000000000..8eaacb7ff0 > --- /dev/null > +++ b/tests/generic/1953.out > @@ -0,0 +1,6 @@ > +QA output created by 1953 > +Format and mount > +Create a many-block file > +Reflink the big file > +Terminated > +test completed successfully >