On 09 Mar 2021 at 10:10, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > Create a test to make sure that dedupe actually locks the file ranges > correctly before starting the content comparison and keeps them locked > until the operation completes. > Looks good to me. Reviewed-by: Chandan Babu R <chandanrlinux@xxxxxxxxx> > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > --- > src/Makefile | 2 > src/deduperace.c | 370 +++++++++++++++++++++++++++++++++++++++++++++++++ > tests/generic/949 | 51 +++++++ > tests/generic/949.out | 2 > tests/generic/group | 1 > 5 files changed, 425 insertions(+), 1 deletion(-) > create mode 100644 src/deduperace.c > create mode 100755 tests/generic/949 > create mode 100644 tests/generic/949.out > > > diff --git a/src/Makefile b/src/Makefile > index 811b24e4..38ee6718 100644 > --- a/src/Makefile > +++ b/src/Makefile > @@ -21,7 +21,7 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \ > > LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \ > preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \ > - locktest unwritten_mmap bulkstat_unlink_test \ > + locktest unwritten_mmap bulkstat_unlink_test deduperace \ > bulkstat_unlink_test_modified t_dir_offset t_futimens t_immutable \ > stale_handle pwrite_mmap_blocked t_dir_offset2 seek_sanity_test \ > seek_copy_test t_readdir_1 t_readdir_2 fsync-tester nsexec cloner \ > diff --git a/src/deduperace.c b/src/deduperace.c > new file mode 100644 > index 00000000..b252d436 > --- /dev/null > +++ b/src/deduperace.c > @@ -0,0 +1,370 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (c) 2021 Oracle. All Rights Reserved. > + * Author: Darrick J. Wong <djwong@xxxxxxxxxx> > + * > + * Race pwrite/mwrite with dedupe to see if we got the locking right. > + * > + * File writes and mmap writes should not be able to change the src_fd's > + * contents after dedupe prep has verified that the file contents are the same. > + */ > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <sys/mman.h> > +#include <sys/ioctl.h> > +#include <linux/fs.h> > +#include <string.h> > +#include <stdio.h> > +#include <unistd.h> > +#include <fcntl.h> > +#include <pthread.h> > +#include <stdlib.h> > +#include <errno.h> > + > +#define GOOD_BYTE 0x58 > +#define BAD_BYTE 0x66 > + > +#ifndef FIDEDUPERANGE > +/* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */ > +#define FILE_DEDUPE_RANGE_SAME 0 > +#define FILE_DEDUPE_RANGE_DIFFERS 1 > + > +/* from struct btrfs_ioctl_file_extent_same_info */ > +struct file_dedupe_range_info { > + __s64 dest_fd; /* in - destination file */ > + __u64 dest_offset; /* in - start of extent in destination */ > + __u64 bytes_deduped; /* out - total # of bytes we were able > + * to dedupe from this file. */ > + /* status of this dedupe operation: > + * < 0 for error > + * == FILE_DEDUPE_RANGE_SAME if dedupe succeeds > + * == FILE_DEDUPE_RANGE_DIFFERS if data differs > + */ > + __s32 status; /* out - see above description */ > + __u32 reserved; /* must be zero */ > +}; > + > +/* from struct btrfs_ioctl_file_extent_same_args */ > +struct file_dedupe_range { > + __u64 src_offset; /* in - start of extent in source */ > + __u64 src_length; /* in - length of extent */ > + __u16 dest_count; /* in - total elements in info array */ > + __u16 reserved1; /* must be zero */ > + __u32 reserved2; /* must be zero */ > + struct file_dedupe_range_info info[0]; > +}; > +#define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range) > +#endif /* FIDEDUPERANGE */ > + > +static int fd1, fd2; > +static loff_t offset = 37; /* Nice low offset to trick the compare */ > +static loff_t blksz; > + > +/* Continuously dirty the pagecache for the region being dupe-tested. */ > +void * > +mwriter( > + void *data) > +{ > + volatile char *p; > + > + p = mmap(NULL, blksz, PROT_WRITE, MAP_SHARED, fd1, 0); > + if (p == MAP_FAILED) { > + perror("mmap"); > + exit(2); > + } > + > + while (1) { > + *(p + offset) = BAD_BYTE; > + *(p + offset) = GOOD_BYTE; > + } > +} > + > +/* Continuously write to the region being dupe-tested. */ > +void * > +pwriter( > + void *data) > +{ > + char v; > + ssize_t sz; > + > + while (1) { > + v = BAD_BYTE; > + sz = pwrite(fd1, &v, sizeof(v), offset); > + if (sz != sizeof(v)) { > + perror("pwrite0"); > + exit(2); > + } > + > + v = GOOD_BYTE; > + sz = pwrite(fd1, &v, sizeof(v), offset); > + if (sz != sizeof(v)) { > + perror("pwrite1"); > + exit(2); > + } > + } > + > + return NULL; > +} > + > +static inline void > +complain( > + loff_t offset, > + char bad) > +{ > + fprintf(stderr, "ASSERT: offset %llu should be 0x%x, got 0x%x!\n", > + (unsigned long long)offset, GOOD_BYTE, bad); > + abort(); > +} > + > +/* Make sure the destination file pagecache never changes. */ > +void * > +mreader( > + void *data) > +{ > + volatile char *p; > + > + p = mmap(NULL, blksz, PROT_READ, MAP_SHARED, fd2, 0); > + if (p == MAP_FAILED) { > + perror("mmap"); > + exit(2); > + } > + > + while (1) { > + if (*(p + offset) != GOOD_BYTE) > + complain(offset, *(p + offset)); > + } > +} > + > +/* Make sure the destination file never changes. */ > +void * > +preader( > + void *data) > +{ > + char v; > + ssize_t sz; > + > + while (1) { > + sz = pread(fd2, &v, sizeof(v), offset); > + if (sz != sizeof(v)) { > + perror("pwrite0"); > + exit(2); > + } > + > + if (v != GOOD_BYTE) > + complain(offset, v); > + } > + > + return NULL; > +} > + > +void > +print_help(const char *progname) > +{ > + printf("Usage: %s [-b blksz] [-c dir] [-n nr_ops] [-o offset] [-r] [-w] [-v]\n", > + progname); > + printf("-b sets the block size (default is autoconfigured)\n"); > + printf("-c chdir to this path before starting\n"); > + printf("-n controls the number of dedupe ops (default 10000)\n"); > + printf("-o reads and writes to this offset (default 37)\n"); > + printf("-r uses pread instead of mmap read.\n"); > + printf("-v prints status updates.\n"); > + printf("-w uses pwrite instead of mmap write.\n"); > +} > + > +int > +main( > + int argc, > + char *argv[]) > +{ > + struct file_dedupe_range *fdr; > + char *Xbuf; > + void *(*reader_fn)(void *) = mreader; > + void *(*writer_fn)(void *) = mwriter; > + unsigned long same = 0; > + unsigned long differs = 0; > + unsigned long i, nr_ops = 10000; > + ssize_t sz; > + pthread_t reader, writer; > + int verbose = 0; > + int c; > + int ret; > + > + while ((c = getopt(argc, argv, "b:c:n:o:rvw")) != -1) { > + switch (c) { > + case 'b': > + errno = 0; > + blksz = strtoul(optarg, NULL, 0); > + if (errno) { > + perror(optarg); > + exit(1); > + } > + break; > + case 'c': > + ret = chdir(optarg); > + if (ret) { > + perror("chdir"); > + exit(1); > + } > + break; > + case 'n': > + errno = 0; > + nr_ops = strtoul(optarg, NULL, 0); > + if (errno) { > + perror(optarg); > + exit(1); > + } > + break; > + case 'o': > + errno = 0; > + offset = strtoul(optarg, NULL, 0); > + if (errno) { > + perror(optarg); > + exit(1); > + } > + break; > + case 'r': > + reader_fn = preader; > + break; > + case 'v': > + verbose = 1; > + break; > + case 'w': > + writer_fn = pwriter; > + break; > + default: > + print_help(argv[0]); > + exit(1); > + break; > + } > + } > + > + fdr = malloc(sizeof(struct file_dedupe_range) + > + sizeof(struct file_dedupe_range_info)); > + if (!fdr) { > + perror("malloc"); > + exit(1); > + } > + > + /* Initialize both files. */ > + fd1 = open("file1", O_RDWR | O_CREAT | O_TRUNC | O_NOATIME, 0600); > + if (fd1 < 0) { > + perror("file1"); > + exit(1); > + } > + > + fd2 = open("file2", O_RDWR | O_CREAT | O_TRUNC | O_NOATIME, 0600); > + if (fd2 < 0) { > + perror("file2"); > + exit(1); > + } > + > + if (blksz <= 0) { > + struct stat statbuf; > + > + ret = fstat(fd1, &statbuf); > + if (ret) { > + perror("file1 stat"); > + exit(1); > + } > + blksz = statbuf.st_blksize; > + } > + > + if (offset >= blksz) { > + fprintf(stderr, "offset (%llu) < blksz (%llu)?\n", > + (unsigned long long)offset, > + (unsigned long long)blksz); > + exit(1); > + } > + > + Xbuf = malloc(blksz); > + if (!Xbuf) { > + perror("malloc buffer"); > + exit(1); > + } > + memset(Xbuf, GOOD_BYTE, blksz); > + > + sz = pwrite(fd1, Xbuf, blksz, 0); > + if (sz != blksz) { > + perror("file1 write"); > + exit(1); > + } > + > + sz = pwrite(fd2, Xbuf, blksz, 0); > + if (sz != blksz) { > + perror("file2 write"); > + exit(1); > + } > + > + ret = fsync(fd1); > + if (ret) { > + perror("file1 fsync"); > + exit(1); > + } > + > + ret = fsync(fd2); > + if (ret) { > + perror("file2 fsync"); > + exit(1); > + } > + > + /* Start our reader and writer threads. */ > + ret = pthread_create(&reader, NULL, reader_fn, NULL); > + if (ret) { > + fprintf(stderr, "rthread: %s\n", strerror(ret)); > + exit(1); > + } > + > + ret = pthread_create(&writer, NULL, writer_fn, NULL); > + if (ret) { > + fprintf(stderr, "wthread: %s\n", strerror(ret)); > + exit(1); > + } > + > + /* > + * Now start deduping. If the contents match, fd1's blocks will be > + * remapped into fd2, which is why the writer thread targets fd1 and > + * the reader checks fd2 to make sure that none of fd1's writes ever > + * make it into fd2. > + */ > + for (i = 1; i <= nr_ops; i++) { > + fdr->src_offset = 0; > + fdr->src_length = blksz; > + fdr->dest_count = 1; > + fdr->reserved1 = 0; > + fdr->reserved2 = 0; > + fdr->info[0].dest_fd = fd2; > + fdr->info[0].dest_offset = 0; > + fdr->info[0].reserved = 0; > + > + ret = ioctl(fd1, FIDEDUPERANGE, fdr); > + if (ret) { > + perror("dedupe"); > + exit(2); > + } > + > + switch (fdr->info[0].status) { > + case FILE_DEDUPE_RANGE_DIFFERS: > + differs++; > + break; > + case FILE_DEDUPE_RANGE_SAME: > + same++; > + break; > + default: > + fprintf(stderr, "deduperange: %s\n", > + strerror(-fdr->info[0].status)); > + exit(2); > + break; > + } > + > + if (verbose && (i % 337) == 0) > + printf("nr_ops: %lu; same: %lu; differs: %lu\n", > + i, same, differs); > + } > + > + if (verbose) > + printf("nr_ops: %lu; same: %lu; differs: %lu\n", i - 1, same, > + differs); > + > + /* Program termination will kill the threads and close the files. */ > + return 0; > +} > diff --git a/tests/generic/949 b/tests/generic/949 > new file mode 100755 > index 00000000..3951490b > --- /dev/null > +++ b/tests/generic/949 > @@ -0,0 +1,51 @@ > +#! /bin/bash > +# SPDX-License-Identifier: GPL-2.0-or-later > +# Copyright (c) 2021 Oracle. All Rights Reserved. > +# > +# FS QA Test No. 949 > +# > +# Make sure that mmap and file writers racing with FIDEDUPERANGE cannot write > +# to the file after the dedupe prep function has decided that the file contents > +# are identical and we can therefore go ahead with the remapping. > + > +seq=`basename $0` > +seqres=$RESULT_DIR/$seq > +echo "QA output created by $seq" > + > +here=`pwd` > +tmp=/tmp/$$ > +status=1 # failure is the default! > +trap "_cleanup; exit \$status" 0 1 2 3 15 > + > +_cleanup() > +{ > + cd / > + rm -f $tmp.* > +} > + > +# get standard environment, filters and checks > +. ./common/rc > +. ./common/reflink > + > +# real QA test starts here > +_supported_fs generic > +_require_scratch_dedupe > + > +rm -f $seqres.full > + > +nr_ops=$((TIME_FACTOR * 10000)) > + > +# Format filesystem > +_scratch_mkfs > $seqres.full > +_scratch_mount > + > +# Test once with mmap writes > +$here/src/deduperace -c $SCRATCH_MNT -n $nr_ops > + > +# Test again with pwrites for the lulz > +$here/src/deduperace -c $SCRATCH_MNT -n $nr_ops -w > + > +echo Silence is golden. > +# success, all done > +status=0 > +exit > diff --git a/tests/generic/949.out b/tests/generic/949.out > new file mode 100644 > index 00000000..2998b46c > --- /dev/null > +++ b/tests/generic/949.out > @@ -0,0 +1,2 @@ > +QA output created by 949 > +Silence is golden. > diff --git a/tests/generic/group b/tests/generic/group > index d5cfdd51..778aa8c4 100644 > --- a/tests/generic/group > +++ b/tests/generic/group > @@ -630,3 +630,4 @@ > 625 auto quick verity > 947 auto quick rw clone > 948 auto quick rw copy_range > +949 auto quick rw dedupe clone -- chandan