On Tue, Oct 8, 2024 at 12:26 PM Mark Harmstone <maharmstone@xxxxxx> wrote: > > Adds a test for a bug we encountered on Linux 6.4 on aarch64, where a > race could mean that csums weren't getting written to the log tree, > leading to corruption when it was replayed. > > The patches to detect log this tree corruption are in btrfs-progs 6.11. > > Signed-off-by: Mark Harmstone <maharmstone@xxxxxx> > --- > common/dmlogwrites | 24 ++++++++++++++++++ > tests/btrfs/192 | 26 +------------------- > tests/btrfs/333 | 59 +++++++++++++++++++++++++++++++++++++++++++++ > tests/btrfs/333.out | 2 ++ > 4 files changed, 86 insertions(+), 25 deletions(-) > create mode 100755 tests/btrfs/333 > create mode 100644 tests/btrfs/333.out > > diff --git a/common/dmlogwrites b/common/dmlogwrites > index c1c85de9..f7faf244 100644 > --- a/common/dmlogwrites > +++ b/common/dmlogwrites > @@ -203,3 +203,27 @@ _log_writes_replay_log_range() > >> $seqres.full 2>&1 > [ $? -ne 0 ] && _fail "replay failed" > } > + > +# Replay and check each fua/flush (specified by $2) point. > +# > +# Since dm-log-writes records bio sequentially, even just replaying a range > +# still needs to iterate all records before the end point. > +# When number of records grows, it will be unacceptably slow, thus we need > +# to use relay-log itself to trigger fsck, avoid unnecessary seek. > +_log_writes_fast_replay_check() > +{ > + local check_point=$1 > + local blkdev=$2 > + local fsck_command=$3 > + local ret > + > + [ -z "$check_point" -o -z "$blkdev" ] && _fail \ > + "check_point and blkdev must be specified for log_writes_fast_replay_check" > + > + $here/src/log-writes/replay-log --log $LOGWRITES_DEV \ > + --replay $blkdev --check $check_point --fsck "$fsck_command" \ > + &> $tmp.full_fsck > + ret=$? > + tail -n 150 $tmp.full_fsck >> $seqres.full > + [ $ret -ne 0 ] && _fail "fsck failed during replay" > +} > diff --git a/tests/btrfs/192 b/tests/btrfs/192 > index f7fb65b8..449f0459 100755 > --- a/tests/btrfs/192 > +++ b/tests/btrfs/192 > @@ -96,30 +96,6 @@ delete_workload() > done > } > > -# Replay and check each fua/flush (specified by $2) point. > -# > -# Since dm-log-writes records bio sequentially, even just replaying a range > -# still needs to iterate all records before the end point. > -# When number of records grows, it will be unacceptably slow, thus we need > -# to use relay-log itself to trigger fsck, avoid unnecessary seek. > -log_writes_fast_replay_check() > -{ > - local check_point=$1 > - local blkdev=$2 > - local fsck_command="$BTRFS_UTIL_PROG check $blkdev" > - local ret > - > - [ -z "$check_point" -o -z "$blkdev" ] && _fail \ > - "check_point and blkdev must be specified for log_writes_fast_replay_check" > - > - $here/src/log-writes/replay-log --log $LOGWRITES_DEV \ > - --replay $blkdev --check $check_point --fsck "$fsck_command" \ > - &> $tmp.full_fsck > - ret=$? > - tail -n 150 $tmp.full_fsck >> $seqres.full > - [ $ret -ne 0 ] && _fail "fsck failed during replay" > -} > - > xattr_value=$(printf '%0.sX' $(seq 1 3800)) > > # Bumping tree height to level 2. > @@ -145,7 +121,7 @@ wait > _log_writes_unmount > _log_writes_remove > > -log_writes_fast_replay_check fua "$SCRATCH_DEV" > +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV" > > echo "Silence is golden" > > diff --git a/tests/btrfs/333 b/tests/btrfs/333 > new file mode 100755 > index 00000000..13f113ca > --- /dev/null > +++ b/tests/btrfs/333 > @@ -0,0 +1,59 @@ > +#! /bin/bash > +# SPDX-License-Identifier: GPL-2.0 > +# > +# FS QA Test 333 > +# > +# Test async dio with fsync to test a bug where a race meant that csums weren't > +# getting written to the log tree, causing corruptions on remount. This can be > +# seen on subpage FSes on 6.4. > +# > +. ./common/preamble > +_begin_fstest auto quick metadata log volume Why the volume group? The test isn't doing any volume management stuff. However it should be in the "recoveryloop" group. > + > +_fixed_by_kernel_commit e917ff56c8e7 \ > + "btrfs: determine synchronous writers from bio or writeback control" > + > +fio_config=$tmp.fio > + > +. ./common/dmlogwrites > + > +_require_scratch > +_require_log_writes > + > +cat >$fio_config <<EOF > +[global] > +iodepth=128 > +direct=1 > +ioengine=libaio > +rw=randwrite > +runtime=1s > +[job0] > +rw=randwrite > +filename=$SCRATCH_MNT/file > +size=1g > +fdatasync=1 > +EOF > + > +_require_fio $fio_config > + > +cat $fio_config >> $seqres.full > + > +_log_writes_init $SCRATCH_DEV > +_log_writes_mkfs >> $seqres.full 2>&1 > +_log_writes_mark mkfs > + > +_log_writes_mount > + > +$FIO_PROG $fio_config > /dev/null 2>&1 > +_log_writes_unmount > + > +_log_writes_remove > +_log_writes_replay_log mkfs $SCRATCH_DEV > + > +_log_writes_fast_replay_check fua "$SCRATCH_DEV" "$BTRFS_UTIL_PROG check $SCRATCH_DEV" Why do we need to do the replays twice? Once with _log_writes_replay_log mkfs and then again with _log_writes_fast_replay_check fua. There's also nothing in this test that is btrfs specific, it could be made a generic test instead. Thanks. > + > +echo "Silence is golden" > + > +# success, all done > +status=0 > +exit > diff --git a/tests/btrfs/333.out b/tests/btrfs/333.out > new file mode 100644 > index 00000000..60a15898 > --- /dev/null > +++ b/tests/btrfs/333.out > @@ -0,0 +1,2 @@ > +QA output created by 333 > +Silence is golden > -- > 2.44.2 > >