Add a test case that reproduces the xfs_bmap_extents_to_btree warnings that can be seen when a filesystem runs out of space while performing a dependent allocation. This test should be 100% reproducible on older kernels, prior to the AG-aware allocator re-write. The test runs a busy work job to keep another AG occupied in order to trigger the problem regardless of kernel version. However, this is only partially successful. On newer kernels, with the AG-aware allocator, this test now triggers the failure around 40-50% of the time on the author's test machine. Signed-off-by: Krister Johansen <kjlx@xxxxxxxxxxxxxxxxxx> --- tests/xfs/608 | 372 ++++++++++++++++++++++++++++++++++++++++++++++ tests/xfs/608.out | 2 + 2 files changed, 374 insertions(+) create mode 100755 tests/xfs/608 create mode 100644 tests/xfs/608.out diff --git a/tests/xfs/608 b/tests/xfs/608 new file mode 100755 index 00000000..9db8d21f --- /dev/null +++ b/tests/xfs/608 @@ -0,0 +1,372 @@ +#! /bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2024 YOUR NAME HERE. All Rights Reserved. +# +# FS QA Test 608 +# +# This test reproduces the xfs_bmap_extents_to_btree WARN that occurs when XFS +# fails to allocate a block for the new b-tree in the desired AG. This should +# reproduce the issue on kernels that predate the fix for the AG-aware extent +# allocator (< 6.3). +# +. ./common/preamble +_begin_fstest dangerous insert prealloc punch + +# Override the default cleanup function. + _cleanup() +{ + cd / + _destroy_loop_device $LOOP_DEV + rm -f $tmp.* $LOOP_FILE +} + +declare -a workers + +busy_work() +{ + while : + do + $XFS_IO_PROG -f -c "falloc 0 $BLOCK_SIZE" $BUSY_FILE + $XFS_IO_PROG -f -c "fpunch 0 $BLOCK_SIZE" $BUSY_FILE + done +} + +kill_busy_procs() +{ + for pid in ${workers[@]}; do + kill $pid + wait $pid + done + # Despite killing the workers and waiting for them to exit, we still + # sometimes get an EBUSY unmounting the loop device. 
Wait a second + # before returning to give lingering refcounts a chance to reach zero. + sleep 1 +} + +find_freesp() +{ + umount $LOOP_MNT + local freesp=$($XFS_DB_PROG $LOOP_DEV -c "agf 1" -c "print freeblks" \ + | awk '{print $3}') + mount -o nodiscard $LOOP_DEV $LOOP_MNT + echo $freesp +} + +find_biggest_freesp() +{ + umount $LOOP_MNT + local freesp=$($XFS_DB_PROG $LOOP_DEV -c 'agf 1' -c 'addr cntroot' \ + -c btdump | sed -rn 's/^[0-9]+:\[[0-9]+,([0-9]+)\].*/\1/p' \ + | tail -1) + mount -o nodiscard $LOOP_DEV $LOOP_MNT + echo $freesp +} + + +# Import common functions. +. ./common/filter + +# real QA test starts here + +# Modify as appropriate. +_supported_fs xfs +_require_test +_require_xfs_io_command "falloc" +_require_xfs_io_command "finsert" +_require_xfs_io_command "fpunch" + +# Require loop devices so that this test can create a small filesystem with +# a specific geometry that assists in making this easier to reproduce. +_require_loop + +LOOP_FILE=$TEST_DIR/$seq.img +LOOP_MNT=$TEST_DIR/$seq.mnt +mkdir -p $LOOP_MNT +$XFS_IO_PROG -ft -c "truncate 2g" $LOOP_FILE >> $seqres.full +LOOP_DEV=`_create_loop_device $LOOP_FILE` +loop_mkfs_addl_opts= + +$MKFS_XFS_PROG 2>&1 | grep -q rmapbt && \ + loop_mkfs_addl_opts="$loop_mkfs_addl_opts,rmapbt=0" +$MKFS_XFS_PROG 2>&1 | grep -q reflink && \ + loop_mkfs_addl_opts="$loop_mkfs_addl_opts,reflink=1" + +_mkfs_dev "-b size=4096 -m crc=1$loop_mkfs_addl_opts -d size=1708m,agcount=2" \ + " -l size=1986b" $LOOP_DEV >> $seqres.full +# nodiscard makes the unbusying of extents more predictable which makes the test +# repeatable. +_mount -o nodiscard $LOOP_DEV $LOOP_MNT + +BLOCK_SIZE=$(_get_file_block_size $LOOP_MNT) + +# Add a directory under the root of LOOP_MNT in order to ensure the files placed +# there end up in AG 1. + +TEST_SUBDIR=testdir/testsubdir +mkdir -p $LOOP_MNT/$TEST_SUBDIR + +# There are 3 files in this test. The first is allocated to consume most of the +# space in the AG. 
The test then punches holes in the file in order to allow +# the second file to allocate the fragmented blocks as individual extents. The +# second file is written such that it is only a couple of operations away from +# being converted from in-line extents to a b-tree in its bmbt. The third file +# simply exists for us to allocate spare blocks to in order to take the AG right +# up against ENOSPC. + +FILE1=$LOOP_MNT/$TEST_SUBDIR/file1 +FILE2=$LOOP_MNT/$TEST_SUBDIR/file2 +FILE3=$LOOP_MNT/$TEST_SUBDIR/file3 + +# BUSY_FILE is here to keep AG0 busy on versions of XFS where the allocator is +# allowed to check lower numbered AGs if it fails in a higher numbered one. +BUSY_FILE=$LOOP_MNT/testdir/busyfile + +# Calculate the number of extents we need in the cnobt and bnobt. This should +# be the maximum b-tree leaf size minus one, so that after adding two extents a +# split is triggered. + +# The test is currently set to always use CRC, otherwise this would be 56 if CRC +# and 16 if not. +alloc_block_len=56 +allocbt_leaf_maxrecs=$(((BLOCK_SIZE - alloc_block_len) / 8)) + +# Look at the attrfork offset in FILE2's inode in order to determine the number +# of extents before this splits to a b-tree. This test assumes a v5 filesystem +# so if forkoff is zero, it falls back to LITINO of 336 and uses a bmbt_rec_size +# of 16. +touch $FILE2 +file2_inode=$(stat -c '%i' "$FILE2") +# Temporarily unmount while parameters are gathered via xfs_db +umount $LOOP_MNT +forkoff=$($XFS_DB_PROG $LOOP_DEV -c "inode $file2_inode" \ + -c "print core.forkoff" | awk '{print $3}') +freeblks=$($XFS_DB_PROG $LOOP_DEV -c "agf 1" \ + -c "print freeblks" | awk '{print $3}') +mount -o nodiscard $LOOP_DEV $LOOP_MNT + +# We'll recreate FILE2 later. For now be as empty as we can. +rm $FILE2 + +# Some versions of xfs_db contain the agresv command, but not all do. +# Additionally, some parameters about how much space is actually allocatable +# aren't visible from xfs_db. 
Tracepoints have been helpful in figuring this +# out when developing the test by hand. Instead of trying to parse ftrace data +# and hope that the right tracepoints are available, brute force the actual +# allocatable maximum size by repeatedly trying to allocate larger offsets +# subtracted from $freeblks until one succeeds. +for (( i = 0, ag = 0; ; i++ )) +do + $XFS_IO_PROG -f -c "falloc 0 $(( (freeblks-i)*BLOCK_SIZE ))" $FILE1 + ag=$(xfs_bmap -v -n 1 $FILE1 | tail -1 | awk '{print $4}') + rm $FILE1 + (( ag == 1 )) && break +done + +# Let free'd extents unbusy +sleep 30 + +# At this point, $i is one larger than whatever the allocator thinks the maximum +# available space is. This is based upon the assumption that the data +# allocation we made above set minleft = 1, so the allocation that finally fit +# into AG 1 has had any reservation withheld along with the space the allocator +# requested be withheld for any bmbt expansion. +freeblkseff=$((freeblks - i - 1)) +blocks_withheld=$((i+1)) + +iforksize=$((forkoff > 0 ? forkoff * 8 : 336)) +maxextents=$((iforksize / 16)) +# We'll need to allocate maxextents - 1 for this test, so that the last +# allocation to the file forces an extents -> b-tree conversion. +wanted_extents=$((maxextents - 1)) + +# The first allocation is just a big chunk into the first file. Grab the +# majority of the free blocks. +first_alloc_blocks=$((freeblkseff - 8192)) + +$XFS_IO_PROG -f -c "falloc 0 $((BLOCK_SIZE * first_alloc_blocks))" $FILE1 >> \ + $seqres.full + +# Insert space in the middle in order to help the allocator pick sequential +# blocks when we add space back later. If we don't do this, then it can break +# up larger extents instead of grabbing the ones we fpunch'd out. +# +# The insert offset was chosen arbitrarily and placed towards the beginning of +# the file for the convenience of humans. The insert_blocks size needs to be +# larger than the space we'll later punch out of file 2 and insert back into +# file 1. 
This is on the order of 7 blocks, so 512 should always be large +# enough. +first_insert_offset=$((BLOCK_SIZE * 2083)) +first_insert_blocks=$((BLOCK_SIZE * 512)) + +$XFS_IO_PROG -f -c "finsert $first_insert_offset $first_insert_blocks" \ + $FILE1 >> $seqres.full + +# Punch 3-block holes into the file. This number was chosen so that we could +# re-allocate blocks from these chunks without causing the extent to get removed +# from the free-space btrees. +# +# Punch enough holes to ensure that the bnobt and cnobt end up two extents away +# from a b-tree split, and overshoot that value by the number we need to consume +# in the second file to end up with wanted_extents - 1. +num_holes=$(((allocbt_leaf_maxrecs-2) + (wanted_extents - 1))) +end_hole_offset=$((num_holes * 4 - 3)) +hole_blocks=$((BLOCK_SIZE * 3)) + +for i in $(seq 1 4 $end_hole_offset); do + $XFS_IO_PROG -f -c "fpunch $((BLOCK_SIZE * i)) $hole_blocks" \ + $FILE1 >> $seqres.full +done + +# Use the newly created holes to create extents in our victim file. The goal is +# to allocate up to the point of b-tree conversion minus 2. The remaining space +# is placed in the n-1 extent, and then the last is reserved for the split we +# trigger later. The holes are placed after a gap that's left towards the front +# of the file to allocate the rest of the space. This is done to get the +# allocator to give us the contiguous free chunk that would have previously been +# occupied by the per-AG reservation's free space. +alloc_hole_seq=$(((wanted_extents - 1) * 4 - 3)) + +# The offset for the placement of the holes needs to be after the remaining +# freespace chunk so calculate how big that needs to be first. We may need to +# recalculate this value to account for blocks freed from the AGFL later. 
+biggest_freesp=$(find_biggest_freesp) + +# 3x the biggest chunk of free blocks should be a big enough gap +hole_offset=$((biggest_freesp * 3)) + +for i in $(seq 1 4 $alloc_hole_seq); do + $XFS_IO_PROG -f \ + -c "falloc $((BLOCK_SIZE * (i+hole_offset))) $hole_blocks" \ + $FILE2 >> $seqres.full +done + +# Attempt to compensate for any late-breaking over/undershoot in the desired +# extent count by checking the number of extents in the bnobt and adding or +# removing space to try to arrive at the desired number. +umount $LOOP_MNT +current_extents=$($XFS_DB_PROG $LOOP_DEV -c 'agf 1' -c 'addr bnoroot' \ + -c 'btdump' | grep recs | sed -rn 's/^recs\[.*\-([0-9]+)\].*/\1/p' | \ + awk '{a +=int($1)} END{printf("%d\n", a);}') +mount -o nodiscard $LOOP_DEV $LOOP_MNT + +wanted_allocbt=$((allocbt_leaf_maxrecs-2)) +if [[ $current_extents -gt $wanted_allocbt ]]; then + ext_diff=$(( current_extents - wanted_allocbt )) + end_offset=$(( ext_diff * 4 - 3 )) + for i in $(seq 1 4 $end_offset); do + $XFS_IO_PROG -f -c "falloc $((BLOCK_SIZE * i)) $hole_blocks" \ + $FILE1 >> $seqres.full + done +elif [[ $current_extents -lt $wanted_allocbt ]]; then + ext_diff=$(( wanted_allocbt - current_extents )) + end_offset=$(( (ext_diff * 4 - 3) + end_hole_offset )) + for i in $(seq $end_hole_offset 4 $end_offset); do + $XFS_IO_PROG -f -c "fpunch $((BLOCK_SIZE * i )) $hole_blocks" \ + $FILE1 >> $seqres.full + done +fi + +# The previous falloc should have triggered a reverse-split of the freespace +# btrees. The next alloc should cause the freelist to be drained. Recompute +# the available freespace with the understanding that we'll need to do this +# again after the AGFL is trimmed by the next allocation. Leave a few blocks +# free so that we can use FILE3 to create the last needed set of free extents +# before triggering a split while simultaneously using the remaining space. 
+freesp_remaining=$(find_freesp) +f2_alloc_blocks=$((freesp_remaining - blocks_withheld - 10)) + +$XFS_IO_PROG -f -c "falloc 0 $((BLOCK_SIZE * f2_alloc_blocks))" \ + $FILE2 >> $seqres.full + +# Recompute the remaining blocks and let FILE3 consume the remainder of the +# space. This is intended to both leave one more free extent in the btrees and +# take us down to being right before ENOSPC. +freesp_remaining=$(find_freesp) +f3_alloc_blocks=$((freesp_remaining - blocks_withheld)) +biggest_freesp=$(find_biggest_freesp) + +# Due to variance outside of the control of the test, the remaining freespace +# may be broken into smaller chunks than it's possible to allocate in a single +# attempt. If the test tries to allocate one big chunk, that allocation will +# fail and consult the next AG. To prevent that from happening, check the size +# of the remaining freespace in AG1 and break this allocation into smaller +# chunks that a) consume space from AG1 and b) do not cause the extents we've +# carefully added to the freespace trees to get removed. +if [[ $f3_alloc_blocks -lt $biggest_freesp ]]; then + $XFS_IO_PROG -f -c "falloc 0 $((BLOCK_SIZE * f3_alloc_blocks))" \ + $FILE3 >> $seqres.full +else + alloc_left=$f3_alloc_blocks + alloc_blocks=$((biggest_freesp - 1)) + alloc_ofst=0 + while ((alloc_left > 0)); do + size=$((alloc_blocks * BLOCK_SIZE)) + $XFS_IO_PROG -f -c "falloc $alloc_ofst $size" \ + $FILE3 >> $seqres.full + alloc_left=$((alloc_left - alloc_blocks)) + alloc_ofst=$((alloc_ofst + (1000*BLOCK_SIZE))) + biggest_freesp=$(find_biggest_freesp) + if [[ $alloc_left -lt $biggest_freesp ]]; then + alloc_blocks=$alloc_left + else + alloc_blocks=$((biggest_freesp - 1)) + fi + done +fi + +# That's it for the setup. Now the test punches out a 12 block extent as one 6 +# block chunk in the middle, followed by two 3 block chunks on either side. 
It +# sleeps after the 6 block chunk so that portion of the extent will un-busy, but +# the 3 block chunks on either side stay (temporarily) unavailable. While the +# chunks on either side are busy, re-allocate some of the space that's been +# free'd back to FILE1 so that the final falloc to FILE2 brings us to ENOSPC. +f2_off=2560 +$XFS_IO_PROG -f -c "fpunch $((BLOCK_SIZE * f2_off)) $((BLOCK_SIZE * 6))" \ + $FILE2 >> $seqres.full +# Before we finish punching the final holes, start up some busy workers to keep +# the _other_ AG's locks contended. This isn't needed to reproduce the problem +# prior to the AG-aware allocator's arrival. Ncpu * 4 has been successful at +# reproducing the problem in places where a lower number of workers succeeds +# intermittently (or not at all). +ncpu=$(nproc) +for ((i=0 ; i < ncpu*4; i++)); do + busy_work & + workers[$i]=$! +done + +# Wait for first fpunch to unbusy, and then continue with remainder. +sleep 30 +$XFS_IO_PROG -f -c "fpunch $((BLOCK_SIZE * (f2_off + 6))) $((BLOCK_SIZE * 3))" \ + $FILE2 >> $seqres.full +$XFS_IO_PROG -f -c "fpunch $((BLOCK_SIZE * (f2_off - 3))) $((BLOCK_SIZE * 3))" \ + $FILE2 >> $seqres.full + +# Put 7 blocks back into FILE1 to consume some of the space free'd above. +# The offset here was picked so that the allocator takes blocks from the 3 block +# chunks we punched earlier, but leaves the extents intact in the freespace +# trees. +f1_off=1956 +f1_wanted_blocks=7 +f1_alloc_seq=$((f1_wanted_blocks * 4 - 3)) +for i in $(seq 1 4 $f1_alloc_seq); do + $XFS_IO_PROG -f -c "falloc $((BLOCK_SIZE * (i+f1_off))) $BLOCK_SIZE" \ + $FILE1 >> $seqres.full +done + +# This next falloc should result in FILE2's bmbt getting converted from extents +# to btree while simultaneously splitting the bnobt and cnobt. The first +# allocation succeeds, splits the free space trees, consumes all the blocks in +# the agfl, and leaves us in a situation where the second allocation to convert +# from extents to a btree fails. 
+$XFS_IO_PROG -f -c "falloc $((BLOCK_SIZE * f2_off)) $((BLOCK_SIZE * 5))" \ + $FILE2 >> $seqres.full + +# Terminate the busy workers or else umount will return EBUSY. +kill_busy_procs + +umount $LOOP_MNT + +# success, all done +echo "Silence is golden" +status=0 +exit diff --git a/tests/xfs/608.out b/tests/xfs/608.out new file mode 100644 index 00000000..1e534458 --- /dev/null +++ b/tests/xfs/608.out @@ -0,0 +1,2 @@ +QA output created by 608 +Silence is golden -- 2.25.1