Hello,
I've got a call trace which should be fixed by "drop buffer io reference
when a bad bio is built" patch (http://patchwork.xfs.org/patch/3956/).
Error occured on already patched Linux kernel 3.2.42.
Test environment consist two machines target and initiator.
First machine works as target with QLogic Corp. ISP2432-based 4Gb Fibre
Channel device. Storage is placed on two KINGSTON SNV425S SSD working as
RAID0 array. RAID is managed by LSI MegaRAID SAS 1068 controller.
Second machine works as initiator with the same QLogic card.
After few days of running test script I got following call trace and XFS
stopped working.
[90012.196963] XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0,
file: fs/xfs/xfs_mount.c, line: 272
[90012.196982] ------------[ cut here ]------------
[90012.196984] kernel BUG at fs/xfs/xfs_message.c:101!
[90012.196987] invalid opcode: 0000 [#1] SMP
[90012.196990] CPU 2
[90012.196992] Modules linked in: iscsi_scst(O) scst_vdisk(O) libcrc32c
qla2x00tgt(O) scst(O) ext2 drbd(O) iscsi_tcp libiscsi_tcp libiscsi
scsi_transport_iscsi bonding qla2xxx(O) sg scsi_transport_fc
megaraid_sas bnx2 acpi_power_meter usbserial uhci_hcd ohci_hcd ehci_hcd
aufs [last unloaded: megaraid_sas]
[90012.197013]
[90012.197016] Pid: 10262, comm: mount Tainted: G O
3.2.42-oe64-00000-g12db8b5 #14 Dell Inc. PowerEdge R510/0DPRKF
[90012.197022] RIP: 0010:[<ffffffff812ff8ed>] [<ffffffff812ff8ed>]
assfail+0x1d/0x30
[90012.197031] RSP: 0000:ffff8800be43fc68 EFLAGS: 00010296
[90012.197034] RAX: 0000000000000071 RBX: ffff8800512e8cc0 RCX:
0000000000000046
[90012.197037] RDX: 0000000000000000 RSI: 0000000000000046 RDI:
ffffffff81c1c380
[90012.197039] RBP: ffff8800be43fc68 R08: 0000000000000006 R09:
000000000000ffff
[90012.197042] R10: 0000000000000006 R11: 000000000000000a R12:
ffff8800a41f4800
[90012.197045] R13: 0000000000000000 R14: ffff8800a41f49e8 R15:
ffff8800a41f49f8
[90012.197049] FS: 0000000000000000(0000) GS:ffff88012b240000(0063)
knlGS:00000000f75456c0
[90012.197052] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b
[90012.197055] CR2: 000000000818e508 CR3: 000000001d9d7000 CR4:
00000000000006e0
[90012.197058] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[90012.197061] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[90012.197064] Process mount (pid: 10262, threadinfo ffff8800be43e000,
task ffff88007fe64a60)
[90012.197066] Stack:
[90012.197068] ffff8800be43fca8 ffffffff8134eb7a 0000000000000005
ffff8800a41f4800
[90012.197072] ffff8800a41f4818 0000000000000005 ffff8800a41f4800
0000000000000000
[90012.197076] ffff8800be43fcf8 ffffffff8135054d 0000000000000000
0000000000000000
[90012.197080] Call Trace:
[90012.197088] [<ffffffff8134eb7a>] xfs_free_perag+0x8a/0xc0
[90012.197092] [<ffffffff8135054d>] xfs_mountfs+0x31d/0x700
[90012.197097] [<ffffffff81301fab>] xfs_fs_fill_super+0x1cb/0x270
[90012.197103] [<ffffffff811476da>] mount_bdev+0x19a/0x1d0
[90012.197107] [<ffffffff81301de0>] ? xfs_fs_write_inode+0x180/0x180
[90012.197114] [<ffffffff8138b016>] ? selinux_sb_copy_data+0x156/0x1d0
[90012.197118] [<ffffffff81300200>] xfs_fs_mount+0x10/0x20
[90012.197123] [<ffffffff81146df1>] mount_fs+0x41/0x180
[90012.197129] [<ffffffff8115f7ae>] vfs_kern_mount+0x5e/0xc0
[90012.197133] [<ffffffff8116075e>] do_kern_mount+0x4e/0x100
[90012.197138] [<ffffffff81161f26>] do_mount+0x516/0x740
[90012.197144] [<ffffffff811064e9>] ? __get_free_pages+0x9/0x40
[90012.197150] [<ffffffff81187cc2>] compat_sys_mount+0xa2/0x220
[90012.197156] [<ffffffff8178be83>] ia32_do_call+0x13/0x13
[90012.197158] Code: 66 66 90 66 66 66 90 66 66 66 90 66 66 90 55 41 89
d0 48 89 f1 48 89 fa 48 c7 c6 b8 bf 9b 81 31 ff 48 89 e5 31 c0 e8 53 ff
ff ff <0f> 0b eb fe 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 55 31
[90012.197189] RIP [<ffffffff812ff8ed>] assfail+0x1d/0x30
[90012.197193] RSP <ffff8800be43fc68>
[90012.197196] ---[ end trace ed2e349225f77763 ]---
[90015.960377] XFS (dm-44): xfs_log_force: error 5 returned.
Test scripts gets as a parameter device name. It creates volume group,
two logical volumes with XFS filesystem. XFS is created with force
overwrite and lazy_count=0 options. Also 20 logical volumes are created
for snapshots. When test environment is prepared it runs two dd
processes in background. These processes copies 50 data blocks of size
100MB from /dev/zero to each logical volume. When 50 blocks are created
they are removed and whole operation starts again.
In an infinite loop LVs are converted to snapshots (10 snapshots for
each Logical Volume). After starting all snapshots they are removed one
by one.
test_script.sh source:
#!/bin/bash
DEV=$1
if [ -z $DEV ]; then
echo "This program requires device name as parameter"
exit 1
fi
function overload()
{
COUNT=$1
temp_COUNT=$COUNT;
while [ -f ./run ]; do
while [ $COUNT -ge 1 ]; do
if [ -f ./run ]; then
dd bs=1024 count=102400 if=/dev/zero of=/$2/"_"$COUNT &>
/dev/null
fi;
let COUNT=$COUNT-1
done;
rm $2/*;
COUNT=$temp_COUNT;
done;
}
function create_vg()
{
#create physical volume
pvcreate /dev/sda
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create physical volume"
exit 1
fi
#create volume group
vgcreate -v -s 32M vg0 /dev/sda
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create volume group"
exit 1
fi
VG="vg0"
}
function create_lv()
{
local LV="$1"
#create logical volume
lvcreate -l 500 -n "$VG+$LV" "$VG"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create LV"
exit 1
fi
mkfs -t xfs -f -l lazy-count=0 /dev/$VG/"$VG+$LV" &>/dev/null
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Can't create filesystem"
exit 1
fi
}
function create_snapshots()
{
for ((i=0; i < 20; i++)); do
if [[ $i -lt 10 ]]; then
lvcreate -l "64" -n "snap0$i" "$VG"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create snapshot LV"
exit 1
fi
else
lvcreate -l "64" -n "snap$i" "$VG"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create snapshot LV"
exit 1
fi
fi
done
}
function assign_snapshots()
{
for ((i=0; i < 20; i++)); do
if [[ $i -lt 10 ]]; then
lvrename "$VG" "snap0$i" "lv0+snap0$i"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to rename snapshot LV"
exit 1
fi
else
lvrename "$VG" "snap$i" "lv1+snap$i"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to rename snapshot LV"
exit 1
fi
fi
done
}
function mount_volume()
{
local MVG=$1
local MLV=$2
mkdir -p "/test/mount/$MVG+$MLV"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to create mounting point"
exit 1
fi
mount -t xfs -o
defaults,usrquota,grpquota,nouuid,noatime,nodiratime
"/dev/$MVG/$MVG+$MLV" "/test/mount/$MVG+$MLV"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Unable to mount LV"
exit 1
fi
}
function start_overload()
{
touch ./run
mkdir -p /test/mount/$1+$2/test
overload 50 "/test/mount/$1+$2/test" $3 &
echo "overload $1 /test/mount/$1+$2/test $3 &"
sleep 4;
echo "[ OK ] copying files to $2 started"
}
function get_snapshot_status()
{
lvdisplay /dev/$1/$2 | awk ' $0~"LV snapshot status" { print $4 } '
}
function remove_snapshot()
{
local LVG=$1
local LLV=$2
local LSNAP=$3
umount "/test/mount/$LSNAP"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Can't umount snapshot"
fi
lvremove -sf "/dev/$LVG/$LSNAP"
if [[ $? -gt 0 ]]; then
echo "[ FAIL ] Can't remove snapshot"
fi
}
function create_snapshot()
{
local LVG=$1
local LLV=$2
local LSNAP=$3
for((it=0; it<7; it++)); do
local ERROR=0
local STATUS=`get_snapshot_status $LVG $LSNAP`
if [[ "$STATUS" == "active" ]]; then
remove_snapshot $LVG "$LLV+$LSNAP"
fi
STATUS=`get_snapshot_status $LVG $LSNAP`
if [[ "$STATUS" == "active" ]]; then
remove_snapshot $LVG "$LLV+$LSNAP"
fi
CHUNKSIZE=512
for ((ile=0;ile<it/2;ile++)); do
CHUNKSIZE=$((CHUNKSIZE/2))
done
lvconvert -s -c $CHUNKSIZE "/dev/$LVG/$LVG+$LLV" "/dev/$LVG/$LSNAP"
if [[ $? -gt 0 ]]; then
ERROR=1
fi
#mount snapshot
mkdir -p "/test/mount/$LSNAP"
mount -t xfs -o nouuid,noatime "/dev/$LVG/$LSNAP"
"/test/mount/$LSNAP"
if [[ $? -gt 0 ]]; then
ERROR=2
fi
create_time=`date "+%Y-%m-%d %H:%M:%S"`
if [ $ERROR -ne 0 ]; then
remove_snapshot $LVG $LLV $LSNAP
sleep 5
else
break;
fi
done
}
function start_snap()
{
local i;
for((i=0; i<20; i++)); do
echo "Starting snap$i : `date`"
local START=`date +%s`
if [[ $i -lt 10 ]]; then
snapname="lv0+snap0"$i
create_snapshot $VG "lv0" $snapname
else
snapname="lv1+snap"$i
create_snapshot $VG "lv1" $snapname
fi
if [ -z "`lvs | grep $snapname | grep $VG+lv`" ]; then
echo "[ FAIL ] $snapname not activated !!!"
else
echo "[ OK ] $snapname activated."
fi
if [ -z "`mount | grep $snapname`" ]; then
echo "[ FAIL ] $snapname not mounted !!!" >> $LOGFILE
else
echo "[ OK ] $snapname mounted."
fi
local STOP=$[`date +%s`-$START]
echo "Starting time : $STOP s."
echo "---------------------------"
sleep 2
done;
}
function stop_snap()
{
local i
for((i=0; i<20; i++)); do
echo "Stopping snap$i : `date`"
local START=`date +%s`
if [[ $i -lt 10 ]]; then
snapname="lv0+snap0"$i
remove_snapshot $VG "lv0" $snapname
else
snapname="lv1+snap"$i
remove_snapshot $VG "lv1" $snapname
fi
if [ "`lvs | grep $snapname | grep $VG+lv`" ]; then
echo "[ FAIL ] $snapname still active !!!"
else
echo "[ OK ] $snapname deactivated."
fi;
if [ "`mount | grep $snapname`" ]; then
echo "[ FAIL ] $snapname still mounted !!!" >> $LOGFILE
else
echo "[ OK ] $snapname umounted."
fi;
local STOP=$[`date +%s`-$START]
echo "Stopping time : $STOP s."
echo "---------------------------"
sleep 2
done;
}
echo "-------- Creating vg0 on $DEV..."
create_vg
echo "[ OK ] Volume group created successfully"
echo "-------- Creating logical volumes on $VG..."
create_lv "lv0"
create_lv "lv1"
echo "[ OK ] Logical volumes created successfully"
echo "-------- Mounting logical volumes..."
mount_volume "$VG" "lv0"
mount_volume "$VG" "lv1"
echo "[ OK ] Logical volumes mounted successfully"
echo "-------- Creating snapshots..."
create_snapshots
echo "[ OK ] Snapshots created successfully"
echo "-------- Assigning snapshots..."
assign_snapshots
echo "[ OK ] Snapshots assigned successfully"
echo "-------- Start overload..."
start_overload "vg0" "lv0"
start_overload "vg0" "lv1"
while true; do
start_snap 2> /dev/null
stop_snap 2> /dev/null
done
rm ./run
--
Best regards
Arkadiusz Bubała
_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs