Hello, we're running test script that starts and stops snapshots in a loop while overfilling them. After a few days of running system hangs. We've captured following call trace: [116649.755761] XFS (dm-42): metadata I/O error: block 0xfa2b06 ("xlog_iodone") error 5 buf count 1024 [116649.947247] XFS (dm-42): Log I/O Error Detected. Shutting down filesystem [116650.073881] XFS (dm-42): Please umount the filesystem and rectify the problem(s) [116650.207186] BUG: unable to handle kernel paging request at 00000000000010a8 [116650.335185] IP: [<ffffffff8102e1d6>] __ticket_spin_lock+0x6/0x20 [116650.451052] PGD 0 [116650.518151] Oops: 0002 [#1] SMP [116650.599477] CPU 0 [116650.622838] Modules linked in: iscsi_scst(O) scst_vdisk(O) scst(O) drbd(O) twofish_x86_64 twofish_generic twofish_common serpent_sse2_x86_64 lrw xts gf1] [116651.479730] [116651.540674] Pid: 30173, comm: kworker/0:5 Tainted: G O 3.4.63-oe64-00000-g1a33902 #38 Intel Corporation S1200BTL/S1200BTL [116651.772395] RIP: 0010:[<ffffffff8102e1d6>] [<ffffffff8102e1d6>] __ticket_spin_lock+0x6/0x20 [116651.921067] RSP: 0018:ffff88010a9c7e30 EFLAGS: 00010246 [116652.031355] RAX: 0000000000000100 RBX: 00000000000010a8 RCX: 0000000000000000 [116652.163291] RDX: 0000000000000092 RSI: 0000000000000002 RDI: 00000000000010a8 [116652.294384] RBP: ffff88022d424c00 R08: 0000000000000000 R09: ffff8802334ba740 [116652.425190] R10: 0000000000000000 R11: ffffffff812c24b0 R12: 0000000000001000 [116652.555680] R13: 0000000000000002 R14: 0000000000000000 R15: ffff880237017500 [116652.685235] FS: 0000000000000000(0000) GS:ffff880237000000(0000) knlGS:0000000000000000 [116652.825924] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [116652.937639] CR2: 00000000000010a8 CR3: 0000000001874000 CR4: 00000000000407f0 [116653.066047] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [116653.193821] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [116653.320481] Process kworker/0:5 (pid: 30173, threadinfo ffff88010a9c6000, task ffff880233f7f0d0) [116653.466833] Stack: [116653.530557] ffffffff815f4b45 ffffffff812685f2 ffff88022f97b180 ffff88023700e300 [116653.661317] ffff880015def170 ffff88023700e180 ffff88022f97b180 ffffffff81051843 [116653.792457] 000000062f97b1a8 ffff88022f97b180 ffff88023700e180 ffff88022f97b1a8 [116653.923833] Call Trace: [116653.995006] [<ffffffff815f4b45>] ? _raw_spin_lock+0x5/0x10 [116654.103462] [<ffffffff812685f2>] ? xlog_state_done_syncing+0x32/0xc0 [116654.221716] [<ffffffff81051843>] ? process_one_work+0xf3/0x320 [116654.333195] [<ffffffff810534f2>] ? worker_thread+0xe2/0x280 [116654.441031] [<ffffffff81053410>] ? gcwq_mayday_timeout+0x80/0x80 [116654.553512] [<ffffffff8105776b>] ? kthread+0x9b/0xb0 [116654.652866] [<ffffffff815fbde4>] ? kernel_thread_helper+0x4/0x10 [116654.764568] [<ffffffff810576d0>] ? kthread_bind+0x80/0x80 [116654.869162] [<ffffffff815fbde0>] ? gs_change+0x13/0x13 [116654.970752] Code: 13 ff ff ff 48 c7 c2 21 e0 02 81 48 c7 c1 24 e0 02 81 e9 00 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 31 c0 30 c0 b4 01<f0> [116655.320953] RIP [<ffffffff8102e1d6>] __ticket_spin_lock+0x6/0x20 [116655.432251] RSP<ffff88010a9c7e30> [116655.511021] CR2: 00000000000010a8 [116655.586770] ---[ end trace 109ecd45394d669c ]--- [116655.678353] BUG: unable to handle kernel paging request at fffffffffffffff8 [116655.798563] IP: [<ffffffff81057437>] kthread_data+0x7/0x10 [116655.900375] PGD 1876067 PUD 1877067 PMD 0 [116655.984863] Oops: 0000 [#2] SMP [116656.057855] CPU 0 [116656.081214] Modules linked in: iscsi_scst(O) scst_vdisk(O) scst(O) drbd(O) twofish_x86_64 twofish_generic [116656.243502] XFS (dm-47): xfs_log_force: error 5 returned. [116656.400665] twofish_common serpent_sse2_x86_64 lrw xts gf128mul serpent_generic blowfish_x86_64 blowfish_generic blowfish_common cast5 sha512_generic s] [116656.599500] [116656.599503] Pid: 30173, comm: kworker/0:5 Tainted: G D O 3.4.63-oe64-00000-g1a33902 #38 Intel Corporation S1200BTL/S1200BTL [116656.599505] RIP: 0010:[<ffffffff81057437>] [<ffffffff81057437>] kthread_data+0x7/0x10 [116656.599510] RSP: 0018:ffff88010a9c7a40 EFLAGS: 00010002 [116656.599511] RAX: 0000000000000000 RBX: ffff880233f7f368 RCX: ffffffff81a21500 [116656.599512] RDX: 000005f8a818d26c RSI: 0000000000000000 RDI: ffff880233f7f0d0 [116656.599513] RBP: 0000000000000000 R08: 0000000000000001 R09: ffffea000742d440 [116656.599514] R10: 0000000000000400 R11: ffffffff81068c40 R12: ffff880233f7f0d0 [116656.599515] R13: ffff880235c90000 R14: ffff880237012180 R15: 0000000000000000 [116656.599516] FS: 0000000000000000(0000) GS:ffff880237000000(0000) knlGS:0000000000000000 [116656.599517] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [116656.599518] CR2: fffffffffffffff8 CR3: 0000000001874000 CR4: 00000000000407f0 [116656.599519] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [116656.599521] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [116656.599522] Process kworker/0:5 (pid: 30173, threadinfo ffff88010a9c6000, task ffff880233f7f0d0) [116656.599523] Stack: [116656.599523] ffffffff810530fb ffff880233f7f368 ffff880233f7f0d0 ffff880233f7f0d0 [116656.599525] ffffffff815f3ee2 ffff88010a9c7ad8 ffff880236810240 ffff8801f7f5c930 [116656.599526] ffff8801f7f5c900 000000010a9c7ae8 ffffffff810fe66e ffff880231cf75c0 [116656.599528] Call Trace: [116656.599530] [<ffffffff810530fb>] ? wq_worker_sleeping+0xb/0x70 [116656.599533] [<ffffffff815f3ee2>] ? __schedule+0x372/0x600 [116656.599535] [<ffffffff810fe66e>] ? d_lookup+0x2e/0x60 [116656.599538] [<ffffffff8102e255>] ? default_spin_lock_flags+0x5/0x10 [116656.599540] [<ffffffff810413f1>] ? do_exit+0x5d1/0x8e0 [116656.599542] [<ffffffff812c24b0>] ? __const_udelay+0x40/0x40 [116656.599544] [<ffffffff815f4b7b>] ? _raw_spin_lock_irqsave+0x2b/0x50 [116656.599545] [<ffffffff8103db5e>] ? kmsg_dump+0x6e/0x120 [116656.599547] [<ffffffff815f5dba>] ? oops_end+0xea/0xf0 [116656.599549] [<ffffffff81030a94>] ? no_context+0x1c4/0x2d0 [116656.599551] [<ffffffff815f8371>] ? do_page_fault+0x2b1/0x500 [116656.599552] [<ffffffff8103e6de>] ? printk+0x4e/0x60 [116656.599555] [<ffffffff8105fb97>] ? check_preempt_curr+0x57/0x80 [116656.599557] [<ffffffff8105f21b>] ? __wake_up_common+0x5b/0x90 [116656.599559] [<ffffffff81227465>] ? xfs_alert_tag+0x75/0xa0 [116656.599561] [<ffffffff815f51a5>] ? page_fault+0x25/0x30 [116656.599563] [<ffffffff812c24b0>] ? __const_udelay+0x40/0x40 [116656.599565] [<ffffffff8102e1d6>] ? __ticket_spin_lock+0x6/0x20 [116656.599566] [<ffffffff815f4b45>] ? _raw_spin_lock+0x5/0x10 [116656.599568] [<ffffffff812685f2>] ? xlog_state_done_syncing+0x32/0xc0 [116656.599571] [<ffffffff81051843>] ? process_one_work+0xf3/0x320 [116656.599572] [<ffffffff810534f2>] ? worker_thread+0xe2/0x280 [116656.599574] [<ffffffff81053410>] ? gcwq_mayday_timeout+0x80/0x80 [116656.599575] [<ffffffff8105776b>] ? kthread+0x9b/0xb0 [116656.599577] [<ffffffff815fbde4>] ? kernel_thread_helper+0x4/0x10 [116656.599579] [<ffffffff810576d0>] ? kthread_bind+0x80/0x80 [116656.599581] [<ffffffff815fbde0>] ? gs_change+0x13/0x13 [116656.599581] Code: fe ff ff 90 65 48 8b 04 25 80 c5 00 00 48 8b 80 40 02 00 00 8b 40 f0 c3 66 66 66 90 66 66 66 90 66 66 66 90 48 8b 87 40 02 00 00<48> [116656.599592] RIP [<ffffffff81057437>] kthread_data+0x7/0x10 [116656.599594] RSP<ffff88010a9c7a40> [116656.599595] CR2: fffffffffffffff8 [116656.599596] ---[ end trace 109ecd45394d669d ]--- [116656.599597] Fixing recursive fault but reboot is needed! [116669.563245] XFS (dm-51): xfs_log_force: error 5 returned. It looks like a race condition. Test script source: #!/bin/bash DEV=$1 if [ -z $DEV ]; then echo "This program requires device name as parameter" exit 1 fi function overload() { COUNT=$1 temp_COUNT=$COUNT; while [ -f ./run ]; do while [ $COUNT -ge 1 ]; do if [ -f ./run ]; then dd bs=1024 count=102400 if=/dev/zero of=/$2/"_"$COUNT&> /dev/null fi; let COUNT=$COUNT-1 done; rm $2/*; COUNT=$temp_COUNT; done; } function create_vg() { #create physical volume pvcreate /dev/sda if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create physical volume" exit 1 fi #create volume group vgcreate -v -s 32M vg0 /dev/sda if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create volume group" exit 1 fi VG="vg0" } function create_lv() { local LV="$1" #create logical volume lvcreate -l 500 -n "$VG+$LV" "$VG" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create LV" exit 1 fi mkfs -t xfs -f -l lazy-count=0 /dev/$VG/"$VG+$LV"&>/dev/null if [[ $? -gt 0 ]]; then echo "[ FAIL ] Can't create filesystem" exit 1 fi } function create_snapshots() { for ((i=0; i< 20; i++)); do if [[ $i -lt 10 ]]; then lvcreate -l "64" -n "snap0$i" "$VG" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create snapshot LV" exit 1 fi else lvcreate -l "64" -n "snap$i" "$VG" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create snapshot LV" exit 1 fi fi done } function assign_snapshots() { for ((i=0; i< 20; i++)); do if [[ $i -lt 10 ]]; then lvrename "$VG" "snap0$i" "lv0+snap0$i" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to rename snapshot LV" exit 1 fi else lvrename "$VG" "snap$i" "lv1+snap$i" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to rename snapshot LV" exit 1 fi fi done } function mount_volume() { local MVG=$1 local MLV=$2 mkdir -p "/test/mount/$MVG+$MLV" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to create mounting point" exit 1 fi mount -t xfs -o defaults,usrquota,grpquota,nouuid,noatime,nodiratime "/dev/$MVG/$MVG+$MLV" "/test/mount/$MVG+$MLV" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Unable to mount LV" exit 1 fi } function start_overload() { touch ./run mkdir -p /test/mount/$1+$2/test overload 50 "/test/mount/$1+$2/test" $3& echo "overload $1 /test/mount/$1+$2/test $3&" sleep 4; echo "[ OK ] copying files to $2 started" } function get_snapshot_status() { lvdisplay /dev/$1/$2 | awk ' $0~"LV snapshot status" { print $4 } ' } function remove_snapshot() { local LVG=$1 local LLV=$2 local LSNAP=$3 umount "/test/mount/$LSNAP" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Can't umount snapshot" fi lvremove -sf "/dev/$LVG/$LSNAP" if [[ $? -gt 0 ]]; then echo "[ FAIL ] Can't remove snapshot" fi } function create_snapshot() { local LVG=$1 local LLV=$2 local LSNAP=$3 for((it=0; it<7; it++)); do local ERROR=0 local STATUS=`get_snapshot_status $LVG $LSNAP` if [[ "$STATUS" == "active" ]]; then remove_snapshot $LVG "$LLV+$LSNAP" fi STATUS=`get_snapshot_status $LVG $LSNAP` if [[ "$STATUS" == "active" ]]; then remove_snapshot $LVG "$LLV+$LSNAP" fi CHUNKSIZE=512 for ((ile=0;ile<it/2;ile++)); do CHUNKSIZE=$((CHUNKSIZE/2)) done lvconvert -s -c $CHUNKSIZE "/dev/$LVG/$LVG+$LLV" "/dev/$LVG/$LSNAP" if [[ $? -gt 0 ]]; then ERROR=1 fi #mount snapshot mkdir -p "/test/mount/$LSNAP" mount -t xfs -o nouuid,noatime "/dev/$LVG/$LSNAP" "/test/mount/$LSNAP" if [[ $? -gt 0 ]]; then ERROR=2 fi create_time=`date "+%Y-%m-%d %H:%M:%S"` if [ $ERROR -ne 0 ]; then remove_snapshot $LVG $LLV $LSNAP sleep 5 else break; fi done } function start_snap() { local i; for((i=0; i<20; i++)); do echo "Starting snap$i : `date`" local START=`date +%s` if [[ $i -lt 10 ]]; then snapname="lv0+snap0"$i create_snapshot $VG "lv0" $snapname else snapname="lv1+snap"$i create_snapshot $VG "lv1" $snapname fi if [ -z "`lvs | grep $snapname | grep $VG+lv`" ]; then echo "[ FAIL ] $snapname not activated !!!" else echo "[ OK ] $snapname activated." fi if [ -z "`mount | grep $snapname`" ]; then echo "[ FAIL ] $snapname not mounted !!!">> $LOGFILE else echo "[ OK ] $snapname mounted." fi local STOP=$[`date +%s`-$START] echo "Starting time : $STOP s." echo "---------------------------" sleep 2 done; } function stop_snap() { local i for((i=0; i<20; i++)); do echo "Stopping snap$i : `date`" local START=`date +%s` if [[ $i -lt 10 ]]; then snapname="lv0+snap0"$i remove_snapshot $VG "lv0" $snapname else snapname="lv1+snap"$i remove_snapshot $VG "lv1" $snapname fi if [ "`lvs | grep $snapname | grep $VG+lv`" ]; then echo "[ FAIL ] $snapname still active !!!" else echo "[ OK ] $snapname deactivated." fi; if [ "`mount | grep $snapname`" ]; then echo "[ FAIL ] $snapname still mounted !!!">> $LOGFILE else echo "[ OK ] $snapname umounted." fi; local STOP=$[`date +%s`-$START] echo "Stopping time : $STOP s." echo "---------------------------" sleep 2 done; } echo "-------- Creating vg0 on $DEV..." create_vg echo "[ OK ] Volume group created successfully" echo "-------- Creating logical volumes on $VG..." create_lv "lv0" create_lv "lv1" echo "[ OK ] Logical volumes created successfully" echo "-------- Mounting logical volumes..." mount_volume "$VG" "lv0" mount_volume "$VG" "lv1" echo "[ OK ] Logical volumes mounted successfully" echo "-------- Creating snapshots..." create_snapshots echo "[ OK ] Snapshots created successfully" echo "-------- Assigning snapshots..." assign_snapshots echo "[ OK ] Snapshots assigned successfully" echo "-------- Start overload..." start_overload "vg0" "lv0" start_overload "vg0" "lv1" while true; do start_snap 2> /dev/null stop_snap 2> /dev/null done rm ./run -- Best regards Arkadiusz Bubała Open-E Poland Sp. z o.o. www.open-e.com _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs