GFS panic: sm_membership.c

	I recently started playing around with GFS and I'm trying to
get it working using AoE/vblade to share a device. I originally tried
the GFS RPMs that came with FC4, but lock_dlm had a bunch of missing
symbols, so I fell back to building the cluster package from
sources.redhat.com. Here is the setup:
	
	Two nodes:

gandolf: 192.168.1.16	(Yeah, I know it's spelled wrong)
	 Fedora Core 3
	 Kernel: 2.6.12.2
	 cluster-1.00.00
	 aoe-tools 4

jupiter: 192.168.1.20
	 Fedora Core 4
	 Kernel: 2.6.12-1.1387_FC4smp
	 cluster-1.00.00
	 vblade-5
	 5 x 250GB SATA HDs in software RAID5 (/dev/md0)
	 /dev/vg1/media: 500GB LV in vg1 (on /dev/md0)

cluster.conf:

<?xml version="1.0"?>
<cluster name="mythtv" config_version="3">

<cman two_node="1" expected_votes="1">
</cman>

<clusternodes>
<clusternode name="jupiter">
        <fence>
                <method name="single">
                        <device name="human" ipaddr="192.168.1.20"/>
                </method>
        </fence>
</clusternode>

<clusternode name="gandolf">
        <fence>
                <method name="single">
                        <device name="human" ipaddr="192.168.1.16"/>
                </method>
        </fence>
</clusternode>
</clusternodes>

<fencedevices>
        <fencedevice name="human" agent="fence_manual"/>
</fencedevices>

</cluster>
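
# Both nodes carry the same file as /etc/cluster/cluster.conf; before
# copying it around I do a quick well-formedness check (the xmllint
# step is just a habit of mine, nothing cluster-specific)

[root@jupiter ~]# xmllint --noout /etc/cluster/cluster.conf
[root@jupiter ~]# scp /etc/cluster/cluster.conf gandolf:/etc/cluster/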

# Start the cluster services

[root@jupiter ~]# modprobe gfs
[root@jupiter ~]# modprobe lock_dlm
[root@jupiter ~]# ccsd
[root@jupiter ~]# cman_tool -w join
[root@jupiter ~]# fence_tool -w join

[root@gandolf ~]# modprobe gfs
[root@gandolf ~]# modprobe lock_dlm
[root@gandolf ~]# ccsd
[root@gandolf ~]# cman_tool -w join
[root@gandolf ~]# fence_tool -w join
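
# Sanity check before going further: with the kernel cman in
# cluster-1.x, membership and the fence service should be visible
# from either node

[root@jupiter ~]# cman_tool status
[root@jupiter ~]# cat /proc/cluster/nodes
[root@jupiter ~]# cat /proc/cluster/services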

# Create the filesystem and export it with AoE

[root@jupiter ~]# gfs_mkfs -p lock_dlm -t mythtv:media -j 2 /dev/vg1/media 
[root@jupiter ~]# /usr/local/build/vblade-5/vblade 0 0 eth0 /dev/vg1/media &
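
# Read the lock protocol and lock table back out of the superblock to
# rule out a mkfs mixup; gfs_tool should report lock_dlm and
# mythtv:media here, if I have the syntax right

[root@jupiter ~]# gfs_tool sb /dev/vg1/media proto
[root@jupiter ~]# gfs_tool sb /dev/vg1/media table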

# Verify the device is available and do a test mount, then unmount

[root@gandolf ~]# modprobe aoe
[root@gandolf ~]# aoe-stat
    e0.0            eth0              up
[root@gandolf ~]# mount -t gfs /dev/etherd/e0.0 /san/media/
[root@gandolf ~]# df -k /san/media
Filesystem           1K-blocks      Used Available Use% Mounted on
/dev/etherd/e0.0     523969792       212 523969580   1% /san/media
[root@gandolf ~]# umount /san/media
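
# Reading the superblock over the AoE path too should prove both nodes
# are looking at the very same filesystem (same gfs_tool caveat as above)

[root@gandolf ~]# gfs_tool sb /dev/etherd/e0.0 table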

# Test mount from the other node, this time leave it mounted

[root@jupiter ~]# mount -t gfs /dev/vg1/media /san/media
[root@jupiter ~]# df -k /san/media
Filesystem           1K-blocks      Used Available Use% Mounted on
/dev/mapper/vg1-media
                     523969792       212 523969580   1% /san/media

# Now try mounting on *both* nodes at the same time
[root@gandolf ~]# mount -t gfs /dev/etherd/e0.0 /san/media/

(from gandolf dmesg:)
GFS: Trying to join cluster "lock_dlm", "mythtv:media"
CMAN: removing node jupiter from the cluster : Missed too many heartbeats
dlm: media: dlm_dir_rebuild_local failed -1

	At this point, the mount command hangs and the other node
(jupiter in this case) panics with a message about an assertion
at line 106 of sm_membership.c. Whichever node mounts the
filesystem second panics the first. So close... Anything
obvious that I am doing wrong?
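
	Next time it panics I'll try to grab the whole trace with
netconsole rather than copying it off the screen; something like this
should do it (untested here, and the MAC is a placeholder for
gandolf's NIC):

# On jupiter: mirror console output over UDP to gandolf (note eth0
# also carries the AoE traffic, so this may be lossy mid-failure)
[root@jupiter ~]# modprobe netconsole netconsole=6665@192.168.1.20/eth0,6666@192.168.1.16/00:11:22:33:44:55

# On gandolf: catch it with a UDP listener (traditional netcat syntax)
[root@gandolf ~]# nc -u -l -p 6666 > jupiter-panic.log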

Many Thanks

-poul
