I have a two-node + qdisk cluster3 (3.0.0) setup running under Debian
with a 2.6.30 kernel that otherwise works fine. In the past it has fenced
and failed over properly to recover from a failed node.
But yesterday one of the status checks returned 1, and the subsequent
automatic stop/start of the service also returned a non-zero status. This
put my cluster service into a 'failed' state and all related components
were stopped. Everything was resolved with a manual service disable and
enable.
Should the secondary have fenced in this case, or is that reserved for
only when communications in the cluster fail? I would have thought that
it would at least have tried to start the service. A clustat on either
machine showed the service as "failed", and nothing was logged on the
non-active node.
Since a failover (rather than giving up) would be the proper thing, I'm
assuming a config issue. Any pointers?
<?xml version="1.0"?>
<!--
  cluster.conf for cluster "alpha": two nodes plus a quorum disk.
  Each node and the quorum disk carries one vote, for three expected votes.
-->
<cluster name="alpha" config_version="42">
  <cman two_node="0" expected_votes="3"/>

  <clusternodes>
    <!--
      Each node has one fence method that lists two devices. Both are
      switched off first, then both are switched back on.
      NOTE(review): presumably one outlet per redundant power supply; confirm.
    -->
    <clusternode name="wonder-p" votes="1" nodeid="1">
      <fence>
        <method name="single">
          <device name="pwr01" option="off"/>
          <device name="pwr02" option="off"/>
          <device name="pwr01" option="on"/>
          <device name="pwr02" option="on"/>
        </method>
      </fence>
    </clusternode>
    <clusternode name="nicks-p" votes="1" nodeid="2">
      <fence>
        <method name="single">
          <device name="pwr03" option="off"/>
          <device name="pwr04" option="off"/>
          <device name="pwr03" option="on"/>
          <device name="pwr04" option="on"/>
        </method>
      </fence>
    </clusternode>
  </clusternodes>

  <!--
    Quorum disk with a single ping heuristic.
    NOTE(review): with Linux iputils ping, -t sets the TTL rather than a
    timeout (-w sets a deadline); confirm that -t1 is what is intended here.
  -->
  <quorumd interval="1" tko="10" votes="1" label="quorumdisk">
    <heuristic program="ping 172.25.19.254 -c1 -t1"
               score="1"
               interval="2"
               tko="3"/>
  </quorumd>

  <fence_daemon post_join_delay="20"/>

  <!--
    Four fence_apc_snmp devices on two PDUs: pwr01/pwr03 are ports 4 and 3 of
    pdu-paul-2-2, pwr02/pwr04 are ports 4 and 3 of pdu-paul-2-3.
  -->
  <fencedevices>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-2"
                 port="4"
                 name="pwr01"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-3"
                 port="4"
                 name="pwr02"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-2"
                 port="3"
                 name="pwr03"
                 udpport="161"/>
    <fencedevice agent="fence_apc_snmp"
                 ipaddr="pdu-paul-2-3"
                 port="3"
                 name="pwr04"
                 udpport="161"/>
  </fencedevices>

  <rm>
    <!-- Restricted, unordered failover domain containing both nodes. -->
    <failoverdomains>
      <failoverdomain name="mailcluster" restricted="1" ordered="0">
        <failoverdomainnode name="wonder-p" priority="1"/>
        <failoverdomainnode name="nicks-p" priority="1"/>
      </failoverdomain>
    </failoverdomains>

    <!--
      The MailHost service: two file systems, one IP address and a set of
      scripts, bound to the "mailcluster" failover domain.
      NOTE(review): rgmanager appears to mark a service "failed" when a
      resource's stop action returns non-zero; verify that every script below
      returns 0 from "stop" when the daemon is already stopped.
    -->
    <service name="MailHost" autostart="1" domain="mailcluster">
      <script name="MailHost-early"
              file="/etc/cluster/MailHost-misc-early"/>

      <fs name="mailhome"
          mountpoint="/home"
          device="LABEL=home"
          fstype="ext4"
          force_unmount="1"
          active_monitor="1"
          options="defaults,noatime,nodiratime"/>
      <fs name="mailcluster"
          mountpoint="/var/cluster"
          device="LABEL=cluster"
          fstype="ext3"
          force_unmount="1"
          active_monitor="1"
          options="defaults"/>

      <ip address="172.25.16.58" monitor_link="1"/>

      <script name="saslauthd" file="/etc/cluster/saslauthd-cluster"/>
      <script name="postfix" file="/etc/cluster/postfix-cluster"/>

      <!-- The next four scripts are flagged as independent subtrees. -->
      <script name="dovecot"
              file="/etc/cluster/dovecot-wrapper"
              __independent_subtree="1"/>
      <script name="mailman"
              file="/etc/cluster/mailman-wrapper"
              __independent_subtree="1"/>
      <script name="apache2-mailhost"
              file="/etc/cluster/apache2-mailhost"
              __independent_subtree="1"/>
      <script name="usermin"
              file="/etc/init.d/usermin"
              __independent_subtree="1"/>

      <script name="MailHost-late"
              file="/etc/cluster/MailHost-misc-late"/>
    </service>
  </rm>
</cluster>
Dec 15 12:37:00 bash Executing /etc/cluster/postfix-cluster status
Dec 15 12:37:00 bash Executing /etc/cluster/dovecot-wrapper status
Dec 15 12:37:00 bash Executing /etc/cluster/mailman-wrapper status
Dec 15 12:37:00 bash Executing /etc/cluster/apache2-mailhost status
Dec 15 12:37:00 bash Executing /etc/init.d/usermin status
Dec 15 12:37:00 bash script:usermin: status of /etc/init.d/usermin failed (returned 1)
Dec 15 12:37:01 bash Executing /etc/cluster/MailHost-misc-late status
Dec 15 12:37:01 bash Executing /etc/init.d/usermin stop
Dec 15 12:37:03 bash Executing /etc/init.d/usermin start
Dec 15 12:37:19 bash script:usermin: start of /etc/init.d/usermin failed (returned 98)
Dec 15 12:37:20 bash Executing /etc/cluster/MailHost-misc-late stop
Dec 15 12:37:21 bash Executing /etc/init.d/usermin stop
Dec 15 12:37:21 bash script:usermin: stop of /etc/init.d/usermin failed (returned 1)
Dec 15 12:37:21 bash Executing /etc/cluster/apache2-mailhost stop
Dec 15 12:37:24 bash Executing /etc/cluster/mailman-wrapper stop
Dec 15 12:37:42 bash Executing /etc/cluster/dovecot-wrapper stop
Dec 15 12:37:43 bash Executing /etc/cluster/postfix-cluster stop
Dec 15 12:37:56 bash Executing /etc/cluster/saslauthd-cluster stop
Dec 15 12:38:07 bash Executing /etc/cluster/MailHost-misc-early stop
Dec 15 12:38:08 bash Removing IPv4 address 172.25.16.58/22 from eth0
Dec 15 12:38:21 bash unmounting /var/cluster
Dec 15 12:38:21 bash Forcefully unmounting /var/cluster
Dec 15 12:38:22 bash killing process 6844 (daemon atd /var/cluster)
Dec 15 12:38:22 bash killing process 4274 (root bash /var/cluster)
Dec 15 12:38:22 bash killing process 6836 (root cron /var/cluster)
Dec 15 12:38:30 bash unmounting /var/cluster
Dec 15 12:38:32 bash unmounting /home
Dec 15 12:38:32 bash Forcefully unmounting /home
Dec 15 12:38:33 bash killing process 27678 (root bacula-fd /home)
Dec 15 12:38:41 bash unmounting /home
Dec 15 12:50:08 bash Executing /etc/cluster/MailHost-misc-late stop
Dec 15 12:50:08 bash Executing /etc/init.d/usermin stop
Dec 15 12:50:08 bash script:usermin: stop of /etc/init.d/usermin failed (returned 1)
Dec 15 12:50:08 bash Executing /etc/cluster/apache2-mailhost stop
Dec 15 12:50:09 bash Executing /etc/cluster/mailman-wrapper stop
Dec 15 12:50:09 bash script:mailman: stop of /etc/cluster/mailman-wrapper failed (returned 1)
Dec 15 12:50:09 bash Executing /etc/cluster/dovecot-wrapper stop
Dec 15 12:50:09 bash Executing /etc/cluster/postfix-cluster stop
Dec 15 12:50:09 bash Executing /etc/cluster/saslauthd-cluster stop
Dec 15 12:50:10 bash Executing /etc/cluster/MailHost-misc-early stop
Dec 15 12:50:10 bash 172.25.16.58 is not configured
Dec 15 12:50:10 bash /dev/dm-1 is not mounted
Dec 15 12:50:10 bash /dev/dm-0 is not mounted
Dec 15 12:50:20 bash Unknown file system type 'ext4' for device /dev/dm-0. Assuming fsck is required.
Dec 15 12:50:20 bash Running fsck on /dev/dm-0
Dec 15 12:50:21 bash mounting /dev/dm-0 on /home
Dec 15 12:50:21 bash mount -t ext4 -o defaults,noatime,nodiratime /dev/dm-0 /home
Dec 15 12:50:22 bash quotaon not found in /bin:/sbin:/usr/bin:/usr/sbin
Dec 15 12:50:22 bash mounting /dev/dm-1 on /var/cluster
Dec 15 12:50:23 bash mount -t ext3 -o defaults /dev/dm-1 /var/cluster
Dec 15 12:50:23 bash quotaon not found in /bin:/sbin:/usr/bin:/usr/sbin
Dec 15 12:50:23 bash Link for eth0: Detected
Dec 15 12:50:23 bash Adding IPv4 address 172.25.16.58/22 to eth0
Dec 15 12:50:23 bash Sending gratuitous ARP: 172.25.16.58 00:30:48:c6:de:24 brd ff:ff:ff:ff:ff:ff
Dec 15 12:50:24 bash Executing /etc/cluster/MailHost-misc-early start
... startup continues fine
--
Linux-cluster mailing list
Linux-cluster@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/linux-cluster