Hi. I'm having some issues with a two-node failover cluster on RHEL4/U3 with kernel 2.6.9-34.0.1.ELsmp, ccs-1.0.3-0, cman-1.0.4-0, fence-1.32.18-0 and rgmanager-1.9.46-0. After a mishap where I accidentaly caused a failover of services with power fencing of server01, the system will not rejoin the cluster after boot. I have tried using both the init.d scripts and starting the daemons manually to troubleshoot this further, to no avail. I'm able to start ccsd properly (although it logs the cluster as inquorate) but it fails completely on cman, claiming that connection is refused. If anyone could help me by giving me some tips, directing me to the proper documentation addressing this issue or downright pointing out my problem, I would be most grateful. [server01] # service ccsd start Starting ccsd: [ OK ] ---8<--- /var/log/messages Sep 14 00:33:28 server01 ccsd[30227]: Starting ccsd 1.0.3: Sep 14 00:33:28 server01 ccsd[30227]: Built: Jan 25 2006 16:54:43 Sep 14 00:33:28 server01 ccsd[30227]: Copyright (C) Red Hat, Inc. 2004 All rights reserved. Sep 14 00:33:28 server01 ccsd[30227]: Connected to cluster infrastruture via: CMAN/SM Plugin v1.1.5 Sep 14 00:33:28 server01 ccsd[30227]: Initial status:: Inquorate Sep 14 00:33:29 server01 ccsd: startup succeeded ---8<--- [server01] # service cman start Starting cman: [FAILED] ---8<--- /var/log/messages Sep 14 00:39:07 server01 ccsd[31417]: Cluster is not quorate. Refusing connection. Sep 14 00:39:07 server01 ccsd[31417]: Error while processing connect: Connection refused Sep 14 00:39:07 server01 ccsd[31417]: cluster.conf (cluster name = something_cluster, version = 46) found. Sep 14 00:39:07 server01 ccsd[31417]: Remote copy of cluster.conf is from quorate node. Sep 14 00:39:07 server01 ccsd[31417]: Local version # : 46 Sep 14 00:39:07 server01 ccsd[31417]: Remote version #: 46 Sep 14 00:39:07 server01 cman: cman_tool: Node is already active failed Sep 14 00:39:12 server01 kernel: CMAN: sending membership request ---8<--- [server01] # cat /proc/cluster/status Protocol version: 5.0.1 Config version: 46 Cluster name: something_cluster Cluster ID: 47540 Cluster Member: No Membership state: Joining [server01] # cat /proc/cluster/nodes Node Votes Exp Sts Name [server02] # cat /proc/cluster/status Protocol version: 5.0.1 Config version: 46 Cluster name: something_cluster Cluster ID: 47540 Cluster Member: Yes Membership state: Cluster-Member Nodes: 1 Expected_votes: 1 Total_votes: 1 Quorum: 1 Active subsystems: 4 Node name: server02 Node addresses: xx.xx.xx.134 [server02] # cat /proc/cluster/nodes Node Votes Exp Sts Name 1 1 1 X server01 2 1 1 M server02 [server01] # cat /etc/cluster/cluster.conf ---8<--- <?xml version="1.0"?> <cluster config_version="46" name="something_cluster"> <fence_daemon post_fail_delay="0" post_join_delay="30"/> <clusternodes> <clusternode name="server01" votes="1"> <fence> <method name="1"> <device name="APC-LEFT" option="off" port="8" switch="0"/> <device name="APC-RIGHT" option="off" port="8" switch="0"/> <device name="APC-LEFT" option="on" port="8" switch="0"/> <device name="APC-RIGHT" option="on" port="8" switch="0"/> </method> </fence> </clusternode> <clusternode name="server02" votes="1"> <fence> <method name="1"> <device name="APC-LEFT" option="off" port="4" switch="0"/> <device name="APC-RIGHT" option="off" port="4" switch="0"/> <device name="APC-LEFT" option="on" port="4" switch="0"/> <device name="APC-RIGHT" option="on" port="4" switch="0"/> </method> </fence> </clusternode> </clusternodes> <cman expected_votes="1" two_node="1"/> <fencedevices> <fencedevice agent="fence_apc" ipaddr="xx.xx.xx.10" login="secret" name="APC-LEFT" passwd="secret"/> <fencedevice agent="fence_apc" ipaddr="xx.xx.xx.11" login="secret" name="APC-RIGHT" passwd="secret"/> </fencedevices> <rm> <failoverdomains> <failoverdomain name="OX" ordered="1" restricted="0"> <failoverdomainnode name="server01" priority="1"/> </failoverdomain> <failoverdomain name="IMAP" ordered="1" restricted="0"> <failoverdomainnode name="server01" priority="1"/> </failoverdomain> <failoverdomain name="NFS" ordered="1" restricted="0"> <failoverdomainnode name="server02" priority="1"/> </failoverdomain> <failoverdomain name="LDAP" ordered="1"> <failoverdomainnode name="server02" priority="1"/> </failoverdomain> <failoverdomain name="PGSQL" ordered="1" restricted="0"> <failoverdomainnode name="server02" priority="1"/> </failoverdomain> </failoverdomains> <resources/> <service autostart="1" domain="PGSQL" name="OX-OX"> <script file="/etc/init.d/openexchange" name="OX"/> <ip address="192.168.xx.xx" monitor_link="1"/> <fs device="/dev/emcpowera9" force_fsck="0" force_unmount="1" fsid="39155" fstype="ext3" mountpoint="/var/opt/openexchange/filespool" name="OX" options="" self_fence="0"/> <script file="/etc/init.d/openexchange-daemons" name="XMLRPC"/> <script file="/etc/init.d/tomcat5" name="Tomcat"/> <ip address="192.168.xx.xx" monitor_link="1"/> </service> <service autostart="1" domain="IMAP" name="OX-IMAP"> <ip address="192.168.xx.xx" monitor_link="1"/> <fs device="/dev/emcpowera7" force_fsck="0" force_unmount="1" fsid="63880" fstype="ext3" mountpoint="/var/lib/imap" name="IMAP" options="" self_fence="0"/> <fs device="/dev/emcpowera10" force_fsck="0" force_unmount="1" fsid="63324" fstype="ext3" mountpoint="/var/spool/imap1" name="IMAP1" options="" self_fence="0"/> <script file="/etc/init.d/saslauthd" name="SASL"/> <script file="/etc/init.d/cyrus-imapd" name="Cyrus"/> <fs device="/dev/emcpowerb5" force_fsck="0" force_unmount="1" fsid="42726" fstype="ext3" mountpoint="/var/spool/imap2" name="IMAP2" options="" self_fence="0"/> <fs device="/dev/emcpowerb6" force_fsck="0" force_unmount="1" fsid="38512" fstype="ext3" mountpoint="/var/spool/imap3" name="IMAP3" options="" self_fence="0"/> <fs device="/dev/emcpowerc5" force_fsck="0" force_unmount="1" fsid="979" fstype="ext3" mountpoint="/var/spool/imap4" name="IMAP4" options="" self_fence="0"/> <fs device="/dev/emcpowerc6" force_fsck="0" force_unmount="1" fsid="13125" fstype="ext3" mountpoint="/var/spool/imap5" name="IMAP5" options="" self_fence="0"/> </service> <service autostart="1" domain="NFS" name="OX-NFS"> <ip address="192.168.xx.xx" monitor_link="1"/> <fs device="/dev/emcpowera8" force_fsck="0" force_unmount="1" fsid="37141" fstype="ext3" mountpoint="/var/lib/xxxxxxxx" name="NFS" options="" self_fence="0"/> <script file="/etc/init.d/nfs" name="NFS"/> <script file="/etc/init.d/nfslock" name="NFSLOCK"/> </service> <service autostart="1" domain="LDAP" name="OX-LDAP"> <ip address="192.168.xx.xx" monitor_link="1"/> <fs device="/dev/emcpowerb8" force_fsck="0" force_unmount="1" fsid="12853" fstype="ext3" mountpoint="/var/symas/openldap-data" name="DATA" options="" self_fence="0"/> <fs device="/dev/emcpowerb9" force_fsck="0" force_unmount="1" fsid="11240" fstype="ext3" mountpoint="/var/symas/openldap-logs" name="LOGS" options="" self_fence="0"/> <fs device="/dev/emcpowerb10" force_fsck="0" force_unmount="1" fsid="10234" fstype="ext3" mountpoint="/var/symas/openldap-slurp" name="SLURP" options="" self_fence="0"/> <script file="/etc/init.d/cdsserver" name="LDAP"/> </service> <service autostart="1" domain="PGSQL" name="OX-PGSQL"> <ip address="192.168.xx.xx" monitor_link="1"/> <fs device="/dev/emcpowera5" force_fsck="0" force_unmount="1" fsid="43285" fstype="ext3" mountpoint="/var/lib/pgsql" name="PGSQL" options="" self_fence="0"/> <script file="/etc/init.d/postgresql" name="PGSQL"/> </service> </rm> </cluster> ---8<--- [server01] # cat /etc/hosts ---8<--- 127.0.0.1 localhost.localdomain localhost xx.xx.xx.133 server01.example.com server01 xx.xx.xx.134 server02.example.com server02 ---8<--- Thanks, .../Bosse -- Linux-cluster mailing list Linux-cluster@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/linux-cluster