2-node cluster with a quorum disk running RHCS on RHEL 5.7.
Attempting to debug why we get the following qdisk issues:
Nov 7 16:24:09 host532 qdiskd[5750]: qdiskd: write (system call) has
hung for 13 seconds
Nov 7 16:24:09 host532 qdiskd[5750]: In 14 more seconds, we will be
evicted
Nov 7 16:24:11 host532 openais[5711]: [CMAN ] lost contact with quorum
device
Nov 7 16:24:29 host532 openais[5711]: [CMAN ] cman killed by node 1
because we were killed by cman_tool or other application
And the node where the timeout happens is usually fenced. Is this
something to do with my qdisk interval/tko timings of:
quorumd interval="3" label="qdisk" min_score="1" tko="9" votes="1"
cluster.conf attached.
Thanks
Michael
<?xml version="1.0" encoding="UTF-8"?>
<!-- RHCS cluster.conf: 2-node Oracle HA cluster with a quorum disk.        -->
<!-- config_version bumped (28 -> 29): RHCS requires a version increment    -->
<!-- for any change so ccsd/cman propagate the new configuration.           -->
<cluster alias="oracle_HA" config_version="29" name="oracle_HA">
  <!-- clean_start="1": skip startup fencing of nodes not yet in the cluster;
       30s post_join_delay gives peers time to join before fencing. -->
  <fence_daemon clean_start="1" post_fail_delay="0" post_join_delay="30"/>
  <clusternodes>
    <clusternode name="host531-clusternode" nodeid="1" votes="1">
      <fence>
        <method name="1">
          <device lanplus="1" name="host531_fence"/>
        </method>
      </fence>
    </clusternode>
    <clusternode name="host532-clusternode" nodeid="2" votes="1">
      <fence>
        <method name="1">
          <device lanplus="1" name="host532_fence"/>
        </method>
      </fence>
    </clusternode>
  </clusternodes>
  <!-- expected_votes = 2 node votes + 1 qdisk vote; quorum is therefore 2,
       so one node plus the quorum disk can keep the cluster quorate. -->
  <cman broadcast="yes" expected_votes="3"/>
  <fencedevices>
    <!-- IPMI-over-LAN fencing via each host's iLO; lanplus is set here and
         repeated on the per-node <device> (redundant but harmless). -->
    <fencedevice agent="fence_ipmilan" ipaddr="host531-ilo" login="fence_host531" name="host531_fence" passwd="fakepass" lanplus="1" auth="password"/>
    <fencedevice agent="fence_ipmilan" ipaddr="host532-ilo" login="fence_host532" name="host532_fence" passwd="fakepass" lanplus="1" auth="password"/>
  </fencedevices>
  <rm>
    <failoverdomains>
      <!-- Ordered, restricted domain: service prefers host531 (priority 1)
           and will not fail back automatically (nofailback="1"). -->
      <failoverdomain name="oracle_failover" nofailback="1" ordered="1" restricted="1">
        <failoverdomainnode name="host531-clusternode" priority="1"/>
        <failoverdomainnode name="host532-clusternode" priority="2"/>
      </failoverdomain>
    </failoverdomains>
    <resources>
      <ip address="192.168.146.240" monitor_link="1"/>
      <ip address="192.168.98.240" monitor_link="1"/>
      <clusterfs device="/dev/mapper/vglivedatabase-lvlivedatabase" force_unmount="1" fsid="42792" fstype="gfs2" mountpoint="/database" name="livedatabase" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveflashback-lvliveflashback" force_unmount="1" fsid="29561" fstype="gfs2" mountpoint="/flashback" name="liveflashback" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveflashbackarchive-lvliveflashbackarchive" force_unmount="1" fsid="28043" fstype="gfs2" mountpoint="/flashbackarchive" name="liveflashbackarchive" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveredo1-lvliveredo1" force_unmount="1" fsid="1606" fstype="gfs2" mountpoint="/redo1" name="liveredo1" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveredo2-lvliveredo2" force_unmount="1" fsid="22524" fstype="gfs2" mountpoint="/redo2" name="liveredo2" self_fence="0"/>
      <script file="/etc/init.d/oracle" name="oracle_script"/>
    </resources>
    <!-- Dependency tree: the GFS2 mounts are children of the second IP and
         the oracle script is innermost, so rgmanager starts IP -> mounts ->
         script and stops them in reverse order on relocate. -->
    <service autostart="1" domain="oracle_failover" exclusive="0" name="oracledb" recovery="relocate">
      <ip ref="192.168.146.240"/>
      <ip ref="192.168.98.240">
        <clusterfs ref="livedatabase">
          <clusterfs ref="liveflashback">
            <clusterfs ref="liveflashbackarchive">
              <clusterfs ref="liveredo1">
                <clusterfs ref="liveredo2">
                  <script ref="oracle_script"/>
                </clusterfs>
              </clusterfs>
            </clusterfs>
          </clusterfs>
        </clusterfs>
      </ip>
    </service>
  </rm>
  <!-- qdisk eviction window = interval * tko = 3s * 9 = 27s.
       NOTE(review): the reported "write has hung ... will be evicted"
       followed by CMAN killing the node suggests CMAN's own failure timers
       fire inside this 27s window; Red Hat guidance is that the CMAN
       quorum_dev_poll / totem token timeouts must comfortably exceed the
       qdisk timeout - confirm and tune those alongside interval/tko. -->
  <quorumd interval="3" label="qdisk" min_score="1" tko="9" votes="1">
    <!-- FIX: added -w1 (deadline). ping's -t sets the TTL, not a timeout,
         so without -w a single ping could hang well past the 5s heuristic
         interval; -t1 is kept to restrict replies to the one-hop gateway. -->
    <heuristic interval="5" program="ping -t1 -c1 -w1 192.168.98.1" score="1"/>
    <heuristic interval="5" program="ping -t1 -c1 -w1 192.168.146.1" score="1"/>
  </quorumd>
</cluster>
--
Linux-cluster mailing list
Linux-cluster@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/linux-cluster