2-node cluster with a quorum disk running RHCS on RHEL 5.7.
Attempting to debug why we get the following qdisk issues:
Nov 7 16:24:09 host532 qdiskd[5750]: qdiskd: write (system call) has
hung for 13 seconds
Nov 7 16:24:09 host532 qdiskd[5750]: In 14 more seconds, we will be
evicted
Nov 7 16:24:11 host532 openais[5711]: [CMAN ] lost contact with quorum
device
Nov 7 16:24:29 host532 openais[5711]: [CMAN ] cman killed by node 1
because we were killed by cman_tool or other application
And the node where the timeout happens is usually fenced. Is this
something to do with my qdisk interval/tko timings of:
quorumd interval="3" label="qdisk" min_score="1" tko="9" votes="1"
cluster.conf attached.
Thanks
Michael
<?xml version="1.0" encoding="UTF-8"?>
<!-- RHCS cluster.conf: 2-node Oracle HA cluster with a quorum disk.        -->
<!-- config_version bumped (28 -> 29): RHCS requires a version increment    -->
<!-- for any change so ccsd/cman propagate the new configuration.           -->
<cluster alias="oracle_HA" config_version="29" name="oracle_HA">
  <!-- clean_start="1": skip startup fencing of nodes not yet in the cluster;
       30s post_join_delay gives peers time to join before fencing. -->
  <fence_daemon clean_start="1" post_fail_delay="0" post_join_delay="30"/>
  <clusternodes>
    <clusternode name="host531-clusternode" nodeid="1" votes="1">
      <fence>
        <method name="1">
          <device lanplus="1" name="host531_fence"/>
        </method>
      </fence>
    </clusternode>
    <clusternode name="host532-clusternode" nodeid="2" votes="1">
      <fence>
        <method name="1">
          <device lanplus="1" name="host532_fence"/>
        </method>
      </fence>
    </clusternode>
  </clusternodes>
  <!-- expected_votes = 2 node votes + 1 qdisk vote; quorum is therefore 2,
       so one node plus the quorum disk can keep the cluster quorate. -->
  <cman broadcast="yes" expected_votes="3"/>
  <fencedevices>
    <!-- IPMI-over-LAN fencing via each host's iLO; lanplus is set here and
         repeated on the per-node <device> (redundant but harmless). -->
    <fencedevice agent="fence_ipmilan" ipaddr="host531-ilo" login="fence_host531" name="host531_fence" passwd="fakepass" lanplus="1" auth="password"/>
    <fencedevice agent="fence_ipmilan" ipaddr="host532-ilo" login="fence_host532" name="host532_fence" passwd="fakepass" lanplus="1" auth="password"/>
  </fencedevices>
  <rm>
    <failoverdomains>
      <!-- Ordered, restricted domain: service prefers host531 (priority 1)
           and will not fail back automatically (nofailback="1"). -->
      <failoverdomain name="oracle_failover" nofailback="1" ordered="1" restricted="1">
        <failoverdomainnode name="host531-clusternode" priority="1"/>
        <failoverdomainnode name="host532-clusternode" priority="2"/>
      </failoverdomain>
    </failoverdomains>
    <resources>
      <ip address="192.168.146.240" monitor_link="1"/>
      <ip address="192.168.98.240" monitor_link="1"/>
      <clusterfs device="/dev/mapper/vglivedatabase-lvlivedatabase" force_unmount="1" fsid="42792" fstype="gfs2" mountpoint="/database" name="livedatabase" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveflashback-lvliveflashback" force_unmount="1" fsid="29561" fstype="gfs2" mountpoint="/flashback" name="liveflashback" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveflashbackarchive-lvliveflashbackarchive" force_unmount="1" fsid="28043" fstype="gfs2" mountpoint="/flashbackarchive" name="liveflashbackarchive" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveredo1-lvliveredo1" force_unmount="1" fsid="1606" fstype="gfs2" mountpoint="/redo1" name="liveredo1" self_fence="0"/>
      <clusterfs device="/dev/mapper/vgliveredo2-lvliveredo2" force_unmount="1" fsid="22524" fstype="gfs2" mountpoint="/redo2" name="liveredo2" self_fence="0"/>
      <script file="/etc/init.d/oracle" name="oracle_script"/>
    </resources>
    <!-- Dependency tree: the GFS2 mounts are children of the second IP and
         the oracle script is innermost, so rgmanager starts IP -> mounts ->
         script and stops them in reverse order on relocate. -->
    <service autostart="1" domain="oracle_failover" exclusive="0" name="oracledb" recovery="relocate">
      <ip ref="192.168.146.240"/>
      <ip ref="192.168.98.240">
        <clusterfs ref="livedatabase">
          <clusterfs ref="liveflashback">
            <clusterfs ref="liveflashbackarchive">
              <clusterfs ref="liveredo1">
                <clusterfs ref="liveredo2">
                  <script ref="oracle_script"/>
                </clusterfs>
              </clusterfs>
            </clusterfs>
          </clusterfs>
        </clusterfs>
      </ip>
    </service>
  </rm>
  <!-- qdisk eviction window = interval * tko = 3s * 9 = 27s.
       NOTE(review): the reported "write has hung ... will be evicted"
       followed by CMAN killing the node suggests CMAN's own failure timers
       fire inside this 27s window; Red Hat guidance is that the CMAN
       quorum_dev_poll / totem token timeouts must comfortably exceed the
       qdisk timeout - confirm and tune those alongside interval/tko. -->
  <quorumd interval="3" label="qdisk" min_score="1" tko="9" votes="1">
    <!-- FIX: added -w1 (deadline). ping's -t sets the TTL, not a timeout,
         so without -w a single ping could hang well past the 5s heuristic
         interval; -t1 is kept to restrict replies to the one-hop gateway. -->
    <heuristic interval="5" program="ping -t1 -c1 -w1 192.168.98.1" score="1"/>
    <heuristic interval="5" program="ping -t1 -c1 -w1 192.168.146.1" score="1"/>
  </quorumd>
</cluster>
--
Linux-cluster mailing list
Linux-cluster@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/linux-cluster