Hi. The short story: it was a rush job, I had never done clustered file systems
before, and the vlan didn’t support multicast. Thus I ended up with drbd working
ok between the two servers but cman / gfs2 not working, so what was meant
to be a drbd primary/primary cluster ended up as a primary/secondary cluster, with gfs
mounted on only the one server, until the vlan could be fixed. I got the single
server working and left it for the contractor to do their bit. Two months down
the line, with a few other hiccups in the mix, I have a server that won’t mount the
gfs partition. Assuming that drbd hasn’t gotten confused and lost the
data on the drive, can this be fixed, and if so, how? Drbd is currently as follows: [root@mcvpsam01 init.d]# drbd-overview 1:r0 WFConnection Primary/Unknown UpToDate/DUnknown
C r---- Cman: [root@mcvpsam01 init.d]# /etc/init.d/cman status groupd is stopped gfs2 mount [root@mcvpsam01 init.d]# ./gfsmount.sh start Mounting gfs2 partition /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: can't connect to gfs_controld: Connection
refused /sbin/mount.gfs2: gfs_controld not running /sbin/mount.gfs2: error mounting lockproto lock_dlm [root@mcvpsam01 init.d]# And log/messages Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] The
consensus timeout expired. Feb 28 09:20:39 mcvpsam01 openais[3328]: [TOTEM] entering
GATHER state from 3. Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] The
consensus timeout expired. Feb 28 09:20:54 mcvpsam01 openais[3328]: [TOTEM] entering GATHER
state from 3. Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] The
consensus timeout expired. Feb 28 09:21:09 mcvpsam01 openais[3328]: [TOTEM] entering
GATHER state from 3. cluster.conf [root@mcvpsam01 init.d]# cat /etc/cluster/cluster.conf <?xml version="1.0"?> <cluster alias="cluster-setup"
config_version="1" name="cluster-setup"> <rm log_level="4"/> <fence_daemon clean_start="1"
post_fail_delay="0" post_join_delay="3"/> <clusternodes> <clusternode
name="mcvpsam01" nodeid="1" votes="1"> <fence> <method
name="2">
<device name="LastResortNode01"/> </method> </fence> </clusternode> <clusternode
name="drvpsam01" nodeid="2" votes="1"> <fence> <method
name="2">
<device name="LastResortNode02"/> </method> </fence> </clusternode> </clusternodes> <cman expected_votes="1"
two_node="1"/> <fencedevices> <fencedevice
agent="fence_manual" name="LastResortNode01"
nodename="mcvpsam01"/> <fencedevice
agent="fence_manual" name="LastResortNode02"
nodename="drvpsam01"/> </fencedevices> <rm/> <totem consensus="4800"
join="60" token="10000" token_retransmits_before_loss_const="20"/> </cluster> [root@mcvpsam01 init.d]# Drbd.conf [root@mcvpsam01 init.d]# cat /etc/drbd.conf resource r0 { protocol C; syncer { rate 1000M; } startup { wfc-timeout
120;
# wait 2min for other peers degr-wfc-timeout
120;
# wait 2min if peer was already
# down before this node was rebooted become-primary-on both; } net { allow-two-primaries; # cram-hmac-alg
"sha1";
# algo to enable peer authentication # shared-secret "123456"; # handle split-brain situations after-sb-0pri discard-least-changes;# if
no primary auto sync from the
# node that touched more blocks during
# the split brain situation. after-sb-1pri
discard-secondary; # if one primary after-sb-2pri
disconnect; # if
two primaries # solve the cases when the outcome # of the resync decision is incompatible # with the current role assignment in # the cluster rr-conflict
disconnect;
# no automatic resynchronization
# simply disconnect } disk { on-io-error
detach;
# detach the device from its
# backing storage if the driver of
# the lower_device reports an error
# to DRBD # fencing resource-and-stonith; } on mcvpsam01 { device
/dev/drbd1; disk /dev/sdb1; address
202.37.1.133:7789; meta-disk
internal; } on drvpsam01 { device
/dev/drbd1; disk /dev/sdb1; address
202.37.1.134:7789; meta-disk
internal; } } [root@mcvpsam01 init.d]# [root@mcvpsam01 init.d]# cat /etc/drbd.d/global_common.conf global { usage-count yes; # minor-count
dialog-refresh disable-ip-verification } common { protocol C; handlers {
pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh;
/usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ;
reboot -f";
pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh;
/usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ;
reboot -f";
local-io-error "/usr/lib/drbd/notify-io-error.sh;
/usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ;
halt -f";
# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
# before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15
-- -c 16k";
# after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh; } startup {
# wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb } disk {
# on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
# no-disk-drain no-md-flushes max-bio-bvecs } net {
# sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
# max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
# after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork } syncer {
# rate after al-extents use-rle cpu-mask verify-alg csums-alg } } [root@mcvpsam01 init.d]# Any ideas on how I can get the file system mounted to recover
the data? Thanks, Greg Machin |
_______________________________________________ CentOS mailing list CentOS@xxxxxxxxxx http://lists.centos.org/mailman/listinfo/centos