Re: Corosync has quorum when cluster is not formed

kandeshvari,
please make sure that localhost (lo) is not blocked. When it is, corosync
cannot form a single-node cluster and the behavior is undefined (and mostly
incorrect).
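
A quick way to verify this (a sketch, assuming iptables is the firewall
in use) is to list the INPUT chain and make sure loopback traffic is
accepted before any REJECT/DROP rule:

# iptables -L INPUT -nv --line-numbers
# iptables -I INPUT 1 -i lo -j ACCEPT

The first command shows the rule order; the second inserts an ACCEPT
rule for lo at the top of the chain if one is missing.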

Regards,
  Honza

kandeshvari galupta wrote:
> Hi all!
> 
> I'm playing with the latest version of corosync on three nodes (ha01, ha02,
> ha03). All three nodes are in two redundant rings (see config below). I
> simulate a network failure on node ha03 by adding a firewall rule:
> 
> [root@ha03 ~]# iptables -A INPUT -j REJECT
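
(Side note on the rule above: "-j REJECT" with no interface match also
rejects traffic on lo, which is exactly the loopback problem described
at the top of this mail. A variant that simulates the network failure
but leaves loopback open, assuming iptables, would be:

# iptables -A INPUT ! -i lo -j REJECT

With lo reachable, corosync on ha03 can still form a single-node
membership and should then report itself as inquorate.)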
> 
> As expected, nodes ha01 and ha02 lost their connections to ha03, but quorum
> is still present:
> 
> [root@ha01 ~]# corosync-quorumtool
> Quorum information
> ------------------
> Date:             Fri Oct  4 18:21:56 2013
> Quorum provider:  corosync_votequorum
> Nodes:            2
> Node ID:          1
> Ring ID:          30160
> Quorate:          Yes
> 
> Votequorum information
> ----------------------
> Expected votes:   3
> Highest expected: 3
> Total votes:      2
> Quorum:           2
> Flags:            Quorate
> 
> Membership information
> ----------------------
>     Nodeid      Votes Name
>          1          1 10.0.0.29 (local)
>          2          1 10.0.0.31
> 
> 
> BUT node ha03 also has quorum with all the other nodes!!!
> 
> [root@ha03 ~]# corosync-quorumtool
> Quorum information
> ------------------
> Date:             Fri Oct  4 18:06:57 2013
> Quorum provider:  corosync_votequorum
> Nodes:            3
> Node ID:          3
> Ring ID:          2220
> Quorate:          Yes
> 
> Votequorum information
> ----------------------
> Expected votes:   3
> Highest expected: 3
> Total votes:      3
> Quorum:           2
> Flags:            Quorate
> 
> Membership information
> ----------------------
>     Nodeid      Votes Name
>          1          1 10.0.0.29
>          2          1 10.0.0.31
>          3          1 10.0.0.32 (local)
> 
> 
> In the log files I see these lines:
> 
> Oct 04 18:06:01 [731] ha03 corosync warning [MAIN  ] Totem is unable to form a cluster because of an operating system or network fault. The most common cause of this message is that the local firewall is configured improperly.
> Oct 04 18:06:02 [731] ha03 corosync warning [MAIN  ] Totem is unable to form a cluster because of an operating system or network fault. The most common cause of this message is that the local firewall is configured improperly.
> Oct 04 18:06:04 [731] ha03 corosync warning [MAIN  ] Totem is unable to form a cluster because of an operating system or network fault. The most common cause of this message is that the local firewall is configured improperly.
> 
> 
> As I understand it, there is no cluster on node ha03, yet this node has
> quorum with the other nodes. How is this possible? Why didn't ha03 lose
> quorum? Is this expected behavior? Can someone explain this situation to me?
> 
> Thanks in advance
> 
> ==================[config & other logs]====================
> #  rpm -qa | grep corosync
> corosync-2.3.2-1.fc19.x86_64
> corosynclib-2.3.2-1.fc19.x86_64
> 
> # /etc/corosync/corosync.conf
> totem {
>     version: 2
>     crypto_cipher: none
>     crypto_hash: none
>     rrp_mode: passive
> 
>     interface {
>         ringnumber: 0
>         bindnetaddr: 10.0.0.0
>         mcastport: 5405
>         ttl: 1
>     }
>     interface {
>         ringnumber: 1
>         bindnetaddr: 192.168.20.0
>         mcastport: 5405
>         ttl: 1
>     }
>     transport: udpu
> }
> 
> logging {
>     fileline: off
>     to_logfile: yes
>     to_syslog: yes
>     logfile: /var/log/cluster/corosync.log
>     debug: off
>     timestamp: on
>     logger_subsys {
>         subsys: QUORUM
>         debug: off
>     }
> }
> 
> nodelist {
>     node {
>         ring0_addr: 10.0.0.31
>         ring1_addr: 192.168.20.31
>         nodeid: 2
>     }
>     node {
>         ring0_addr: 10.0.0.29
>         ring1_addr: 192.168.20.29
>         nodeid: 1
>     }
>     node {
>         ring0_addr: 10.0.0.32
>         ring1_addr: 192.168.20.32
>         nodeid: 3
>     }
> }
> 
> quorum {
>     provider: corosync_votequorum
> }
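
(For reference: with three one-vote nodes in the nodelist, votequorum
computes the quorum threshold as floor(total_votes / 2) + 1 =
floor(3 / 2) + 1 = 2, which matches the "Quorum: 2" lines in the
corosync-quorumtool output above.)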
> 
> [root@ha03 ~]# corosync-cmapctl
> internal_configuration.service.0.name (str) = corosync_cmap
> internal_configuration.service.0.ver (u32) = 0
> internal_configuration.service.1.name (str) = corosync_cfg
> internal_configuration.service.1.ver (u32) = 0
> internal_configuration.service.2.name (str) = corosync_cpg
> internal_configuration.service.2.ver (u32) = 0
> internal_configuration.service.3.name (str) = corosync_quorum
> internal_configuration.service.3.ver (u32) = 0
> internal_configuration.service.4.name (str) = corosync_pload
> internal_configuration.service.4.ver (u32) = 0
> internal_configuration.service.5.name (str) = corosync_votequorum
> internal_configuration.service.5.ver (u32) = 0
> logging.debug (str) = off
> logging.fileline (str) = off
> logging.logfile (str) = /var/log/cluster/corosync.log
> logging.logger_subsys.QUORUM.debug (str) = off
> logging.logger_subsys.QUORUM.subsys (str) = QUORUM
> logging.timestamp (str) = on
> logging.to_logfile (str) = yes
> logging.to_syslog (str) = yes
> nodelist.local_node_pos (u32) = 2
> nodelist.node.0.nodeid (u32) = 2
> nodelist.node.0.ring0_addr (str) = 10.0.0.31
> nodelist.node.0.ring1_addr (str) = 192.168.20.31
> nodelist.node.1.nodeid (u32) = 1
> nodelist.node.1.ring0_addr (str) = 10.0.0.29
> nodelist.node.1.ring1_addr (str) = 192.168.20.29
> nodelist.node.2.nodeid (u32) = 3
> nodelist.node.2.ring0_addr (str) = 10.0.0.32
> nodelist.node.2.ring1_addr (str) = 192.168.20.32
> quorum.provider (str) = corosync_votequorum
> runtime.blackbox.dump_flight_data (str) = no
> runtime.blackbox.dump_state (str) = no
> runtime.connections.active (u64) = 1
> runtime.connections.closed (u64) = 416
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.client_pid (u32) = 1019
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.dispatched (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.flow_control (u32) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.flow_control_count (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.invalid_request (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.name (str) = corosync-cmapct
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.overload (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.queue_size (u32) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.recv_retries (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.requests (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.responses (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.send_retries (u64) = 0
> runtime.connections.corosync-cmapct:1019:0x7fe8fb6ed320.service_id (u32) = 0
> runtime.services.cfg.0.rx (u64) = 0
> runtime.services.cfg.0.tx (u64) = 0
> runtime.services.cfg.1.rx (u64) = 0
> runtime.services.cfg.1.tx (u64) = 0
> runtime.services.cfg.2.rx (u64) = 0
> runtime.services.cfg.2.tx (u64) = 0
> runtime.services.cfg.3.rx (u64) = 0
> runtime.services.cfg.3.tx (u64) = 0
> runtime.services.cfg.service_id (u16) = 1
> runtime.services.cmap.0.rx (u64) = 3
> runtime.services.cmap.0.tx (u64) = 1
> runtime.services.cmap.service_id (u16) = 0
> runtime.services.cpg.0.rx (u64) = 0
> runtime.services.cpg.0.tx (u64) = 0
> runtime.services.cpg.1.rx (u64) = 0
> runtime.services.cpg.1.tx (u64) = 0
> runtime.services.cpg.2.rx (u64) = 0
> runtime.services.cpg.2.tx (u64) = 0
> runtime.services.cpg.3.rx (u64) = 0
> runtime.services.cpg.3.tx (u64) = 0
> runtime.services.cpg.4.rx (u64) = 0
> runtime.services.cpg.4.tx (u64) = 0
> runtime.services.cpg.5.rx (u64) = 3
> runtime.services.cpg.5.tx (u64) = 1
> runtime.services.cpg.service_id (u16) = 2
> runtime.services.pload.0.rx (u64) = 0
> runtime.services.pload.0.tx (u64) = 0
> runtime.services.pload.1.rx (u64) = 0
> runtime.services.pload.1.tx (u64) = 0
> runtime.services.pload.service_id (u16) = 4
> runtime.services.quorum.service_id (u16) = 3
> runtime.services.votequorum.0.rx (u64) = 7
> runtime.services.votequorum.0.tx (u64) = 2
> runtime.services.votequorum.1.rx (u64) = 0
> runtime.services.votequorum.1.tx (u64) = 0
> runtime.services.votequorum.2.rx (u64) = 0
> runtime.services.votequorum.2.tx (u64) = 0
> runtime.services.votequorum.3.rx (u64) = 0
> runtime.services.votequorum.3.tx (u64) = 0
> runtime.services.votequorum.service_id (u16) = 5
> runtime.totem.pg.mrp.rrp.0.faulty (u8) = 0
> runtime.totem.pg.mrp.rrp.1.faulty (u8) = 0
> runtime.totem.pg.mrp.srp.avg_backlog_calc (u32) = 0
> runtime.totem.pg.mrp.srp.avg_token_workload (u32) = 0
> runtime.totem.pg.mrp.srp.commit_entered (u64) = 2
> runtime.totem.pg.mrp.srp.commit_token_lost (u64) = 0
> runtime.totem.pg.mrp.srp.consensus_timeouts (u64) = 543
> runtime.totem.pg.mrp.srp.continuous_gather (u32) = 271
> runtime.totem.pg.mrp.srp.continuous_sendmsg_failures (u32) = 0
> runtime.totem.pg.mrp.srp.firewall_enabled_or_nic_failure (u8) = 1
> runtime.totem.pg.mrp.srp.gather_entered (u64) = 275
> runtime.totem.pg.mrp.srp.gather_token_lost (u64) = 271
> runtime.totem.pg.mrp.srp.mcast_retx (u64) = 0
> runtime.totem.pg.mrp.srp.mcast_rx (u64) = 17
> runtime.totem.pg.mrp.srp.mcast_tx (u64) = 7
> runtime.totem.pg.mrp.srp.memb_commit_token_rx (u64) = 6
> runtime.totem.pg.mrp.srp.memb_commit_token_tx (u64) = 6
> runtime.totem.pg.mrp.srp.memb_join_rx (u64) = 4
> runtime.totem.pg.mrp.srp.memb_join_tx (u64) = 7988
> runtime.totem.pg.mrp.srp.memb_merge_detect_rx (u64) = 1014
> runtime.totem.pg.mrp.srp.memb_merge_detect_tx (u64) = 0
> runtime.totem.pg.mrp.srp.members.1.config_version (u64) = 0
> runtime.totem.pg.mrp.srp.members.1.ip (str) = r(0) ip(10.0.0.29) r(1) ip(192.168.20.29)
> runtime.totem.pg.mrp.srp.members.1.join_count (u32) = 1
> runtime.totem.pg.mrp.srp.members.1.status (str) = joined
> runtime.totem.pg.mrp.srp.members.2.config_version (u64) = 0
> runtime.totem.pg.mrp.srp.members.2.ip (str) = r(0) ip(10.0.0.31) r(1) ip(192.168.20.31)
> runtime.totem.pg.mrp.srp.members.2.join_count (u32) = 1
> runtime.totem.pg.mrp.srp.members.2.status (str) = joined
> runtime.totem.pg.mrp.srp.members.3.config_version (u64) = 0
> runtime.totem.pg.mrp.srp.members.3.ip (str) = r(0) ip(10.0.0.32) r(1) ip(192.168.20.32)
> runtime.totem.pg.mrp.srp.members.3.join_count (u32) = 1
> runtime.totem.pg.mrp.srp.members.3.status (str) = joined
> runtime.totem.pg.mrp.srp.mtt_rx_token (u32) = 188
> runtime.totem.pg.mrp.srp.operational_entered (u64) = 2
> runtime.totem.pg.mrp.srp.operational_token_lost (u64) = 1
> runtime.totem.pg.mrp.srp.orf_token_rx (u64) = 2067
> runtime.totem.pg.mrp.srp.orf_token_tx (u64) = 1
> runtime.totem.pg.mrp.srp.recovery_entered (u64) = 2
> runtime.totem.pg.mrp.srp.recovery_token_lost (u64) = 0
> runtime.totem.pg.mrp.srp.rx_msg_dropped (u64) = 0
> runtime.totem.pg.mrp.srp.token_hold_cancel_rx (u64) = 0
> runtime.totem.pg.mrp.srp.token_hold_cancel_tx (u64) = 0
> runtime.totem.pg.msg_queue_avail (u32) = 0
> runtime.totem.pg.msg_reserved (u32) = 1
> runtime.votequorum.ev_barrier (u32) = 3
> runtime.votequorum.lowest_node_id (u32) = 1
> runtime.votequorum.this_node_id (u32) = 3
> runtime.votequorum.two_node (u8) = 0
> totem.crypto_cipher (str) = none
> totem.crypto_hash (str) = none
> totem.interface.0.bindnetaddr (str) = 10.0.0.0
> totem.interface.0.mcastport (u16) = 5405
> totem.interface.0.ttl (u8) = 1
> totem.interface.1.bindnetaddr (str) = 192.168.20.0
> totem.interface.1.mcastport (u16) = 5405
> totem.interface.1.ttl (u8) = 1
> totem.rrp_mode (str) = passive
> totem.transport (str) = udpu
> totem.version (u32) = 2
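
(Note: the runtime stats above already show the root cause, e.g.
runtime.totem.pg.mrp.srp.firewall_enabled_or_nic_failure = 1 and the
continuously growing continuous_gather counter: totem keeps trying to
gather a new membership and never succeeds. As a sketch, these keys can
be watched with:

# corosync-cmapctl | grep -E 'firewall_enabled|continuous_gather'

while reproducing the failure.)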
> 
> 
> 

_______________________________________________
discuss mailing list
discuss@xxxxxxxxxxxx
http://lists.corosync.org/mailman/listinfo/discuss



