This patch adds more flexibility to the auto_tie_breaker feature of
votequorum. With this, not only can the lowest nodeid be used as
a tie breaker, but also the highest, or a node from a nominated list.
If there is a list of nodes, the first node in the list that was part of
the cluster before the split decides which half stays quorate; listed nodes
that were not in the previous partition are skipped. This allows the user
to specify a preferred set of nodes but prevents a split-brain if the
cluster divides evenly with a node in each half.
Signed-Off-By: Christine Caulfield <ccaulfie@xxxxxxxxxx>
diff --git a/exec/votequorum.c b/exec/votequorum.c
index 1b6a4eb..3e4e485 100644
--- a/exec/votequorum.c
+++ b/exec/votequorum.c
@@ -78,8 +78,9 @@ static uint8_t two_node = 0;
static uint8_t wait_for_all = 0;
static uint8_t wait_for_all_status = 0;
-static uint8_t auto_tie_breaker = 0;
+/* Active auto_tie_breaker policy: disabled, lowest node id, highest node id, or a configured node list */
+static enum {ATB_NONE, ATB_LOWEST, ATB_HIGHEST, ATB_LIST} auto_tie_breaker = ATB_NONE;
static int lowest_node_id = -1;
+/* Highest node id as computed by get_highest_node_id(); -1 until first computed */
+static int highest_node_id = -1;
#define DEFAULT_LMS_WIN 10000
static uint8_t last_man_standing = 0;
@@ -197,7 +198,11 @@ static uint8_t cluster_is_quorate;
static struct cluster_node *us;
static struct list_head cluster_members_list;
static unsigned int quorum_members[PROCESSOR_COUNT_MAX];
+/* Copy of quorum_members taken in votequorum_sync_init before it is overwritten */
+static unsigned int previous_quorum_members[PROCESSOR_COUNT_MAX];
+/* Node ids parsed by parse_atb_string(), kept in the order they were configured */
+static unsigned int atb_nodelist[PROCESSOR_COUNT_MAX];
static int quorum_members_entries = 0;
+/* Number of valid entries in previous_quorum_members */
+static int previous_quorum_members_entries = 0;
+/* Number of valid entries in atb_nodelist */
+static int atb_nodelist_entries = 0;
static struct memb_ring_id quorum_ringid;
/*
@@ -537,6 +542,28 @@ static void get_lowest_node_id(void)
LEAVE();
}
+/*
+ * Work out the highest node id among ourselves and every node that is
+ * in NODESTATE_MEMBER on cluster_members_list, store it in
+ * highest_node_id and publish it in icmap as
+ * runtime.votequorum.highest_node_id.
+ */
+static void get_highest_node_id(void)
+{
+	struct list_head *iter;
+	struct cluster_node *member;
+
+	ENTER();
+
+	/* Seed with our own id; the walk below can only raise it */
+	highest_node_id = us->node_id;
+
+	list_iterate(iter, &cluster_members_list) {
+		member = list_entry(iter, struct cluster_node, list);
+		if (member->state == NODESTATE_MEMBER) {
+			if (member->node_id > highest_node_id) {
+				highest_node_id = member->node_id;
+			}
+		}
+	}
+
+	log_printf(LOGSYS_LEVEL_DEBUG, "highest node id: %d us: %d", highest_node_id, us->node_id);
+	icmap_set_uint32("runtime.votequorum.highest_node_id", highest_node_id);
+
+	LEAVE();
+}
+
static int check_low_node_id_partition(void)
{
struct cluster_node *node = NULL;
@@ -557,6 +584,151 @@ static int check_low_node_id_partition(void)
return found;
}
+/*
+ * Report whether the node whose id equals highest_node_id appears on
+ * cluster_members_list in state NODESTATE_MEMBER.
+ *
+ * Returns 1 if it does, 0 otherwise.
+ */
+static int check_high_node_id_partition(void)
+{
+	struct list_head *iter;
+	struct cluster_node *member;
+	int present = 0;
+
+	ENTER();
+
+	list_iterate(iter, &cluster_members_list) {
+		member = list_entry(iter, struct cluster_node, list);
+		if ((member->node_id == highest_node_id) &&
+		    (member->state == NODESTATE_MEMBER)) {
+			present = 1;
+		}
+	}
+
+	LEAVE();
+	return present;
+}
+
+/*
+ * Linear search for nodeid in the first 'entries' slots of 'members'.
+ *
+ * Returns 1 if nodeid is present, 0 if it is not (including when
+ * entries is zero or negative).
+ */
+static int is_in_nodelist(int nodeid, unsigned int *members, int entries)
+{
+	int idx;
+	int present = 0;
+
+	ENTER();
+
+	for (idx = 0; idx < entries; idx++) {
+		if (nodeid == members[idx]) {
+			present = 1;
+			break;
+		}
+	}
+
+	LEAVE();
+	return present;
+}
+
+/*
+ * Decide whether this partition wins the tie under the configured
+ * auto_tie_breaker policy. Returns 1 if it may be quorate, 0 if not.
+ *
+ * ATB_LOWEST and ATB_HIGHEST defer to check_low_node_id_partition()
+ * and check_high_node_id_partition() respectively.
+ *
+ * For a list of tie-breaker nodes the algorithm is:
+ * walk the auto_tie_breaker list in order until we reach a node that
+ * IS in our current partition. If any node earlier in the list was in
+ * the 'previous' partition we cannot be quorate: it is missing here
+ * and we don't know whether it is up or down. If none of them were,
+ * we are safe to be quorate. Finding the very first list node in our
+ * partition therefore always allows quorum, and having no list node
+ * in the current partition at all means we are NOT quorate.
+ */
+static int check_auto_tie_breaker(void)
+{
+	int cur, earlier;
+	int decision;
+
+	ENTER();
+
+	switch (auto_tie_breaker) {
+	case ATB_LOWEST:
+		decision = check_low_node_id_partition();
+		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LOWEST decision: %d", decision);
+		LEAVE();
+		return decision;
+	case ATB_HIGHEST:
+		decision = check_high_node_id_partition();
+		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_HIGHEST decision: %d", decision);
+		LEAVE();
+		return decision;
+	default:
+		/* Assume ATB_LIST, we should never be called for ATB_NONE */
+		break;
+	}
+
+	for (cur = 0; cur < atb_nodelist_entries; cur++) {
+		if (!is_in_nodelist(atb_nodelist[cur], quorum_members, quorum_members_entries)) {
+			continue;
+		}
+
+		/*
+		 * This list node is in our partition. Every node before it in
+		 * the list is absent from our partition (or we would have
+		 * stopped there), so if one of them was in the previous quorum
+		 * partition it might be in the 'other half' and we must not
+		 * claim quorum.
+		 */
+		for (earlier = 0; earlier < cur; earlier++) {
+			if (is_in_nodelist(atb_nodelist[earlier], previous_quorum_members, previous_quorum_members_entries)) {
+				log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found node %d in previous partition but not here, quorum denied", atb_nodelist[earlier]);
+				LEAVE();
+				return 0;
+			}
+		}
+
+		/*
+		 * No earlier list node was in the previous partition; if there
+		 * are enough votes, we can be quorate
+		 */
+		log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found node %d in current partition, we can be quorate", atb_nodelist[cur]);
+		LEAVE();
+		return 1;
+	}
+
+	log_printf(LOGSYS_LEVEL_DEBUG, "ATB_LIST found no list nodes in current partition, we cannot be quorate");
+	LEAVE();
+	return 0;
+}
+
+/*
+ * Parse the auto_tie_breaker_node setting and set auto_tie_breaker
+ * (and, for a list, atb_nodelist / atb_nodelist_entries) accordingly.
+ *
+ * atb_string can be either:
+ *   'lowest'
+ *   'highest'
+ *   a list of nodeids
+ *
+ * A list is read with strtol() and ends at the first token that does
+ * not parse to a non-zero number. At most PROCESSOR_COUNT_MAX ids are
+ * stored; any further ids are ignored with a warning so that
+ * atb_nodelist can never be overrun. If nothing valid is found,
+ * auto_tie_breaker is left at ATB_NONE and a warning is logged.
+ *
+ * The resulting type is published in icmap as
+ * runtime.votequorum.atb_type.
+ */
+static void parse_atb_string(char *atb_string)
+{
+	char *ptr;
+	long num;
+
+	ENTER();
+	auto_tie_breaker = ATB_NONE;
+
+	if (!strcmp(atb_string, "lowest")) {
+		auto_tie_breaker = ATB_LOWEST;
+	} else if (!strcmp(atb_string, "highest")) {
+		auto_tie_breaker = ATB_HIGHEST;
+	} else {
+		atb_nodelist_entries = 0;
+		ptr = atb_string;
+		do {
+			num = strtol(ptr, &ptr, 10);
+			if (num) {
+				/* Never write past the end of atb_nodelist */
+				if (atb_nodelist_entries >= PROCESSOR_COUNT_MAX) {
+					log_printf(LOGSYS_LEVEL_WARNING, "auto_tie_breaker_node: too many node IDs, extra entries are ignored");
+					break;
+				}
+				log_printf(LOGSYS_LEVEL_DEBUG, "ATB nodelist[%d] = %ld", atb_nodelist_entries, num);
+				atb_nodelist[atb_nodelist_entries++] = num;
+			}
+		} while (num);
+
+		if (atb_nodelist_entries) {
+			auto_tie_breaker = ATB_LIST;
+		}
+	}
+	icmap_set_uint32("runtime.votequorum.atb_type", auto_tie_breaker);
+	log_printf(LOGSYS_LEVEL_DEBUG, "ATB type = %d", auto_tie_breaker);
+
+	/* Make sure we got something */
+	if (auto_tie_breaker == ATB_NONE) {
+		log_printf(LOGSYS_LEVEL_WARNING, "auto_tie_breaker_node is not valid. It must be 'lowest', 'highest' or a space-separated list of node IDs. auto_tie_breaker is disabled");
+	}
+	LEAVE();
+}
+
static int check_qdevice_master(void)
{
struct cluster_node *node = NULL;
@@ -827,11 +999,12 @@ static void are_we_quorate(unsigned int total_votes)
} else {
quorate = 1;
get_lowest_node_id();
+ get_highest_node_id();
}
- if ((auto_tie_breaker) &&
+ if ((auto_tie_breaker != ATB_NONE) &&
(total_votes == (us->expected_votes / 2)) &&
- (check_low_node_id_partition() == 1)) {
+ (check_auto_tie_breaker() == 1)) {
quorate = 1;
}
@@ -1038,7 +1211,9 @@ static char *votequorum_readconfig(int runtime)
uint32_t node_votes = 0, qdevice_votes = 0;
uint32_t node_expected_votes = 0, expected_votes = 0;
uint32_t node_count = 0;
+ uint8_t atb;
int have_nodelist, have_qdevice;
+ char *atb_string;
char *error = NULL;
ENTER();
@@ -1098,10 +1273,24 @@ static char *votequorum_readconfig(int runtime)
icmap_get_uint8("quorum.allow_downscale", &allow_downscale);
icmap_get_uint8("quorum.wait_for_all", &wait_for_all);
- icmap_get_uint8("quorum.auto_tie_breaker", &auto_tie_breaker);
icmap_get_uint8("quorum.last_man_standing", &last_man_standing);
icmap_get_uint32("quorum.last_man_standing_window", &last_man_standing_window);
icmap_get_uint8("quorum.expected_votes_tracking", &ev_tracking);
+ icmap_get_uint8("quorum.auto_tie_breaker", &atb);
+ icmap_get_string("quorum.auto_tie_breaker_node", &atb_string);
+
+ if (!atb) {
+ auto_tie_breaker = ATB_NONE;
+ if (atb_string) {
+ log_printf(LOGSYS_LEVEL_WARNING,
+ "auto_tie_breaker_node: is meaningless if auto_tie_breaker is set to 0");
+ }
+ }
+
+ if (atb && atb_string) {
+ parse_atb_string(atb_string);
+ }
+ free(atb_string);
/* allow_downscale requires ev_tracking */
if (allow_downscale) {
@@ -1136,7 +1325,7 @@ static char *votequorum_readconfig(int runtime)
}
}
- if ((have_qdevice) && (auto_tie_breaker)) {
+ if ((have_qdevice) && (auto_tie_breaker != ATB_NONE)) {
if (!runtime) {
error = (char *)"configuration error: quorum.device is not compatible with auto_tie_breaker";
goto out;
@@ -2003,7 +2192,7 @@ static void votequorum_sync_init (
if (last_man_standing) {
if (((member_list_entries >= quorum) && (left_nodes)) ||
- ((member_list_entries <= quorum) && (auto_tie_breaker) && (check_low_node_id_partition() == 1))) {
+ ((member_list_entries <= quorum) && (auto_tie_breaker != ATB_NONE) && (check_low_node_id_partition() == 1))) {
if (last_man_standing_timer_set) {
corosync_api->timer_delete(last_man_standing_timer);
last_man_standing_timer_set = 0;
@@ -2015,6 +2204,9 @@ static void votequorum_sync_init (
}
}
+ memcpy(previous_quorum_members, quorum_members, sizeof(unsigned int) * quorum_members_entries);
+ previous_quorum_members_entries = quorum_members_entries;
+
memcpy(quorum_members, member_list, sizeof(unsigned int) * member_list_entries);
quorum_members_entries = member_list_entries;
memcpy(&quorum_ringid, ring_id, sizeof(*ring_id));
@@ -2205,7 +2397,7 @@ static void message_handler_req_lib_votequorum_getinfo (void *conn, const void *
if (last_man_standing) {
res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_LAST_MAN_STANDING;
}
- if (auto_tie_breaker) {
+ if (auto_tie_breaker != ATB_NONE) {
res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_AUTO_TIE_BREAKER;
}
if (allow_downscale) {
diff --git a/man/votequorum.5 b/man/votequorum.5
index ebd2852..7770fb0 100644
--- a/man/votequorum.5
+++ b/man/votequorum.5
@@ -256,9 +256,21 @@ The general behaviour of votequorum allows a simultaneous node failure up
to 50% - 1 node, assuming each node has 1 vote.
.PP
When ATB is enabled, the cluster can suffer up to 50% of the nodes failing
-at the same time, in a deterministic fashion. The cluster partition, or the
-set of nodes that are still in contact with the node that has the lowest
-nodeid will remain quorate. The other nodes will be inquorate.
+at the same time, in a deterministic fashion. By default the cluster
+partition, or the set of nodes that are still in contact with the
+node that has the lowest nodeid will remain quorate. The other nodes will
+be inquorate. This behaviour can be changed by also specifying
+.PP
+.B auto_tie_breaker_node: lowest|highest|<list of node IDs>
+.PP
+\&'lowest' is the default. 'highest' is similar: if the current set of
+nodes contains the highest nodeid then it will remain quorate. Alternatively
+it is possible to specify a particular node ID or list of node IDs that will
+be required to maintain quorum. If a (space-separated) list is given, the
+nodes are evaluated in order: if the first node is present then it will
+be used to determine the quorate partition; if that node is not in either
+half (i.e. was not in the cluster before the split) then the second node ID
+will be checked for, and so on.
.PP
Example configuration 1:
.nf
@@ -267,8 +279,19 @@ quorum {
provider: corosync_votequorum
expected_votes: 8
auto_tie_breaker: 1
+ auto_tie_breaker_node: lowest
}
-
+.fi
+.PP
+Example configuration 2:
+.nf
+quorum {
+ provider: corosync_votequorum
+ expected_votes: 8
+ auto_tie_breaker: 1
+ auto_tie_breaker_node: 1 3 5
+}
+.PP
.fi
.PP
.B allow_downscale: 1
_______________________________________________
discuss mailing list
discuss@xxxxxxxxxxxx
http://lists.corosync.org/mailman/listinfo/discuss