After we use a skiplist instead of a hashtable, the new code no longer
needs the CPG_GROUPS_MAX macro. I will remove CPG_GROUPS_MAX from this
patch to reduce undesirable impact on others.
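Concretely, the only part of the map code that changes is the creation
call: qb_hashtable_create() needs a bucket count up front, which was the
only reason the macro existed, while qb_skiplist_create() sizes itself on
demand. A minimal sketch against libqb's qbmap API (group_map is my name
here; the patch below still calls it group_htab):

    #include <qb/qbmap.h>

    qb_map_t *group_map;

    /* v1: a hashtable must be sized up front, hence CPG_GROUPS_MAX */
    group_map = qb_hashtable_create(CPG_GROUPS_MAX);

    /* v2: a skiplist grows on demand, so the macro can be dropped */
    group_map = qb_skiplist_create();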
On Fri, Oct 28, 2011 at 2:22 PM, Yunkai Zhang <qiushu.zyk@xxxxxxxxxx> wrote:
> Thanks, Steven.
> I will update this patch to reflect our discussion after I complete testing.
>
> On Fri, Oct 28, 2011 at 1:57 PM, Steven Dake <sdake@xxxxxxxxxx> wrote:
>> On 10/27/2011 07:51 PM, Yunkai Zhang wrote:
>>> On Fri, Oct 28, 2011 at 9:23 AM, Angus Salkeld <asalkeld@xxxxxxxxxx> wrote:
>>>> Thanks for the patch! Some comments inline.
>>>>
>>>> -Angus
>>>>
>>>> On Fri, Oct 28, 2011 at 12:22:13AM +0800, Yunkai Zhang wrote:
>>>>> We found that sheepdog will receive more than one confchg message
>>>>> when a network partition occurs. For example, suppose the cluster
>>>>> has 4 nodes: N1, N2, N3, N4, which initially form a single ring.
>>>>> After a while, a network partition occurs and the single ring splits
>>>>> into two sub-rings: ring(N1, N2, N3) and ring(N4). The sheepdog in
>>>>> ring(N4) will receive the following confchg messages in turn:
>>>>>   memb: N2,N3,N4   Left: N1        Joined: null
>>>>>   memb: N3,N4      Left: N2        Joined: null
>>>>>   memb: N4         Left: N3        Joined: null
>>>>>
>>>>> This patch fixes the bug; the client will receive only one confchg
>>>>> event in this case:
>>>>>   memb: N4         Left: N1,N2,N3  Joined: null
>>>>>
>>>>> Signed-off-by: Yunkai Zhang <qiushu.zyk@xxxxxxxxxx>
>>>>> ---
>>>>>  configure.ac           |    2 +-
>>>>>  include/corosync/cpg.h |    1 +
>>>>>  services/cpg.c         |   97 +++++++++++++++++++++++++++++++++++++++++-------
>>>>>  3 files changed, 85 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/configure.ac b/configure.ac
>>>>> index 563f799..aaed478 100644
>>>>> --- a/configure.ac
>>>>> +++ b/configure.ac
>>>>> @@ -88,7 +88,7 @@ AC_HEADER_SYS_WAIT
>>>>>  AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \
>>>>>                   stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \
>>>>>                   sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \
>>>>> -                 sys/sockio.h utmpx.h])
>>>>> +                 sys/sockio.h utmpx.h search.h])
>>>>
>>>> Why this? It's not included anywhere in corosync.
>>>>
>>>
>>> Sorry, I forgot to remove it.
>>>
>>> I wrote the first version of this patch based on v1.4.1, which does
>>> not provide libqb, so I used the hash table API from search.h instead
>>> of qbmap.h.
>>>
>>>>>
>>>>>  # Checks for typedefs, structures, and compiler characteristics.
>>>>>  AC_C_CONST
>>>>> diff --git a/include/corosync/cpg.h b/include/corosync/cpg.h
>>>>> index f7956fa..9861c5a 100644
>>>>> --- a/include/corosync/cpg.h
>>>>> +++ b/include/corosync/cpg.h
>>>>> @@ -95,6 +95,7 @@ struct cpg_name {
>>>>>  };
>>>>>
>>>>>  #define CPG_MEMBERS_MAX 128
>>>>> +#define CPG_GROUPS_MAX 64
>>>>
>>
>> Go with 8192 groups - we have some use cases where many groups are
>> needed - memory consumption shouldn't be much of an issue.
>>
>>>> I think this is ok, though not 100% sure.
>>>>
>>>>>
>>>>>  struct cpg_iteration_description_t {
>>>>>      struct cpg_name group;
>>>>> diff --git a/services/cpg.c b/services/cpg.c
>>>>> index caec097..dc72c84 100644
>>>>> --- a/services/cpg.c
>>>>> +++ b/services/cpg.c
>>>>> @@ -55,6 +55,7 @@
>>>>>  #include <netinet/in.h>
>>>>>  #include <arpa/inet.h>
>>>>>  #include <sys/mman.h>
>>>>> +#include <qb/qbmap.h>
>>>>>
>>>>>  #include <corosync/corotypes.h>
>>>>>  #include <qb/qbipc_common.h>
>>>>> @@ -705,7 +706,6 @@ static int notify_lib_joinlist(
>>>>>      for (iter = cpg_pd_list_head.next; iter != &cpg_pd_list_head; iter = iter->next) {
>>>>>          struct cpg_pd *cpd = list_entry (iter, struct cpg_pd, list);
>>>>>          if (mar_name_compare (&cpd->group_name, group_name) == 0) {
>>>>> -            assert (left_list_entries <= 1);
>>>>>              assert (joined_list_entries <= 1);
>>>>>              if (joined_list_entries) {
>>>>>                  if (joined_list[0].pid == cpd->pid &&
>>>>> @@ -798,8 +798,18 @@ static void downlist_master_choose_and_send (void)
>>>>>  {
>>>>>      struct downlist_msg *stored_msg;
>>>>>      struct list_head *iter;
>>>>> -    mar_cpg_address_t left_list;
>>>>> -    int i;
>>>>> +    struct process_info *left_pi;
>>>>> +    struct qb_map_t *group_htab;
>>>>> +    struct cpg_name cpg_group;
>>>>> +    mar_cpg_name_t group;
>>>>> +    struct confchg_data{
>>>>> +        struct cpg_name cpg_group;
>>>>> +        mar_cpg_address_t left_list[CPG_MEMBERS_MAX];
>>>>> +        int left_list_entries;
>>>>> +        struct list_head list;
>>>>> +    } *pcd;
>>>>> +    DECLARE_LIST_INIT(confchg_data_list_head);
>>>>> +    int i, size;
>>>>
>>>> Why the list? You could use qb_skiplist_create() and use the
>>>> iterator.
>>>>
>>>>     miter = qb_map_iter_create(group_map);
>>>>     while ((p = qb_map_iter_next(miter, (void**)&pcd))) {
>>>>         ...
>>>>     }
>>>>
>>>> Though using libqb here is going to make it impossible to cherry-pick
>>>> the patch into the flatiron branch (Steven/Honza, would this be needed
>>>> there?).
>>>>
>>>
>>
>> Agree - use a skiplist.
>>
>>> I worry about the same thing for my project, which uses v1.4.1 now.
>>>
>>> Could Steven or Honza give some advice?
>>>
>>
>> For an invasive patch like this, what would be ideal is a master version
>> and a separate patch for flatiron. Typically the branch maintainer
>> (Honza) would cherry-pick simple bug fixes, but this is major rework and
>> warrants a separate patch. It must go into master prior to flatiron to
>> avoid behavioural regressions when moving forward in versions.
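To make the skiplist version concrete: with the map iterator, the extra
confchg_data list (and DECLARE_LIST_INIT) can go away entirely, and the
send phase becomes a plain walk over the map. An untested sketch expanding
Angus's suggestion - I am assuming libqb also provides qb_map_iter_free();
the other names match the patch below, with group_map standing in for
group_htab:

    qb_map_iter_t *miter;
    struct confchg_data *pcd;
    mar_cpg_name_t group;
    const char *key;

    miter = qb_map_iter_create(group_map);
    while ((key = qb_map_iter_next(miter, (void **)&pcd)) != NULL) {
        /* one confchg event per cpg group */
        marshall_to_mar_cpg_name_t(&group, &pcd->cpg_group);
        notify_lib_joinlist(&group, NULL,
            0, NULL,
            pcd->left_list_entries, pcd->left_list,
            MESSAGE_RES_CPG_CONFCHG_CALLBACK);
        free(pcd);
    }
    qb_map_iter_free(miter);
    qb_map_destroy(group_map);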
>>
>>>>>
>>>>>      downlist_state = CPG_DOWNLIST_APPLYING;
>>>>>
>>>>> @@ -810,27 +820,86 @@ static void downlist_master_choose_and_send (void)
>>>>>      }
>>>>>      downlist_log("chosen downlist", stored_msg);
>>>>>
>>>>> -    /* send events */
>>>>> +    group_htab = qb_hashtable_create(CPG_GROUPS_MAX);
>>>>> +
>>>>> +    /*
>>>>> +     * only the cpg groups with processes on the left nodes should
>>>>> +     * receive a confchg event, so we collect these cpg groups and
>>>>> +     * their left_lists here
>>>>> +     */
>>>>>      for (iter = process_info_list_head.next; iter != &process_info_list_head; ) {
>>>>>          struct process_info *pi = list_entry(iter, struct process_info, list);
>>>>>          iter = iter->next;
>>>>>
>>>>> +        left_pi = NULL;
>>>>>          for (i = 0; i < stored_msg->left_nodes; i++) {
>>>>> +
>>>>>              if (pi->nodeid == stored_msg->nodeids[i]) {
>>>>> -                left_list.nodeid = pi->nodeid;
>>>>> -                left_list.pid = pi->pid;
>>>>> -                left_list.reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> -
>>>>> -                notify_lib_joinlist(&pi->group, NULL,
>>>>> -                    0, NULL,
>>>>> -                    1, &left_list,
>>>>> -                    MESSAGE_RES_CPG_CONFCHG_CALLBACK);
>>>>> -                list_del (&pi->list);
>>>>> -                free (pi);
>>>>> +                left_pi = pi;
>>>>>                  break;
>>>>>              }
>>>>>          }
>>>>> +
>>>>> +        if (left_pi) {
>>>>> +            marshall_from_mar_cpg_name_t(&cpg_group, &left_pi->group);
>>>>> +            cpg_group.value[cpg_group.length] = 0;
>>>>> +            log_printf (LOG_DEBUG, "cpg group name:%s", cpg_group.value);
>>>>> +
>>>>> +            pcd = (struct confchg_data *)qb_map_get(group_htab,
>>>>> +                cpg_group.value);
>>>>> +            if (pcd != NULL) {
>>>>> +                size = pcd->left_list_entries;
>>>>> +                pcd->left_list[size].nodeid = left_pi->nodeid;
>>>>> +                pcd->left_list[size].pid = left_pi->pid;
>>>>> +                pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> +                pcd->left_list_entries++;
>>>>> +            } else {
>>>>> +                pcd = (struct confchg_data *)malloc(sizeof(struct confchg_data));
>>>>> +                memset(pcd, 0, sizeof(struct confchg_data));
>>>>> +
>>>>> +                memcpy(&pcd->cpg_group, &cpg_group, sizeof(struct cpg_name));
>>>>> +                pcd->left_list[0].nodeid = left_pi->nodeid;
>>>>> +                pcd->left_list[0].pid = left_pi->pid;
>>>>> +                pcd->left_list[0].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> +                pcd->left_list_entries = 1;
>>>>> +
>>>>> +                qb_map_put(group_htab, pcd->cpg_group.value, pcd);
>>>>> +
>>>>> +                list_init(&pcd->list);
>>>>> +                list_add(&pcd->list, &confchg_data_list_head);
>>>>> +            }
>>>>
>>>> This could be a bit simplified:
>>>>
>>>>     pcd = (struct confchg_data *)qb_map_get(group_map,
>>>>                                             cpg_group.value);
>>>>     if (pcd == NULL) {
>>>>         pcd = (struct confchg_data *)calloc(1, sizeof(struct confchg_data));
>>>>         memcpy(&pcd->cpg_group, &cpg_group, sizeof(struct cpg_name));
>>>>         qb_map_put(group_map, pcd->cpg_group.value, pcd);
>>>>     }
>>>>     size = pcd->left_list_entries;
>>>>     pcd->left_list[size].nodeid = left_pi->nodeid;
>>>>     pcd->left_list[size].pid = left_pi->pid;
>>>>     pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>     pcd->left_list_entries++;
>>>>     list_del (&left_pi->list);
>>>>     free (left_pi);
>>>
>>> Thanks, it looks cool!
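One detail I plan to add on top of that simplification: left_list is a
fixed array of CPG_MEMBERS_MAX entries, so the append should be guarded
against overflow when a very large group is on the downlist. A
hypothetical guard, not in the patch yet (reusing the LOG_DEBUG level this
file already uses):

    size = pcd->left_list_entries;
    if (size >= CPG_MEMBERS_MAX) {
        /* left_list[] can hold at most CPG_MEMBERS_MAX entries */
        log_printf (LOG_DEBUG, "left_list full for group %s, dropping entry",
            pcd->cpg_group.value);
    } else {
        pcd->left_list[size].nodeid = left_pi->nodeid;
        pcd->left_list[size].pid = left_pi->pid;
        pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN;
        pcd->left_list_entries++;
    }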
>>>
>>>>
>>>>> +
>>>>> +            list_del (&left_pi->list);
>>>>> +            free (left_pi);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    /* send only one confchg event per cpg group */
>>>>> +    for (iter = confchg_data_list_head.next; iter != &confchg_data_list_head; ) {
>>>>> +        pcd = list_entry(iter, struct confchg_data, list);
>>>>> +        iter = iter->next;
>>>>> +
>>>>> +        marshall_to_mar_cpg_name_t(&group, &pcd->cpg_group);
>>>>> +
>>>>> +        log_printf (LOG_DEBUG, "left_list_entries:%d", pcd->left_list_entries);
>>>>> +        for (i = 0; i < pcd->left_list_entries; i++) {
>>>>> +            log_printf (LOG_DEBUG, "left_list[%d] group:%s, ip:%s, pid:%d",
>>>>> +                i, pcd->cpg_group.value,
>>>>> +                (char*)api->totem_ifaces_print(pcd->left_list[i].nodeid),
>>>>> +                pcd->left_list[i].pid);
>>>>> +        }
>>>>> +
>>>>> +        /* send confchg event */
>>>>> +        notify_lib_joinlist(&group, NULL,
>>>>> +            0, NULL,
>>>>> +            pcd->left_list_entries,
>>>>> +            pcd->left_list,
>>>>> +            MESSAGE_RES_CPG_CONFCHG_CALLBACK);
>>>>> +
>>>>> +        free(pcd);
>>>>>      }
>>>>> +
>>>>> +    qb_map_destroy(group_htab);
>>>>>  }
>>>>>
>>>>>  static void downlist_messages_delete (void)
>>>>> --
>>>>> 1.7.6.4
>>>
>>>
>>
>>
>
>
> --
> Yunkai Zhang
> Work at Taobao

--
Yunkai Zhang
Work at Taobao

_______________________________________________
discuss mailing list
discuss@xxxxxxxxxxxx
http://lists.corosync.org/mailman/listinfo/discuss