Re: [PATCH] Send one confchg event per CPG group to CPG client

Now that we use a skiplist instead of a hashtable, the new code no longer
needs the CPG_GROUPS_MAX macro.

I will remove CPG_GROUPS_MAX from this patch to reduce undesirable
impact on others.
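
For reference, here is a rough sketch of how the per-group collection can
look with a skiplist map instead of a fixed-size hashtable. The struct,
group names and the toy main() below are only illustrative, not the exact
code in the patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <qb/qbmap.h>

    struct confchg_data {
        char group[128];        /* key: CPG group name */
        int left_list_entries;  /* number of left-list entries collected */
    };

    int main(void)
    {
        /* a skiplist grows on demand, so no CPG_GROUPS_MAX-style bound is needed */
        qb_map_t *group_map = qb_skiplist_create();
        const char *groups[] = { "vdi", "vdi", "cluster" };
        qb_map_iter_t *miter;
        struct confchg_data *pcd;
        const char *key;
        int i;

        /* collect one entry per group, extending it when the group is already present */
        for (i = 0; i < 3; i++) {
            pcd = qb_map_get(group_map, groups[i]);
            if (pcd == NULL) {
                pcd = calloc(1, sizeof(*pcd));
                strncpy(pcd->group, groups[i], sizeof(pcd->group) - 1);
                qb_map_put(group_map, pcd->group, pcd);
            }
            pcd->left_list_entries++;
        }

        /* one pass over the map -> one event per group */
        miter = qb_map_iter_create(group_map);
        while ((key = qb_map_iter_next(miter, (void **)&pcd)) != NULL) {
            printf("group %s: %d left-list entries\n", key, pcd->left_list_entries);
            free(pcd);
        }
        qb_map_iter_free(miter);

        qb_map_destroy(group_map);
        return 0;
    }

In the real code the value stored in the map would hold the per-group
left_list that is later passed to notify_lib_joinlist(), but the map and
iterator usage is the same.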

On Fri, Oct 28, 2011 at 2:22 PM, Yunkai Zhang <qiushu.zyk@xxxxxxxxxx> wrote:
> Thanks, Steven.
> I will update this patch to reflect our discussion after I complete testing.
>
> On Fri, Oct 28, 2011 at 1:57 PM, Steven Dake <sdake@xxxxxxxxxx> wrote:
>> On 10/27/2011 07:51 PM, Yunkai Zhang wrote:
>>> On Fri, Oct 28, 2011 at 9:23 AM, Angus Salkeld <asalkeld@xxxxxxxxxx> wrote:
>>>> Thanks for the patch! Some comments inline.
>>>>
>>>> -Angus
>>>>
>>>> On Fri, Oct 28, 2011 at 12:22:13AM +0800, Yunkai Zhang wrote:
>>>>> We found that sheepdog will receive more than one confchg msg when a
>>>>> network partition occurs.  For example, suppose the cluster has 4
>>>>> nodes: N1, N2, N3, N4, and they initially form a single ring. After a
>>>>> while, a network partition occurs and the single ring divides into two
>>>>> sub-rings: ring(N1, N2, N3) and ring(N4). The sheepdog daemon in
>>>>> ring(N4) will receive the following confchg messages in turn:
>>>>> Memb: N2,N3,N4  Left:N1         Joined:null
>>>>> memb: N3,N4     Left:N2         Joined:null
>>>>> memb: N4        Left:N3         Joined:null
>>>>>
>>>>> This patch fixes this bug, and the client will only receive one
>>>>> confchg event in this case:
>>>>> memb: N4        Left:N1,N2,N3   Joined:null
>>>>>
>>>>> Signed-off-by: Yunkai Zhang <qiushu.zyk@xxxxxxxxxx>
>>>>> ---
>>>>>  configure.ac           |    2 +-
>>>>>  include/corosync/cpg.h |    1 +
>>>>>  services/cpg.c         |   97 +++++++++++++++++++++++++++++++++++++++++-------
>>>>>  3 files changed, 85 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/configure.ac b/configure.ac
>>>>> index 563f799..aaed478 100644
>>>>> --- a/configure.ac
>>>>> +++ b/configure.ac
>>>>> @@ -88,7 +88,7 @@ AC_HEADER_SYS_WAIT
>>>>>  AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \
>>>>>                 stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \
>>>>>                 sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \
>>>>> -               sys/sockio.h utmpx.h])
>>>>> +               sys/sockio.h utmpx.h search.h])
>>>>
>>>> Why this? It's not included anywhere in corosync.
>>>>
>>>
>>> Sorry, I forgot to clean that up.
>>>
>>> I wrote the first version of this patch based on v1.4.1, which does not
>>> provide libqb, so I used the hash table API from search.h instead of
>>> qbmap.h.
>>>
>>>>>
>>>>>  # Checks for typedefs, structures, and compiler characteristics.
>>>>>  AC_C_CONST
>>>>> diff --git a/include/corosync/cpg.h b/include/corosync/cpg.h
>>>>> index f7956fa..9861c5a 100644
>>>>> --- a/include/corosync/cpg.h
>>>>> +++ b/include/corosync/cpg.h
>>>>> @@ -95,6 +95,7 @@ struct cpg_name {
>>>>>  };
>>>>>
>>>>>  #define CPG_MEMBERS_MAX 128
>>>>> +#define CPG_GROUPS_MAX 64
>>>>
>>
>> Go with 8192 groups - we have some use cases where many groups are
>> needed - memory consumption shouldn't be much of an issue.
>>
>>>> I think this is ok, though not 100% sure.
>>>>
>>>>>
>>>>>  struct cpg_iteration_description_t {
>>>>>       struct cpg_name group;
>>>>> diff --git a/services/cpg.c b/services/cpg.c
>>>>> index caec097..dc72c84 100644
>>>>> --- a/services/cpg.c
>>>>> +++ b/services/cpg.c
>>>>> @@ -55,6 +55,7 @@
>>>>>  #include <netinet/in.h>
>>>>>  #include <arpa/inet.h>
>>>>>  #include <sys/mman.h>
>>>>> +#include <qb/qbmap.h>
>>>>>
>>>>>  #include <corosync/corotypes.h>
>>>>>  #include <qb/qbipc_common.h>
>>>>> @@ -705,7 +706,6 @@ static int notify_lib_joinlist(
>>>>>               for (iter = cpg_pd_list_head.next; iter != &cpg_pd_list_head; iter = iter->next) {
>>>>>                       struct cpg_pd *cpd = list_entry (iter, struct cpg_pd, list);
>>>>>                       if (mar_name_compare (&cpd->group_name, group_name) == 0) {
>>>>> -                             assert (left_list_entries <= 1);
>>>>>                               assert (joined_list_entries <= 1);
>>>>>                               if (joined_list_entries) {
>>>>>                                       if (joined_list[0].pid == cpd->pid &&
>>>>> @@ -798,8 +798,18 @@ static void downlist_master_choose_and_send (void)
>>>>>  {
>>>>>       struct downlist_msg *stored_msg;
>>>>>       struct list_head *iter;
>>>>> -     mar_cpg_address_t left_list;
>>>>> -     int i;
>>>>> +     struct process_info *left_pi;
>>>>> +     struct qb_map_t *group_htab;
>>>>> +     struct cpg_name cpg_group;
>>>>> +     mar_cpg_name_t group;
>>>>> +     struct confchg_data{
>>>>> +             struct cpg_name cpg_group;
>>>>> +             mar_cpg_address_t left_list[CPG_MEMBERS_MAX];
>>>>> +             int left_list_entries;
>>>>> +             struct list_head  list;
>>>>> +     } *pcd;
>>>>> +     DECLARE_LIST_INIT(confchg_data_list_head);
>>>>> +     int i, size;
>>>>
>>>> Why the list? You could use qb_skiplist_create() and use the
>>>> iterator.
>>>>
>>>> miter = qb_map_iter_create(group_map);
>>>> while ((p = qb_map_iter_next(miter, (void**)&pcd))) {
>>>>  ...
>>>> }
>>>>
>>>> Though using libqb here is going to make it impossible to cherry-pick
>>>> the patch into the flatiron branch (Steven/Honza, would this be needed
>>>> there?).
>>>>
>>>
>>
>> Agree, use a skiplist.
>>
>>> I worry about the same thing for my project, which uses v1.4.1 now.
>>>
>>> Could Steven or Honza give some advice?
>>>
>>
>> For an invasive patch like this, what would be ideal is a master version
>> and a separate patch for flatiron.  Typically the branch maintainer
>> (Honza) would cherry-pick simple bug fixes, but this is major rework and
>> warrants a separate patch.  It must go into master prior to flatiron to
>> avoid behavioural regressions when moving forward in versions.
>>
>>>>>
>>>>>       downlist_state = CPG_DOWNLIST_APPLYING;
>>>>>
>>>>> @@ -810,27 +820,86 @@ static void downlist_master_choose_and_send (void)
>>>>>       }
>>>>>       downlist_log("chosen downlist", stored_msg);
>>>>>
>>>>> -     /* send events */
>>>>> +     group_htab = qb_hashtable_create(CPG_GROUPS_MAX);
>>>>> +
>>>>> +     /*
>>>>> +      * Only the cpg groups that the left nodes belonged to should
>>>>> +      * receive a confchg event, so we collect these groups and their
>>>>> +      * left_lists here.
>>>>> +      */
>>>>>       for (iter = process_info_list_head.next; iter != &process_info_list_head; ) {
>>>>>               struct process_info *pi = list_entry(iter, struct process_info, list);
>>>>>               iter = iter->next;
>>>>>
>>>>> +             left_pi = NULL;
>>>>>               for (i = 0; i < stored_msg->left_nodes; i++) {
>>>>> +
>>>>>                       if (pi->nodeid == stored_msg->nodeids[i]) {
>>>>> -                             left_list.nodeid = pi->nodeid;
>>>>> -                             left_list.pid = pi->pid;
>>>>> -                             left_list.reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> -
>>>>> -                             notify_lib_joinlist(&pi->group, NULL,
>>>>> -                                     0, NULL,
>>>>> -                                     1, &left_list,
>>>>> -                                     MESSAGE_RES_CPG_CONFCHG_CALLBACK);
>>>>> -                             list_del (&pi->list);
>>>>> -                             free (pi);
>>>>> +                             left_pi = pi;
>>>>>                               break;
>>>>>                       }
>>>>>               }
>>>>> +
>>>>> +             if (left_pi) {
>>>>> +                     marshall_from_mar_cpg_name_t(&cpg_group, &left_pi->group);
>>>>> +                     cpg_group.value[cpg_group.length] = 0;
>>>>> +                     log_printf (LOG_DEBUG, "cpg group name:%s", cpg_group.value);
>>>>> +
>>>>> +                     pcd = (struct confchg_data *)qb_map_get(group_htab,
>>>>> +                                     cpg_group.value);
>>>>> +                     if (pcd != NULL) {
>>>>> +                             size = pcd->left_list_entries;
>>>>> +                             pcd->left_list[size].nodeid = left_pi->nodeid;
>>>>> +                             pcd->left_list[size].pid = left_pi->pid;
>>>>> +                             pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> +                             pcd->left_list_entries++;
>>>>> +                     }else {
>>>>> +                             pcd = (struct confchg_data *)malloc(sizeof(struct confchg_data));
>>>>> +                             memset(pcd, 0, sizeof(struct confchg_data));
>>>>> +
>>>>> +                             memcpy(&pcd->cpg_group, &cpg_group, sizeof(struct cpg_name));
>>>>> +                             pcd->left_list[0].nodeid = left_pi->nodeid;
>>>>> +                             pcd->left_list[0].pid = left_pi->pid;
>>>>> +                             pcd->left_list[0].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>> +                             pcd->left_list_entries = 1;
>>>>> +
>>>>> +                             qb_map_put(group_htab, pcd->cpg_group.value, pcd);
>>>>> +
>>>>> +                             list_init(&pcd->list);
>>>>> +                             list_add(&pcd->list, &confchg_data_list_head);
>>>>> +                     }
>>>>
>>>> This could be a bit simplified:
>>>>
>>>>                        pcd = (struct confchg_data *)qb_map_get(group_map,
>>>>                                        cpg_group.value);
>>>>                        if (pcd == NULL) {
>>>>                                pcd = (struct confchg_data *)calloc(1, sizeof(struct confchg_data));
>>>>                                memcpy(&pcd->cpg_group, &cpg_group, sizeof(struct cpg_name));
>>>>                                qb_map_put(group_map, pcd->cpg_group.value, pcd);
>>>>                        }
>>>>                        size = pcd->left_list_entries;
>>>>                        pcd->left_list[size].nodeid = left_pi->nodeid;
>>>>                        pcd->left_list[size].pid = left_pi->pid;
>>>>                        pcd->left_list[size].reason = CONFCHG_CPG_REASON_NODEDOWN;
>>>>                        pcd->left_list_entries++;
>>>>                        list_del (&left_pi->list);
>>>>                        free (left_pi);
>>>
>>> Thanks, it looks cool!
>>>
>>>>
>>>>> +
>>>>> +                     list_del (&left_pi->list);
>>>>> +                     free (left_pi);
>>>>> +             }
>>>>> +     }
>>>>> +
>>>>> +     /* send only one confchg event per cpg group */
>>>>> +     for (iter = confchg_data_list_head.next; iter != &confchg_data_list_head; ) {
>>>>> +             pcd = list_entry(iter, struct confchg_data, list);
>>>>> +             iter = iter->next;
>>>>> +
>>>>> +             marshall_to_mar_cpg_name_t(&group, &pcd->cpg_group);
>>>>> +
>>>>> +             log_printf (LOG_DEBUG, "left_list_entries:%d", pcd->left_list_entries);
>>>>> +             for (i=0; i<pcd->left_list_entries; i++) {
>>>>> +                     log_printf (LOG_DEBUG, "left_list[%d] group:%s, ip:%s, pid:%d",
>>>>> +                             i, pcd->cpg_group.value,
>>>>> +                             (char*)api->totem_ifaces_print(pcd->left_list[i].nodeid),
>>>>> +                             pcd->left_list[i].pid);
>>>>> +             }
>>>>> +
>>>>> +             /* send confchg event */
>>>>> +             notify_lib_joinlist(&group, NULL,
>>>>> +                     0, NULL,
>>>>> +                     pcd->left_list_entries,
>>>>> +                     pcd->left_list,
>>>>> +                     MESSAGE_RES_CPG_CONFCHG_CALLBACK);
>>>>> +
>>>>> +             free(pcd);
>>>>>       }
>>>>> +
>>>>> +     qb_map_destroy(group_htab);
>>>>>  }
>>>>>
>>>>>  static void downlist_messages_delete (void)
>>>>> --
>>>>> 1.7.6.4
>>
>
>
>
> --
> Yunkai Zhang
> Work at Taobao
>



-- 
Yunkai Zhang
Work at Taobao
_______________________________________________
discuss mailing list
discuss@xxxxxxxxxxxx
http://lists.corosync.org/mailman/listinfo/discuss


