On Fri, 2005-07-08 at 11:56 -0400, Lon Hohberger wrote:
> > Here is a snippet of my config file showing the resources and the nfs
> > service. Of the listed exports, the first one for users01 and all 3 for
> > iftscratch don't seem to get status checks, and thus never came up again.
>
> Ok, now we're getting somewhere... That definitely sounds like a bug.
> Want to take this to bugzilla?
>

Well, I was able to track it down: it's caused by the throttle on the monitor operations for resources. Basically, any time a shared resource is referenced more than once, it is only monitored for the first reference; the second and any later references never get monitored. This is because the time of the last check is tracked at the resource level, and if less time has elapsed since that check than the interval given by the monitor attribute, the monitor operation is not run. Since every reference shares the same resource, the check for the first reference updates the timestamp, so every later reference always appears to have been checked too recently.

So here's a patch that seems to fix it in my quick testing, but I'm not sure it's the best way to fix the bug. It copies the resource's action list to the resource_node when a resource is referenced, and then uses that copy of the action list when doing status checks.

Perhaps a better way would be to make a copy of the shared resource_t struct any time it's referenced, rather than using the same one for every resource_node_t. I'm willing to write up that patch if you think it's a better course of action.

Thanks,
Eric Kerin <eric@xxxxxxxxxxx>
Index: rgmanager/include/reslist.h =================================================================== RCS file: /cvs/cluster/cluster/rgmanager/include/reslist.h,v retrieving revision 1.8.2.3 diff -u -r1.8.2.3 reslist.h --- rgmanager/include/reslist.h 21 Mar 2005 22:01:30 -0000 1.8.2.3 +++ rgmanager/include/reslist.h 8 Jul 2005 19:29:15 -0000 @@ -126,6 +126,7 @@ resource_t *rn_resource; int rn_state; /* State of this instance of rn_resource */ int rn_flags; + resource_act_t * rn_actions; } resource_node_t; typedef struct _fod_node { Index: rgmanager/src/daemons/restree.c =================================================================== RCS file: /cvs/cluster/cluster/rgmanager/src/daemons/restree.c,v retrieving revision 1.10.2.4 diff -u -r1.10.2.4 restree.c --- rgmanager/src/daemons/restree.c 9 May 2005 20:19:19 -0000 1.10.2.4 +++ rgmanager/src/daemons/restree.c 8 Jul 2005 19:29:15 -0000 @@ -39,6 +39,10 @@ void * __attribute__((unused))ret, int op); void print_env(char **env); +/* XXX from reslist.c */ +void * act_dup(resource_act_t *acts); + + const char *res_ops[] = { "start", "stop", @@ -487,6 +491,7 @@ node->rn_parent = parent; node->rn_resource = curres; node->rn_state = RES_STOPPED; + node->rn_actions = (resource_act_t *)act_dup(curres->r_actions); curres->r_refs++; list_insert(tree, node); @@ -587,6 +592,9 @@ destroy_resource_tree(&(*tree)->rn_child); list_remove(tree, node); + if(node->rn_actions){ + free(node->rn_actions); + } free(node); } } @@ -742,29 +750,28 @@ int x = 0, idx = -1; int has_recover = 0; time_t delta = 0, now = 0; - resource_t *res = node->rn_resource; now = time(NULL); - for (; res->r_actions[x].ra_name; x++) { + for (; node->rn_actions[x].ra_name; x++) { if (!has_recover && - !strcmp(res->r_actions[x].ra_name, "recover")) { + !strcmp(node->rn_actions[x].ra_name, "recover")) { has_recover = 1; continue; } - if (strcmp(res->r_actions[x].ra_name, "status")) + if (strcmp(node->rn_actions[x].ra_name, "status")) continue; - delta = now - 
res->r_actions[x].ra_last; + delta = now - node->rn_actions[x].ra_last; /* Ok, it's a 'monitor' action. See if enough time has elapsed for a given type of monitoring action */ - if (delta < res->r_actions[x].ra_interval) + if (delta < node->rn_actions[x].ra_interval) continue; - + if (idx == -1 || - res->r_actions[x].ra_depth > res->r_actions[idx].ra_depth) + node->rn_actions[x].ra_depth > node->rn_actions[idx].ra_depth) idx = x; } @@ -772,9 +779,9 @@ if (idx == -1) return 0; - res->r_actions[idx].ra_last = now; + node->rn_actions[idx].ra_last = now; if ((x = res_exec(node, RS_STATUS, - res->r_actions[idx].ra_depth)) == 0) + node->rn_actions[idx].ra_depth)) == 0) return 0; if (!has_recover) @@ -797,13 +804,13 @@ now = res->r_started; - for (; res->r_actions[x].ra_name; x++) { + for (; node->rn_actions[x].ra_name; x++) { - if (strcmp(res->r_actions[x].ra_name, "monitor") && - strcmp(res->r_actions[x].ra_name, "status")) + if (strcmp(node->rn_actions[x].ra_name, "monitor") && + strcmp(node->rn_actions[x].ra_name, "status")) continue; - res->r_actions[x].ra_last = now; + node->rn_actions[x].ra_last = now; } }
-- Linux-cluster@xxxxxxxxxx http://www.redhat.com/mailman/listinfo/linux-cluster