[Patch 1/3] tabled: Employ sesible start-up retries

Pete Zaitcev <zaitcev@xxxxxxxxxx> · Fri, 28 Aug 2009 20:41:53 -0600

With this patch, daemons that tabled needs may come up at any time
relatively to each other and tabled should be able to initialize itself.
To this end, we remove the extra "sleep" command, now unnecessary.
Unfortunately, one sleep is still needed for our build tests for
reasons unrelated to ability of tabled to bootstrap itself.

During testing, I added sleeps into various places and rearranged
the order of daemons to test how it all works. There are limitations,
certainly, but the system looks robust within reason.

Next step would be to process Chunk and CLD going down, not up.

Signed-off-by: Pete Zaitcev <zaitcev@xxxxxxxxxx>

diff --git a/server/cldu.c b/server/cldu.c
index b275234..706faeb 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -34,9 +34,17 @@ struct cld_session {
 	bool forced_hosts;		/* Administrator overrode default CLD */
 	bool sess_open;
 	struct cldc_udp *lib;		/* library state */
-	struct event tm;
 	int retry_cnt;
 
+	/*
+	 * For code sanity and being isomorphic with conventional programming
+	 * using sleep(), neither of the timers must ever be active simultane-
+	 * ously with any other. But using one timer structure is too annoying.
+	 */
+	struct event tm_retry;
+	struct event tm_rescan;
+	struct event tm_reopen;
+
 	int actx;		/* Active host cldv[actx] */
 	struct cld_host cldv[N_CLD];
 
@@ -74,6 +82,8 @@ static void add_remote(const char *name);
 static void add_chunk_node(struct cld_session *sp, const char *name);
 
 static struct timeval cldu_retry_delay = { 5, 0 };
+static struct timeval cldu_rescan_delay = { 50, 0 };
+static struct timeval cldu_reopen_delay = { 3, 0 };
 
 /*
  * Identify the next host to be tried.
@@ -132,15 +142,40 @@ err_oom:
 	return 0;
 }
 
-static void cldu_timer(int fd, short events, void *userdata)
+static void cldu_tm_retry(int fd, short events, void *userdata)
 {
 	struct cld_session *sp = userdata;
 
+	if (++sp->retry_cnt >= 5) {
+		applog(LOG_INFO, "Out of retries for %s, bailing", sp->xfname);
+		exit(1);
+	}
 	if (debugging)
-		applog(LOG_DEBUG, "Trying to open %s\n", sp->xfname);
+		applog(LOG_DEBUG, "Trying to open %s", sp->xfname);
 	try_open_x(sp);
 }
 
+static void cldu_tm_rescan(int fd, short events, void *userdata)
+{
+	struct cld_session *sp = userdata;
+
+	/* Add rescanning for tabled nodes as well. FIXME */
+	if (debugging)
+		applog(LOG_DEBUG, "Rescanning for Chunks in %s", sp->xfname);
+	try_open_x(sp);
+}
+
+static void cldu_tm_reopen(int fd, short events, void *userdata)
+{
+	struct cld_session *sp = userdata;
+
+	if (debugging)
+		applog(LOG_DEBUG, "Trying to reopen %d storage nodes",
+		       tabled_srv.num_stor);
+	if (stor_update_cb() < 1)
+		evtimer_add(&sp->tm_reopen, &cldu_reopen_delay);
+}
+
 static void cldu_event(int fd, short events, void *userdata)
 {
 	struct cld_session *sp = userdata;
@@ -508,10 +543,19 @@ static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	}
 
 	/*
+	 * If configuration gives us storage nodes, we shortcut scanning
+	 * of CLD, because:
+	 *  - the scanning may fail, and we should not care
+	 *  - NIDs for configured nodes are auto-assigned and may conflict
 	 * This will go away with the demise of <StorageNode>.
 	 */
 	if (tabled_srv.num_stor) {
-		stor_update_cb();
+		if (debugging)
+			applog(LOG_DEBUG, "Trying to open %d storage nodes",
+			       tabled_srv.num_stor);
+		if (stor_update_cb() < 1) {
+			evtimer_add(&sp->tm_reopen, &cldu_reopen_delay);
+		}
 		return 0;
 	}
 
@@ -528,11 +572,6 @@ static void try_open_x(struct cld_session *sp)
 	struct cldc_call_opts copts;
 	int rc;
 
-	if (++sp->retry_cnt >= 5) {
-		applog(LOG_INFO, "Out of retries for %s, bailing", sp->xfname);
-		exit(1);
-	}
-
 	memset(&copts, 0, sizeof(copts));
 	copts.cb = cldu_open_x_cb;
 	copts.private = sp;
@@ -551,13 +590,14 @@ static int cldu_open_x_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	int rc;
 
 	if (errc != CLE_OK) {
-		if (errc == CLE_INODE_INVAL) {
-			applog(LOG_ERR, "CLD open(%s) failed: %d, retrying",
-			       sp->xfname, errc);
-			evtimer_add(&sp->tm, &cldu_retry_delay);
+		if (errc == CLE_INODE_INVAL || errc == CLE_NAME_INVAL) {
+			applog(LOG_ERR, "%s: open failed, retrying",
+			       sp->xfname);
+			evtimer_add(&sp->tm_retry, &cldu_retry_delay);
 		} else {
 			applog(LOG_ERR, "CLD open(%s) failed: %d",
 			       sp->xfname, errc);
+			/* XXX we're dead, why not exit(1) right away? */
 		}
 		return 0;
 	}
@@ -647,8 +687,12 @@ static int cldu_close_x_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	}
 
 	if (list_empty(&sp->chunks)) {
-		applog(LOG_INFO, "%s: No Chunk nodes found", sp->xfname);
-		try_open_x(sp);
+		applog(LOG_INFO, "%s: No Chunk nodes found, retrying",
+		       sp->xfname);
+		if (evtimer_add(&sp->tm_retry, &cldu_retry_delay) != 0) {
+			applog(LOG_ERR, "evtimer_add error %s",
+			       strerror(errno));
+		}
 	} else {
 		next_chunk(sp);
 	}
@@ -738,8 +782,7 @@ static int cldu_get_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	ptr = carg->u.get.buf;
 	len = carg->u.get.size;
 	if (debugging)
-		applog(LOG_DEBUG,
-		       "got %d bytes from %s\n", len, sp->yfname);
+		applog(LOG_DEBUG, "got %d bytes from %s", len, sp->yfname);
 	stor_parse(sp->yfname, ptr, len);
 
 close_and_next:
@@ -771,8 +814,31 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	np = list_entry(sp->chunks.next, struct chunk_node, link);
 	list_del(&np->link);
 
-	if (!list_empty(&sp->chunks))
+	if (!list_empty(&sp->chunks)) {
 		next_chunk(sp);
+		return 0;
+	}
+
+	/*
+	 * No more chunks to consider in this cycle, we're all done.
+	 * Now, poke the dispatch about the possible changes in the
+	 * configuration of Chunk.
+	 *
+	 * It's possible that the CLD directories are full of all garbage,
+	 * but no useable Chunk servers. In that case, treat everything
+	 * like a usual retry.
+	 *
+	 * For the case of normal operation, we also set up a rescan, for now.
+	 * In the future, we'll subscribe for change notification. FIXME.
+	 */
+	if (stor_update_cb()) {
+		evtimer_add(&sp->tm_rescan, &cldu_rescan_delay);
+	} else {
+		if (evtimer_add(&sp->tm_retry, &cldu_retry_delay) != 0) {
+			applog(LOG_ERR, "evtimer_add error %s",
+			       strerror(errno));
+		}
+	}
 	return 0;
 }
 
@@ -836,7 +902,9 @@ int cld_begin(const char *thishost, const char *thiscell)
 {
 	static struct cld_session *sp = &ses;
 
-	evtimer_set(&ses.tm, cldu_timer, &ses);
+	evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
+	evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
+	evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);
 
 	if (cldu_setcell(sp, thiscell, thishost)) {
 		/* Already logged error */
@@ -935,7 +1003,9 @@ void cld_end(void)
 		}
 	}
 
-	evtimer_del(&sp->tm);
+	evtimer_del(&sp->tm_retry);
+	evtimer_del(&sp->tm_rescan);
+	evtimer_del(&sp->tm_reopen);
 
 	free(sp->cfname);
 	sp->cfname = NULL;
diff --git a/server/server.c b/server/server.c
index 0398a77..dfe9287 100644
--- a/server/server.c
+++ b/server/server.c
@@ -1375,20 +1375,32 @@ static void tdb_state_cb(enum db_event event)
  * with the interface to Chunk and little more.
  *
  * We don't even bother with registering this callback, just call it by name. 
+ *
+ * The return value is used to re-arm rescan mechanism.
  */
-void stor_update_cb(void)
+int stor_update_cb(void)
 {
+	int num_up;
+	struct storage_node *stn;
 	unsigned int env_flags;
 
 	if (debugging)
-		applog(LOG_DEBUG, "We now have %d storage node(s)",
+		applog(LOG_DEBUG, "Know of potential %d storage node(s)",
 		       tabled_srv.num_stor);
-	if (tabled_srv.num_stor < 1) {
-		/*
-		 * FIXME In the future, initiate net_close here if net was up.
-		 */
-		return;
+	num_up = 0;
+	list_for_each_entry(stn, &tabled_srv.all_stor, all_link) {
+		if (stor_node_check(stn) == 0) {
+			if (debugging)
+				applog(LOG_DEBUG, " NID %u is up", stn->id);
+			num_up++;
+		}
+	}
+	if (num_up < 1) {
+		applog(LOG_INFO, "No active storage node(s), waiting");
+		return num_up;
 	}
+	if (debugging)
+		applog(LOG_DEBUG, "Active storage node(s): %d", stn->id);
 
 	/*
 	 * We initiate operations even if there's no redundancy in order
@@ -1412,6 +1424,7 @@ void stor_update_cb(void)
 		 */
 		;
 	}
+	return num_up;
 }
 
 static int net_open(void)
@@ -1652,8 +1665,6 @@ int main (int argc, char *argv[])
 	event_init();
 	evtimer_set(&tabled_srv.chkpt_timer, tdb_checkpoint, NULL);
 
-	stor_init();
-
 	/* set up server networking */
 	rc = net_open();
 	if (rc)
diff --git a/server/storage.c b/server/storage.c
index 380f8b0..b2160ac 100644
--- a/server/storage.c
+++ b/server/storage.c
@@ -379,32 +379,14 @@ void stor_add_node(uint32_t nid, const char *hostname, const char *portstr,
 	return;
 }
 
-/*
- * Wait for chunkd instances to come up, in case we start simultaneously
- * from a "make check" or a parallel boot script on the same computer,
- * of if a datacenter is being brought up.
- */
-void stor_init(void)
+/* Return 0 if the node checks out ok */
+int stor_node_check(struct storage_node *stn)
 {
 	struct st_client *stc;
-	struct storage_node *stn;
 	char host[41];
 	char port[6];
 	int rc;
 
-	/*
-	 * Just grab the first one for now, until the redundancy gets done.
-	 */
-	if (list_empty(&tabled_srv.all_stor)) {
-		/*
-		 * Maybe we should wait until more of them come online?
-		 */
-		applog(LOG_ERR, "No chunkd nodes, impossible to continue");
-		exit(1);
-	}
-	stn = list_entry(tabled_srv.all_stor.next,
-			 struct storage_node, all_link);
-
 	rc = stor_new_stc(stn, &stc);
 	if (rc < 0) {
 		if (rc == -EINVAL) {
@@ -418,16 +400,13 @@ void stor_init(void)
 			} else {
 				applog(LOG_INFO, "Error connecting to chunkd");
 			}
-			exit(1);
+		} else {
+			applog(LOG_INFO, "Error %d connecting to chunkd", rc);
 		}
-		applog(LOG_INFO, "Error connecting to chunkd, retrying");
-
-		/*
-		 * Logged the condition, now start looping silently.
-		 */
-		while (stor_new_stc(stn, &stc) < 0) sleep(3);
+		return -1;
 	}
 
 	stc_free(stc);
+	return 0;
 }
 
diff --git a/server/storparse.c b/server/storparse.c
index 680b047..eb40dcd 100644
--- a/server/storparse.c
+++ b/server/storparse.c
@@ -390,7 +390,6 @@ void stor_parse(char *fname, const char *text, size_t len)
 		applog(LOG_DEBUG, "Adding NID %u host %s port %s",
 		       ctx.nid, ctx.stor_ok_host, ctx.stor_ok_port);
 	stor_add_node(ctx.nid, ctx.stor_ok_host, ctx.stor_ok_port, &ctx.loc);
-	stor_update_cb();
 
 out_free_all:
 	free(ctx.text);
diff --git a/server/tabled.h b/server/tabled.h
index 918425a..2e3b52f 100644
--- a/server/tabled.h
+++ b/server/tabled.h
@@ -300,7 +300,7 @@ extern bool cli_cb_free(struct client *cli, struct client_write *wr,
 extern bool cli_write_start(struct client *cli);
 extern int cli_req_avail(struct client *cli);
 extern void applog(int prio, const char *fmt, ...);
-extern void stor_update_cb(void);
+extern int stor_update_cb(void);
 
 /* config.c */
 extern void read_config(void);
@@ -320,7 +320,7 @@ extern int stor_obj_del(struct storage_node *stn, uint64_t key);
 extern bool stor_obj_test(struct open_chunk *cep, uint64_t key);
 extern void stor_add_node(uint32_t nid, const char *hostname,
 			  const char *portstr, struct geo *locp);
-extern void stor_init(void);
+extern int stor_node_check(struct storage_node *stn);
 
 /* storparse.c */
 extern void stor_parse(char *fname, const char *text, size_t len);
diff --git a/test/start-daemon b/test/start-daemon
index 203e392..efe0548 100755
--- a/test/start-daemon
+++ b/test/start-daemon
@@ -19,12 +19,12 @@ fi
 # May be different on Solaris... like /usr/libexec or such.
 cld -d data/cld -P cld.pid -p 18081 -E
 chunkd -C $top_srcdir/test/chunkd-test.conf -E
-
-# 3 is enough, but we like to let chunkd to come up early and register with CLD
-sleep 7
-
 ../server/tabled -C $top_srcdir/test/tabled-test.conf -E -D
 
+# We have a special client that waits for tabled to come online, but before
+# that we also have tests that verify that the daemons started and wrote
+# their PID files. So, we need to give the above daemons a chance to progress
+# at least that far.
 sleep 3
 
 exit 0
diff --git a/test/wait-for-listen.c b/test/wait-for-listen.c
index 8ba190f..cb30183 100644
--- a/test/wait-for-listen.c
+++ b/test/wait-for-listen.c
@@ -80,9 +80,11 @@ int main()
 	for (;;) {
 		/*
  		 * Vote in DB4 replication takes about 12-13s.
- 		 */
-		if (time(NULL) >= start_time + 20) {
-			fprintf(stderr, "server is not up after 20 s\n");
+		 * In addition we may have retries when tabled polls for
+		 * Chunk daemons to come up. On busy boxes we may miss 20s.
+		 */
+		if (time(NULL) >= start_time + 25) {
+			fprintf(stderr, "server is not up after 25 s\n");
 			exit(1);
 		}
 
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html