commit 38e47f046d4d3f9657e9644417ad8b154b46a6e7 Author: Jeff Garzik <jeff@xxxxxxxxxx> Date: Fri Mar 5 08:03:48 2010 -0500 Self-check cleanups. - remove self-check period. self-check period is externally controlled. - self-check initiation limited to specified list of administrative users - clear up confusion about check "start" versus "poke." "start" is preferred. - thread is not spawned until first check - add some whitespace for readability - Unrelated: test/server-test.cfg: s/spaces/tab/ in <CLD> clause Signed-off-by: Jeff Garzik <jgarzik@xxxxxxxxxx> diff --git a/doc/setup.txt b/doc/setup.txt index f8d9f00..a5e7c66 100644 --- a/doc/setup.txt +++ b/doc/setup.txt @@ -104,15 +104,12 @@ _cld._udp.phx2.ex.com has SRV record 10 50 8081 maika.phx2.ex.com. <Rack>F3R18</Rack> </Geo> -*) configure the self-check period, if desired. Note that the actual amount - of time that the daemon sleeps between self-check sweeps is randomized - in order to prevent the convoy effect, so the value below is approximate. +*) configure the list of users authorized to initiate background self-check - <!-- 30 * 60 * 60 == 108000 seconds --> - <SelfCheckPeriod>108000</SelfCheckPeriod> - - The default is zero, which disables the periodic self-check. It may be - appropriate if the application performs its own end-to-end checking. + <Check> + <User>admin_user</User> + <User>another_user</User> + </Check> *) start daemon (it will put itself into the background) with the configuration file just created: diff --git a/include/chunkc.h b/include/chunkc.h index 5ebc936..683992e 100644 --- a/include/chunkc.h +++ b/include/chunkc.h @@ -89,7 +89,7 @@ extern bool stc_put_inline(struct st_client *stc, const void *key, extern bool stc_del(struct st_client *stc, const void *key, size_t key_len); extern bool stc_ping(struct st_client *stc); -extern bool stc_check_poke(struct st_client *stc); +extern bool stc_check_start(struct st_client *stc); extern bool stc_check_status(struct st_client *stc, struct chunk_check_status *out); diff --git a/lib/chunkdc.c b/lib/chunkdc.c index 93783bf..e9e0e0f 100644 --- a/lib/chunkdc.c +++ b/lib/chunkdc.c @@ -978,13 +978,13 @@ bool stc_ping(struct st_client *stc) return true; } -bool stc_check_poke(struct st_client *stc) +bool stc_check_start(struct st_client *stc) { struct chunksrv_resp resp; struct chunksrv_req *req = (struct chunksrv_req *) stc->req_buf; if (stc->verbose) - fprintf(stderr, "libstc: CHECK_POKE\n"); + fprintf(stderr, "libstc: CHECK_START\n"); /* initialize request */ req_init(stc, req); @@ -1004,7 +1004,7 @@ bool stc_check_poke(struct st_client *stc) /* check response code */ if (resp.resp_code != che_Success) { if (stc->verbose) - fprintf(stderr, "CHECK_POKE resp code: %d\n", + fprintf(stderr, "CHECK_START resp code: %d\n", resp.resp_code); return false; } diff --git a/server/chunkd.h b/server/chunkd.h index 41e0a4b..42bcb21 100644 --- a/server/chunkd.h +++ b/server/chunkd.h @@ -223,8 +223,9 @@ struct server { char *group; uint32_t nid; struct geo loc; - time_t chk_period; + int chk_pipe[2]; + GList *chk_users; TCHDB *tbl_master; struct objcache actives; @@ -342,8 +343,7 @@ extern void resp_init_req(struct chunksrv_resp *resp, extern void read_config(void); /* selfcheck.c */ -extern void chk_init(void); -extern int chk_spawn(time_t period, TCHDB *hdb); +extern int chk_spawn(TCHDB *hdb); static inline bool use_sendfile(struct client *cli) { diff --git a/server/config.c b/server/config.c index fee2c0c..69b5e9b 100644 --- a/server/config.c +++ b/server/config.c @@ -41,6 +41,7 @@ struct config_context { bool badnid; bool in_ssl; bool in_listen; + bool in_chk; bool have_ssl; char *vol_path; @@ -133,6 +134,13 @@ static void cfg_elm_start (GMarkupParseContext *context, applog(LOG_ERR, "Nested CLD in configuration"); } } + else if (!strcmp(element_name, "Check")) { + if (!cc->in_chk) { + cc->in_chk = true; + } else { + applog(LOG_ERR, "Nested Check in configuration"); + } + } } static void cfg_elm_end_listen(struct config_context *cc) @@ -293,7 +301,16 @@ static void cfg_elm_end (GMarkupParseContext *context, else if (!strcmp(element_name, "SSL")) cc->in_ssl = false; - else if (cc->in_ssl && cc->text && !strcmp(element_name, "PrivateKey")) { + else if (cc->in_chk && cc->text && !strcmp(element_name, "User")) { + chunkd_srv.chk_users = + g_list_append(chunkd_srv.chk_users, cc->text); + cc->text = NULL; + } + + else if (!strcmp(element_name, "Check")) + cc->in_chk = false; + + else if (cc->in_ssl && cc->text && !strcmp(element_name, "PrivateKey")){ if (SSL_CTX_use_PrivateKey_file(ssl_ctx, cc->text, SSL_FILETYPE_PEM) <= 0) applog(LOG_ERR, "Failed to read SSL private key '%s'", @@ -442,24 +459,6 @@ static void cfg_elm_end (GMarkupParseContext *context, cc->text = NULL; } - else if (!strcmp(element_name, "SelfCheckPeriod")) { - if (!cc->text) { - applog(LOG_WARNING, "SelfCheckPeriod element empty"); - return; - } - n = strtol(cc->text, NULL, 10); - if (n < 0 || n >= LONG_MAX) { - applog(LOG_ERR, "SelfCheckPeriod '%s' is invalid", - cc->text); - free(cc->text); - cc->text = NULL; - return; - } - chunkd_srv.chk_period = n; - free(cc->text); - cc->text = NULL; - } - else { applog(LOG_WARNING, "Unknown element \"%s\"", element_name); } diff --git a/server/selfcheck.c b/server/selfcheck.c index f8cbaef..5207e7f 100644 --- a/server/selfcheck.c +++ b/server/selfcheck.c @@ -13,7 +13,6 @@ #include "chunkd.h" struct chk_arg { - time_t period; TCHDB *hdb; // GThread *gthread; }; @@ -183,62 +182,46 @@ static void chk_thread_command(struct chk_tls *tls) static gpointer chk_thread_func(gpointer data) { - struct chk_tls _tls; - struct chk_tls *tls; + struct chk_tls _tls = { .arg = data }; + struct chk_tls *tls = &_tls; struct pollfd pfd[1]; - int tmo; - time_t dt; int i; int rc; - tls = &_tls; - memset(tls, 0, sizeof(struct chk_tls)); - tls->arg = data; - for (;;) { g_mutex_lock(chunkd_srv.bigmutex); chunkd_srv.chk_state = CHK_ST_IDLE; g_mutex_unlock(chunkd_srv.bigmutex); - /* - * Without the randomization, all systems in a low loaded - * datacenter are virtually guaranteed to start checks in the - * same time, blow fuses and/or disrupt applications. - */ - dt = tls->arg->period; - if (dt == 0) { - tmo = -1; - } else { - if (dt >= 3) - dt += rand() % (dt/3); - if (debugging) - applog(LOG_DEBUG, "chk: sleeping %lu s", dt); - tmo = dt * 1000; - } - memset(pfd, 0, sizeof(pfd)); pfd[0].fd = chunkd_srv.chk_pipe[0]; pfd[0].events = POLLIN; - rc = poll(pfd, 1, tmo); + rc = poll(pfd, ARRAY_SIZE(pfd), -1); if (rc < 0) { applog(LOG_WARNING, "chk: poll error: %s", strerror(errno)); break; /* don't flood, just die */ } - if (rc) { - for (i = 0; i < ARRAY_SIZE(pfd); i++) { - if (pfd[i].revents) { - if (i == 0) { - chk_thread_command(tls); - } - } + if (rc == 0) + continue; /* should never happen */ + + for (i = 0; i < ARRAY_SIZE(pfd); i++) { + if (!pfd[i].revents) + continue; + + switch (i) { + case 0: + chk_thread_command(tls); + break; + default: + /* do nothing */ + break; } - } else { - chk_thread_scan(tls); } } + return NULL; } @@ -249,7 +232,7 @@ static gpointer chk_thread_func(gpointer data) */ static struct chk_arg *thread; -int chk_spawn(time_t period, TCHDB *hdb) +int chk_spawn(TCHDB *hdb) { GThread *gthread; struct chk_arg *arg; @@ -260,7 +243,6 @@ int chk_spawn(time_t period, TCHDB *hdb) applog(LOG_ERR, "No core"); return -1; } - arg->period = period; arg->hdb = hdb; gthread = g_thread_create(chk_thread_func, arg, FALSE, &error); @@ -276,18 +258,3 @@ int chk_spawn(time_t period, TCHDB *hdb) return 0; } -void chk_init() -{ - int rc; - - if (!chunkd_srv.chk_period) { - if (debugging) - applog(LOG_DEBUG, "chk: disabled by configuration"); - return; - } - - chunkd_srv.chk_state = CHK_ST_INIT; - rc = chk_spawn(chunkd_srv.chk_period, chunkd_srv.tbl_master); - if (rc) - exit(1); /* message already logged */ -} diff --git a/server/server.c b/server/server.c index 1dd9d8f..0d83311 100644 --- a/server/server.c +++ b/server/server.c @@ -933,24 +933,47 @@ err_out: return cli_err(cli, err, false); } -static bool chk_poke(struct client *cli) +static bool chk_user_authorized(struct client *cli) +{ + GList *tmp = chunkd_srv.chk_users; + + while (tmp) { + char *s; + + s = tmp->data; + if (!strcmp(cli->user, s)) + return true; + + tmp = tmp->next; + } + + return false; +} + +static bool chk_start(struct client *cli) { unsigned char cmd; int rc; + if (!chk_user_authorized(cli)) + return cli_err(cli, che_AccessDenied, true); + g_mutex_lock(chunkd_srv.bigmutex); + switch (chunkd_srv.chk_state) { case CHK_ST_OFF: chunkd_srv.chk_state = CHK_ST_INIT; g_mutex_unlock(chunkd_srv.bigmutex); - rc = chk_spawn(chunkd_srv.chk_period, chunkd_srv.tbl_master); + rc = chk_spawn(chunkd_srv.tbl_master); if (rc) return cli_err(cli, che_InternalError, true); break; + case CHK_ST_INIT: case CHK_ST_RUNNING: g_mutex_unlock(chunkd_srv.bigmutex); return cli_err(cli, che_Busy, true); + default: chunkd_srv.chk_state = CHK_ST_RUNNING; g_mutex_unlock(chunkd_srv.bigmutex); @@ -966,8 +989,11 @@ static bool chk_status(struct client *cli) struct chunk_check_status outbuf; memset(&outbuf, 0, sizeof(struct chunk_check_status)); + g_mutex_lock(chunkd_srv.bigmutex); + outbuf.lastdone = cpu_to_le64(chunkd_srv.chk_done); + switch (chunkd_srv.chk_state) { case CHK_ST_IDLE: outbuf.state = chk_Idle; @@ -979,6 +1005,7 @@ static bool chk_status(struct client *cli) default: outbuf.state = chk_Off; } + g_mutex_unlock(chunkd_srv.bigmutex); return cli_resp_bin(cli, &outbuf, sizeof(struct chunk_check_status)); @@ -1110,7 +1137,7 @@ static bool cli_evt_exec_req(struct client *cli, unsigned int events) rcb = volume_open(cli); break; case CHO_CHECK_START: - rcb = chk_poke(cli); + rcb = chk_start(cli); break; case CHO_CHECK_STATUS: rcb = chk_status(cli); @@ -1804,8 +1831,6 @@ int main (int argc, char *argv[]) goto err_out_cld; } - chk_init(); - /* set up server networking */ for (tmpl = chunkd_srv.listeners; tmpl; tmpl = tmpl->next) { rc = net_open(tmpl->data); diff --git a/test/selfcheck-unit.c b/test/selfcheck-unit.c index fbd9a09..ead5568 100644 --- a/test/selfcheck-unit.c +++ b/test/selfcheck-unit.c @@ -37,7 +37,6 @@ struct config_context { char *text; char *path; - long period; }; static void cfg_elm_start(GMarkupParseContext *context, @@ -56,20 +55,12 @@ static void cfg_elm_end(GMarkupParseContext *context, GError **error) { struct config_context *cc = user_data; - long n; if (!strcmp(element_name, "Path")) { OK(cc->text); free(cc->path); cc->path = cc->text; cc->text = NULL; - } else if (!strcmp(element_name, "SelfCheckPeriod")) { - OK(cc->text); - n = strtol(cc->text, NULL, 10); - OK(n >= 0 && n < LONG_MAX); - cc->period = n; - free(cc->text); - cc->text = NULL; } else { free(cc->text); cc->text = NULL; @@ -251,7 +242,6 @@ int main(int argc, char *argv[]) memset(&ctx, 0, sizeof(struct config_context)); read_config(&ctx); OK(ctx.path); /* must have a path */ - OK(!ctx.period); /* should have a SelfCheckPeriod set to off */ /* * Step 1: create the object @@ -275,11 +265,9 @@ int main(int argc, char *argv[]) OK(rcb); /* - * Step 3: damage the back-end file and wait - * We sleep for longer than the self-check period because: - * 1) the server sleeps for up to period*1.5 - * 2) the server may be quite busy walking numerous objects - * 3) the build system may be overloaded + * Step 3: damage the back-end file and wait, because: + * 1) the server may be quite busy walking numerous objects + * 2) the build system may be overloaded */ rcb = be_file_damage(fn); OK(rcb); @@ -291,7 +279,7 @@ int main(int argc, char *argv[]) OK(stc); rcb = stc_check_status(stc, &status1); OK(rcb); - rcb = stc_check_poke(stc); + rcb = stc_check_start(stc); OK(rcb); cnt = 0; for (;;) { diff --git a/test/server-test.cfg b/test/server-test.cfg index 80833d2..d98fca4 100644 --- a/test/server-test.cfg +++ b/test/server-test.cfg @@ -23,7 +23,12 @@ <NID>1</NID> <CLD> - <PortFile>cld.port</PortFile> - <Host>localhost</Host> + <PortFile>cld.port</PortFile> + <Host>localhost</Host> </CLD> +<Check> + <User>testuser</User> + <User>testuser2</User> +</Check> + diff --git a/tools/chcli.c b/tools/chcli.c index 24b3e4b..6fde35f 100644 --- a/tools/chcli.c +++ b/tools/chcli.c @@ -185,7 +185,7 @@ static void show_cmds(void) "DEL key Delete key\n" "PING Ping server\n" "CHECKSTATUS Fetch status of server self-check\n" -"CHECKPOKE Force server self-check\n" +"CHECKSTART Begin server self-check\n" "\n" "Keys provided on the command line (as opposed to via -k) are stored\n" "with a C-style nul terminating character appended, adding 1 byte to\n" @@ -639,7 +639,7 @@ static int cmd_get(void) return 0; } -static int cmd_check_poke(void) +static int cmd_check_start(void) { struct st_client *stc; @@ -647,8 +647,8 @@ static int cmd_check_poke(void) if (!stc) return 1; - if (!stc_check_poke(stc)) { - fprintf(stderr, "CHECK POKE failed\n"); + if (!stc_check_start(stc)) { + fprintf(stderr, "CHECK START failed\n"); stc_free(stc); return 1; } @@ -758,7 +758,7 @@ int main (int argc, char *argv[]) case CHC_CHECKSTATUS: return cmd_check_status(); case CHC_CHECKSTART: - return cmd_check_poke(); + return cmd_check_start(); } return 0; -- To unsubscribe from this list: send the line "unsubscribe hail-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html