This problem has been with us for a while, and even with this fix our start-up is not reliable. But at least we will no longer be 100% guaranteed to hang, as before, when restarting too quickly. So although the whole area needs some serious reworking, this specific case was just too annoying to let continue. Signed-Off-By: Pete Zaitcev <zaitcev@xxxxxxxxxx> --- server/cldu.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) commit fa910aacff5118664177f988029cc5f8e6ef886d Author: Master <zaitcev@xxxxxxxxxxxxxxxxxx> Date: Thu Jan 14 19:56:13 2010 -0700 Retry the lock conflict. diff --git a/server/cldu.c b/server/cldu.c index 273f149..1d61672 100644 --- a/server/cldu.c +++ b/server/cldu.c @@ -59,6 +59,7 @@ struct cld_session { * using sleep(), neither of the timers must ever be active simultane- * ously with any other. But using one timer structure is too annoying. */ + struct event tm_relock; struct event tm_retry; struct event tm_rescan; struct event tm_reopen; @@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int newactive); static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc); static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc); static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc); +static void try_lock(struct cld_session *sp); static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc); static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc); static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc); @@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) static void add_remote(const char *name); static void add_chunk_node(struct cld_session *sp, const char *name); +static struct timeval cldu_relock_delay = { 10, 0 }; static struct timeval cldu_retry_delay = { 5, 0 }; static struct timeval cldu_rescan_delay = { 50, 0 }; static struct
timeval cldu_reopen_delay = { 3, 0 }; @@ -168,6 +171,15 @@ err_oom: return 0; } +static void cldu_tm_relock(int fd, short events, void *userdata) +{ + struct cld_session *sp = userdata; + + if (debugging) + applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname); + try_lock(sp); +} + static void cldu_tm_retry(int fd, short events, void *userdata) { struct cld_session *sp = userdata; @@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) { struct cld_session *sp = carg->private; - struct cldc_call_opts copts; - int rc; if (errc != CLE_OK) { applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc); @@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) if (debugging) applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname); + try_lock(sp); + return 0; +} + +static void try_lock(struct cld_session *sp) +{ + struct cldc_call_opts copts; + int rc; + /* * Lock the file, in case two hosts got the same hostname. */ @@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) if (rc) { applog(LOG_ERR, "cldc_lock call error %d", rc); } - - return 0; } static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) @@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc) if (errc != CLE_OK) { applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc); + if (errc == CLE_LOCK_CONFLICT) { + /* + * The usual reason why we get a lock conflict is + * restarting too quickly and hitting the previous lock + * that is going to disappear soon. + * + * FIXME: However, it may also be that a master + * is ok and we should become a slave, e.g. start TDB. + * We do not support multi-node, but we should. 
+ */ + evtimer_add(&sp->tm_relock, &cldu_relock_delay); + } return 0; } @@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell) { static struct cld_session *sp = &ses; + evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses); evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses); evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses); evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses); -- To unsubscribe from this list: send the line "unsubscribe hail-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html