[Patch 4/7] tabled: retry conflicting locks

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This problem has been with us for a while, and even with this fix our
start-up is not reliable. But at least we are no longer 100% guaranteed to
hang, as we were before, when restarting too quickly. So although the whole
area needs some serious reworking, this specific case was just too annoying
to leave unfixed.

Signed-off-by: Pete Zaitcev <zaitcev@xxxxxxxxxx>

---
 server/cldu.c |   38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

commit fa910aacff5118664177f988029cc5f8e6ef886d
Author: Master <zaitcev@xxxxxxxxxxxxxxxxxx>
Date:   Thu Jan 14 19:56:13 2010 -0700

    Retry the lock conflict.

diff --git a/server/cldu.c b/server/cldu.c
index 273f149..1d61672 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -59,6 +59,7 @@ struct cld_session {
 	 * using sleep(), neither of the timers must ever be active simultane-
 	 * ously with any other. But using one timer structure is too annoying.
 	 */
+	struct event tm_relock;
 	struct event tm_retry;
 	struct event tm_rescan;
 	struct event tm_reopen;
@@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int newactive);
 static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
+static void try_lock(struct cld_session *sp);
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
 static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
@@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static void add_remote(const char *name);
 static void add_chunk_node(struct cld_session *sp, const char *name);
 
+static struct timeval cldu_relock_delay = { 10, 0 };
 static struct timeval cldu_retry_delay = { 5, 0 };
 static struct timeval cldu_rescan_delay = { 50, 0 };
 static struct timeval cldu_reopen_delay = { 3, 0 };
@@ -168,6 +171,15 @@ err_oom:
 	return 0;
 }
 
+static void cldu_tm_relock(int fd, short events, void *userdata)
+{
+	struct cld_session *sp = userdata;
+
+	if (debugging)
+		applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname);
+	try_lock(sp);
+}
+
 static void cldu_tm_retry(int fd, short events, void *userdata)
 {
 	struct cld_session *sp = userdata;
@@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 {
 	struct cld_session *sp = carg->private;
-	struct cldc_call_opts copts;
-	int rc;
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc);
@@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (debugging)
 		applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname);
 
+	try_lock(sp);
+	return 0;
+}
+
+static void try_lock(struct cld_session *sp)
+{
+	struct cldc_call_opts copts;
+	int rc;
+
 	/*
 	 * Lock the file, in case two hosts got the same hostname.
 	 */
@@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 	if (rc) {
 		applog(LOG_ERR, "cldc_lock call error %d", rc);
 	}
-
-	return 0;
 }
 
 static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
@@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
 
 	if (errc != CLE_OK) {
 		applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc);
+		if (errc == CLE_LOCK_CONFLICT) {
+			/*
+			 * The usual reason why we get a lock conflict is
+			 * restarting too quickly and hitting the previous lock
+			 * that is going to disappear soon.
+			 *
+			 * FIXME: However, it may also be that a master
+			 * is ok and we should become a slave, e.g. start TDB.
+			 * We do not support multi-node, but we should.
+			 */
+			evtimer_add(&sp->tm_relock, &cldu_relock_delay);
+		}
 		return 0;
 	}
 
@@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell)
 {
 	static struct cld_session *sp = &ses;
 
+	evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses);
 	evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
 	evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
 	evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Fedora Cloud]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux