Hi lixiaokeng, On Tue, 2021-03-02 at 16:29 +0100, Martin Wilck wrote: > On Tue, 2021-03-02 at 20:44 +0800, lixiaokeng wrote: > > > > > > The stacks you have shown indicate that the instruction pointers were > broken. That would suggest something similar to what was discussed in the ML > thread leading to 38ffd89 ("libmultipath: prevent DSO unloading with > astray checker threads"). Your logs show "tur checker refcount 1", so > the next call to checker_put would have unloaded the DSO. > > Please try commenting out the dlclose() call in free_checker_class(), > and see if it makes a difference. I have two TENTATIVE patches here that I'd like to ask you to try (with the dlclose in place again). Also, please make sure you've got 38ffd89. This is really tentative; I'm still pretty much in the dark. But my theory is that the crash can happen if the thread is about to start. So the most important part is the hunk that checks the return value of checker_class_ref() in start_checker_thread(). Martin
From a4dd64808d49f5a0d2a94336e56401262ef99e55 Mon Sep 17 00:00:00 2001 From: Martin Wilck <mwilck@xxxxxxxx> Date: Tue, 2 Mar 2021 17:03:15 +0100 Subject: [PATCH 1/2] libmultipath: protect DSO unloading with RCU Some crashes possibly related to DSO unloading are still observed. Try protecting the unloading with RCU. Signed-off-by: Martin Wilck <mwilck@xxxxxxxx> --- libmultipath/checkers.c | 79 ++++++++++++++++++++++++++++++----------- libmultipath/propsel.c | 4 +++ 2 files changed, 63 insertions(+), 20 deletions(-) diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c index 2dd9915..25f07ce 100644 --- a/libmultipath/checkers.c +++ b/libmultipath/checkers.c @@ -3,6 +3,7 @@ #include <stddef.h> #include <dlfcn.h> #include <sys/stat.h> +#include <errno.h> #include <urcu.h> #include <urcu/uatomic.h> @@ -25,6 +26,7 @@ struct checker_class { void *(*thread)(void *); /* async thread entry point */ const char **msgtable; short msgtable_size; + struct rcu_head rcu; }; static const char *checker_state_names[PATH_MAX_STATE] = { @@ -74,20 +76,16 @@ static int checker_class_unref(struct checker_class *cls) return uatomic_sub_return(&cls->refcount, 1); } -void free_checker_class(struct checker_class *c) +static void free_checker_class_rcu(struct rcu_head *head) { - int cnt; + struct checker_class *c = container_of(head, struct checker_class, rcu); - if (!c) - return; - cnt = checker_class_unref(c); - if (cnt != 0) { - condlog(cnt < 0 ? 1 : 4, "%s checker refcount %d", - c->name, cnt); + if (uatomic_read(&c-refcount) > 0) { + condlog(1, "%s: RACE: refcount = %d, not freeing checker", + __func__, refcount); return; } condlog(3, "unloading %s checker", c->name); - list_del(&c->node); if (c->reset) c->reset(); if (c->handle) { @@ -99,6 +97,22 @@ void free_checker_class(struct checker_class *c) FREE(c); } +static void free_checker_class(struct checker_class *c) +{ + int cnt; + + if (!c) + return; + cnt = checker_class_unref(c); + if (cnt != 0) { + condlog(cnt < 0 ? 
1 : 4, "%s checker refcount %d", + c->name, cnt); + return; + } + list_del(&c->node); + call_rcu(&c->rcu, free_checker_class_rcu); +} + void cleanup_checkers (void) { struct checker_class *checker_loop; @@ -111,15 +125,32 @@ void cleanup_checkers (void) static struct checker_class *checker_class_lookup(const char *name) { - struct checker_class *c; + struct checker_class *c, *found = NULL; + int refcount = 0; if (!name || !strlen(name)) return NULL; + + rcu_read_lock(); list_for_each_entry(c, &checkers, node) { - if (!strncmp(name, c->name, CHECKER_NAME_LEN)) - return c; + if (!strncmp(name, c->name, CHECKER_NAME_LEN)) { + found = c; + break; + } } - return NULL; + if (found) { + refcount = checker_class_ref(found); + if (refcount == 1) + checker_class_unref(found); + } + rcu_read_unlock(); + + if (refcount <= 1) { + condlog(1, "%s: RACE: got refcount == %d", __func__, refcount); + found = NULL; + } + + return found; } void reset_checker_classes(void) @@ -387,11 +418,20 @@ static void *checker_thread_entry(void *arg) int start_checker_thread(pthread_t *thread, const pthread_attr_t *attr, struct checker_context *ctx) { - int rv; + int rv, refcount; assert(ctx && ctx->cls && ctx->cls->thread); + /* Take a ref here, lest the class be freed before the thread starts */ - (void)checker_class_ref(ctx->cls); + rcu_read_lock(); + refcount = checker_class_ref(ctx->cls); + if (refcount <= 1) + checker_class_unref(ctx->cls); + rcu_read_unlock(); + if (refcount <= 1) + condlog(1, "%s: RACE: got refcount == %d", __func_, refcount); + return EIO; + } rv = pthread_create(thread, attr, checker_thread_entry, ctx); if (rv != 0) { condlog(1, "failed to start checker thread for %s: %m", @@ -418,14 +458,13 @@ void checker_get(const char *multipath_dir, struct checker *dst, if (name && strlen(name)) { src = checker_class_lookup(name); - if (!src) + if (!src) { src = add_checker_class(multipath_dir, name); + if (src && checker_class_ref(src) == 1) + src = NULL; + } } dst->cls = src; - if 
(!src) - return; - - (void)checker_class_ref(dst->cls); } int init_checkers(const char *multipath_dir) diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c index f771a83..4add95a 100644 --- a/libmultipath/propsel.c +++ b/libmultipath/propsel.c @@ -536,6 +536,10 @@ int select_checker(struct config *conf, struct path *pp) do_default(ckr_name, DEFAULT_CHECKER); out: checker_get(conf->multipath_dir, c, ckr_name); + if (!checker_selected(c)) { + condlog(1, "%s: failed to grab checker", __func__); + return 1; + } condlog(3, "%s: path_checker = %s %s", pp->dev, checker_name(c), origin); if (conf->checker_timeout) { -- 2.29.2
From c44375bb5e218b1e54ca4d9069b2b1632df87f75 Mon Sep 17 00:00:00 2001 From: Martin Wilck <mwilck@xxxxxxxx> Date: Tue, 2 Mar 2021 17:05:26 +0100 Subject: [PATCH 2/2] libmultipath: tur_thread: use pthread_exit() Using "return" would jump into a different DSO (libmultipath), avoid that. Signed-off-by: Martin Wilck <mwilck@xxxxxxxx> --- libmultipath/checkers.c | 11 ++++++----- libmultipath/checkers/tur.c | 2 ++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c index 25f07ce..99e48bc 100644 --- a/libmultipath/checkers.c +++ b/libmultipath/checkers.c @@ -79,8 +79,9 @@ static int checker_class_unref(struct checker_class *cls) static void free_checker_class_rcu(struct rcu_head *head) { struct checker_class *c = container_of(head, struct checker_class, rcu); + int refcount; - if (uatomic_read(&c-refcount) > 0) { + if ((refcount = uatomic_read(&c->refcount)) > 0) { condlog(1, "%s: RACE: refcount = %d, not freeing checker", __func__, refcount); return; @@ -145,7 +146,7 @@ static struct checker_class *checker_class_lookup(const char *name) } rcu_read_unlock(); - if (refcount <= 1) { + if (refcount == 1) { condlog(1, "%s: RACE: got refcount == %d", __func__, refcount); found = NULL; } @@ -425,11 +426,11 @@ int start_checker_thread(pthread_t *thread, const pthread_attr_t *attr, /* Take a ref here, lest the class be freed before the thread starts */ rcu_read_lock(); refcount = checker_class_ref(ctx->cls); - if (refcount <= 1) + if (refcount == 1) checker_class_unref(ctx->cls); rcu_read_unlock(); - if (refcount <= 1) - condlog(1, "%s: RACE: got refcount == %d", __func_, refcount); + if (refcount <= 1) { + condlog(1, "%s: RACE: got refcount == %d", __func__, refcount); return EIO; } rv = pthread_create(thread, attr, checker_thread_entry, ctx); diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c index a4b4a21..0db50ba 100644 --- a/libmultipath/checkers/tur.c +++ b/libmultipath/checkers/tur.c @@ -284,6 
+284,8 @@ void *libcheck_thread(struct checker_context *ctx) tur_thread_cleanup_pop(ct); + pthread_exit(NULL); + /* not reached */ return ((void *)0); } -- 2.29.2
-- dm-devel mailing list dm-devel@xxxxxxxxxx https://listman.redhat.com/mailman/listinfo/dm-devel