On Tue, 2021-01-12 at 17:52 -0600, Benjamin Marzinski wrote: > The checker thread is reponsible for enqueueing paths for the > io_err_stat thread to check. During shutdown, the io_err_stat thread > is > shut down and cleaned up before the checker thread. There is no code > to make sure that the checker thread isn't accessing the io_err_stat > pathvec or its mutex while they are being freed, which can lead to > memory corruption crashes. > > To solve this, get rid of the io_err_stat_pathvec structure, and > statically define the mutex. This means that the mutex is always > valid > to access, and the io_err_stat pathvec can only be accessed while > holding it. If the io_err_stat thread has already been cleaned up > when the checker tries to access the pathvec, it will be NULL, and > the > checker will simply fail to enqueue the path. > > This change also fixes a bug in free_io_err_pathvec(), which > previously > only attempted to free the pathvec if it was not set, instead of when > it > was set. > > Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx> Looks good to me. A few minor notes below. Regards, Martin > --- > libmultipath/io_err_stat.c | 108 +++++++++++++++-------------------- > -- > 1 file changed, 44 insertions(+), 64 deletions(-) > > diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c > index 2e48ee81..4c6f7f08 100644 > --- a/libmultipath/io_err_stat.c > +++ b/libmultipath/io_err_stat.c > @@ -46,12 +46,6 @@ > #define io_err_stat_log(prio, fmt, args...) \ > condlog(prio, "io error statistic: " fmt, ##args) > > - > -struct io_err_stat_pathvec { > - pthread_mutex_t mutex; > - vector pathvec; > -}; > - > struct dio_ctx { > struct timespec io_starttime; > unsigned int blksize; > @@ -75,9 +69,10 @@ static pthread_t io_err_stat_thr; > > static pthread_mutex_t io_err_thread_lock = > PTHREAD_MUTEX_INITIALIZER; > static pthread_cond_t io_err_thread_cond = PTHREAD_COND_INITIALIZER; > +static pthread_mutex_t io_err_pathvec_lock = > PTHREAD_MUTEX_INITIALIZER; > static int io_err_thread_running = 0; > > -static struct io_err_stat_pathvec *paths; > +static vector io_err_pathvec; > struct vectors *vecs; > io_context_t ioctx; > > @@ -207,46 +202,28 @@ static void free_io_err_stat_path(struct > io_err_stat_path *p) > FREE(p); > } > > -static struct io_err_stat_pathvec *alloc_pathvec(void) > +static void cleanup_unlock(void *arg) Nitpick: we've got the cleanup_mutex() utility function for this now. > { > - struct io_err_stat_pathvec *p; > - int r; > - > - p = (struct io_err_stat_pathvec *)MALLOC(sizeof(*p)); > - if (!p) > - return NULL; > - p->pathvec = vector_alloc(); > - if (!p->pathvec) > - goto out_free_struct_pathvec; > - r = pthread_mutex_init(&p->mutex, NULL); > - if (r) > - goto out_free_member_pathvec; > - > - return p; > - > -out_free_member_pathvec: > - vector_free(p->pathvec); > -out_free_struct_pathvec: > - FREE(p); > - return NULL; > + pthread_mutex_unlock((pthread_mutex_t*) arg); > } > > -static void free_io_err_pathvec(struct io_err_stat_pathvec *p) > +static void free_io_err_pathvec(void) > { > struct io_err_stat_path *path; > int i; > > - if (!p) > - return; > - pthread_mutex_destroy(&p->mutex); > - if (!p->pathvec) { > - vector_foreach_slot(p->pathvec, path, i) { > - destroy_directio_ctx(path); > - free_io_err_stat_path(path); Note: We should call destroy_directio_ctx() (only) from free_io_err_stat_path(). > - } > - vector_free(p->pathvec); > + pthread_mutex_lock(&io_err_pathvec_lock); > + pthread_cleanup_push(cleanup_unlock, &io_err_pathvec_lock); > + if (!io_err_pathvec) > + goto out; > + vector_foreach_slot(io_err_pathvec, path, i) { > + destroy_directio_ctx(path); > + free_io_err_stat_path(path); > } > - FREE(p); > + vector_free(io_err_pathvec); > + io_err_pathvec = NULL; > +out: > + pthread_cleanup_pop(1); > } > > /* > @@ -258,13 +235,13 @@ static int enqueue_io_err_stat_by_path(struct > path *path) > { > struct io_err_stat_path *p; > > - pthread_mutex_lock(&paths->mutex); > - p = find_err_path_by_dev(paths->pathvec, path->dev); > + pthread_mutex_lock(&io_err_pathvec_lock); > + p = find_err_path_by_dev(io_err_pathvec, path->dev); > if (p) { > - pthread_mutex_unlock(&paths->mutex); > + pthread_mutex_unlock(&io_err_pathvec_lock); > return 0; > } > - pthread_mutex_unlock(&paths->mutex); > + pthread_mutex_unlock(&io_err_pathvec_lock); > > p = alloc_io_err_stat_path(); > if (!p) > @@ -276,18 +253,18 @@ static int enqueue_io_err_stat_by_path(struct > path *path) > > if (setup_directio_ctx(p)) > goto free_ioerr_path; > - pthread_mutex_lock(&paths->mutex); > - if (!vector_alloc_slot(paths->pathvec)) > + pthread_mutex_lock(&io_err_pathvec_lock); > + if (!vector_alloc_slot(io_err_pathvec)) > goto unlock_destroy; > - vector_set_slot(paths->pathvec, p); > - pthread_mutex_unlock(&paths->mutex); > + vector_set_slot(io_err_pathvec, p); > + pthread_mutex_unlock(&io_err_pathvec_lock); > > io_err_stat_log(2, "%s: enqueue path %s to check", > path->mpp->alias, path->dev); Another note: This is not a level 2 log message. IMO the log levels of the io_err_stat code are generally too "low"; the only messages we want to see at 2 would be if a path's "marginal" status changes. Internals of the algorithm should log at level 3 and 4. > return 0; > > unlock_destroy: > - pthread_mutex_unlock(&paths->mutex); > + pthread_mutex_unlock(&io_err_pathvec_lock); > destroy_directio_ctx(p); > free_ioerr_path: > free_io_err_stat_path(p); > @@ -412,9 +389,9 @@ static int delete_io_err_stat_by_addr(struct > io_err_stat_path *p) > { > int i; > > - i = find_slot(paths->pathvec, p); > + i = find_slot(io_err_pathvec, p); > if (i != -1) > - vector_del_slot(paths->pathvec, i); > + vector_del_slot(io_err_pathvec, i); > > destroy_directio_ctx(p); > free_io_err_stat_path(p); > @@ -585,7 +562,7 @@ static void poll_async_io_timeout(void) > > if (clock_gettime(CLOCK_MONOTONIC, &curr_time) != 0) > return; > - vector_foreach_slot(paths->pathvec, pp, i) { > + vector_foreach_slot(io_err_pathvec, pp, i) { > for (j = 0; j < CONCUR_NR_EVENT; j++) { > rc = try_to_cancel_timeout_io(pp- > >dio_ctx_array + j, > &curr_time, pp->devname); > @@ -631,7 +608,7 @@ static void handle_async_io_done_event(struct > io_event *io_evt) > int rc = PATH_UNCHECKED; > int i, j; > > - vector_foreach_slot(paths->pathvec, pp, i) { > + vector_foreach_slot(io_err_pathvec, pp, i) { > for (j = 0; j < CONCUR_NR_EVENT; j++) { > ct = pp->dio_ctx_array + j; > if (&ct->io == io_evt->obj) { > @@ -665,19 +642,14 @@ static void service_paths(void) > struct io_err_stat_path *pp; > int i; > > - pthread_mutex_lock(&paths->mutex); > - vector_foreach_slot(paths->pathvec, pp, i) { > + pthread_mutex_lock(&io_err_pathvec_lock); > + vector_foreach_slot(io_err_pathvec, pp, i) { > send_batch_async_ios(pp); > process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp- > >devname); We should actually use pthread_cleanup_push() here (update: I see you changed this in patch 3/3). We should also call pthread_testcancel() before calling io_getevents(), which is not cancellation point but might block. > poll_async_io_timeout(); > poll_io_err_stat(vecs, pp); > } > - pthread_mutex_unlock(&paths->mutex); > -} > - > -static void cleanup_unlock(void *arg) > -{ > - pthread_mutex_unlock((pthread_mutex_t*) arg); > + pthread_mutex_unlock(&io_err_pathvec_lock); > } > > static void cleanup_exited(__attribute__((unused)) void *arg) > @@ -736,9 +708,14 @@ int start_io_err_stat_thread(void *data) > io_err_stat_log(4, "io_setup failed"); > return 1; > } > - paths = alloc_pathvec(); > - if (!paths) > + > + pthread_mutex_lock(&io_err_pathvec_lock); > + io_err_pathvec = vector_alloc(); > + if (!io_err_pathvec) { > + pthread_mutex_unlock(&io_err_pathvec_lock); > goto destroy_ctx; > + } > + pthread_mutex_unlock(&io_err_pathvec_lock); > > setup_thread_attr(&io_err_stat_attr, 32 * 1024, 0); > pthread_mutex_lock(&io_err_thread_lock); > @@ -763,7 +740,10 @@ int start_io_err_stat_thread(void *data) > return 0; > > out_free: > - free_io_err_pathvec(paths); > + pthread_mutex_lock(&io_err_pathvec_lock); > + vector_free(io_err_pathvec); > + io_err_pathvec = NULL; > + pthread_mutex_unlock(&io_err_pathvec_lock); > destroy_ctx: > io_destroy(ioctx); > io_err_stat_log(0, "failed to start io_error statistic > thread"); > @@ -779,6 +759,6 @@ void stop_io_err_stat_thread(void) > pthread_cancel(io_err_stat_thr); > > pthread_join(io_err_stat_thr, NULL); > - free_io_err_pathvec(paths); > + free_io_err_pathvec(); > io_destroy(ioctx); > } -- Dr. Martin Wilck <mwilck@xxxxxxxx>, Tel. +49 (0)911 74053 2107 SUSE Software Solutions Germany GmbH HRB 36809, AG Nürnberg GF: Felix Imendörffer -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel