When the checker thread enqueues paths for the io_err_stat thread to
check, it calls enqueue_io_err_stat_by_path() with the vecs lock held.
start_io_err_stat_thread() is also called with the vecs lock held.
These two functions both lock io_err_pathvec_lock. When the io_err_stat
thread updates the paths in vecs->pathvec in poll_io_err_stat(), it has
the io_err_pathvec_lock held and then locks the vecs lock. This can
cause an ABBA deadlock.

To solve this, service_paths() no longer updates the paths in
vecs->pathvec with the io_err_pathvec_lock held. It does this by moving
the io_err_stat_path from io_err_pathvec to a local vector when it
needs to update the path. After releasing the io_err_pathvec_lock, it
goes through this temporary vector, updates the paths with the vecs
lock held, and then frees everything.

This change also fixes a bug in service_paths() where elements were
deleted from io_err_pathvec without decrementing the loop index,
causing the loop to skip elements. Also, service_paths() could be
cancelled while holding the io_err_pathvec_lock, so it should have a
cleanup handler.

Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx>
---
 libmultipath/io_err_stat.c | 55 +++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c
index 4c6f7f08..a222594e 100644
--- a/libmultipath/io_err_stat.c
+++ b/libmultipath/io_err_stat.c
@@ -385,20 +385,6 @@ recover:
 	return 0;
 }
 
-static int delete_io_err_stat_by_addr(struct io_err_stat_path *p)
-{
-	int i;
-
-	i = find_slot(io_err_pathvec, p);
-	if (i != -1)
-		vector_del_slot(io_err_pathvec, i);
-
-	destroy_directio_ctx(p);
-	free_io_err_stat_path(p);
-
-	return 0;
-}
-
 static void account_async_io_state(struct io_err_stat_path *pp, int rc)
 {
 	switch (rc) {
@@ -415,17 +401,26 @@ static void account_async_io_state(struct io_err_stat_path *pp, int rc)
 	}
 }
 
-static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
+static int io_err_stat_time_up(struct io_err_stat_path *pp)
 {
 	struct timespec currtime, difftime;
-	struct path *path;
-	double err_rate;
 
 	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
-		return 1;
+		return 0;
 	timespecsub(&currtime, &pp->start_time, &difftime);
 	if (difftime.tv_sec < pp->total_time)
 		return 0;
+	return 1;
+}
+
+static void end_io_err_stat(struct io_err_stat_path *pp)
+{
+	struct timespec currtime;
+	struct path *path;
+	double err_rate;
+
+	if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0)
+		currtime = pp->start_time;
 
 	io_err_stat_log(4, "%s: check end", pp->devname);
 
@@ -464,10 +459,6 @@ static int poll_io_err_stat(struct vectors *vecs, struct io_err_stat_path *pp)
 				pp->devname);
 	}
 	lock_cleanup_pop(vecs->lock);
-
-	delete_io_err_stat_by_addr(pp);
-
-	return 0;
 }
 
 static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev)
@@ -639,17 +630,33 @@ static void process_async_ios_event(int timeout_nsecs, char *dev)
 
 static void service_paths(void)
 {
+	struct _vector _pathvec = {0};
+	/* avoid gcc warnings that &_pathvec will never be NULL in vector ops */
+	vector tmp_pathvec = &_pathvec;
 	struct io_err_stat_path *pp;
 	int i;
 
 	pthread_mutex_lock(&io_err_pathvec_lock);
+	pthread_cleanup_push(cleanup_unlock, &io_err_pathvec_lock);
 	vector_foreach_slot(io_err_pathvec, pp, i) {
 		send_batch_async_ios(pp);
 		process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp->devname);
 		poll_async_io_timeout();
-		poll_io_err_stat(vecs, pp);
+		if (io_err_stat_time_up(pp)) {
+			if (!vector_alloc_slot(tmp_pathvec))
+				continue;
+			vector_del_slot(io_err_pathvec, i--);
+			vector_set_slot(tmp_pathvec, pp);
+		}
 	}
-	pthread_mutex_unlock(&io_err_pathvec_lock);
+	pthread_cleanup_pop(1);
+	vector_foreach_slot_backwards(tmp_pathvec, pp, i) {
+		end_io_err_stat(pp);
+		vector_del_slot(tmp_pathvec, i);
+		destroy_directio_ctx(pp);
+		free_io_err_stat_path(pp);
+	}
+	vector_reset(tmp_pathvec);
 }
 
 static void cleanup_exited(__attribute__((unused)) void *arg)
-- 
2.17.2
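
For readers who want to see the ABBA pattern in isolation, here is a
minimal, self-contained sketch of the two conflicting lock orders the
commit message describes. It uses plain pthread mutexes; the thread and
mutex names only mirror the multipathd ones, and none of this is
multipathd code. Built with "gcc -pthread", both threads block forever
once each has taken its first lock.

/*
 * Illustrative sketch only, not multipathd code: the two lock orders
 * described in the commit message, reproduced with plain pthreads.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t vecs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t io_err_pathvec_lock = PTHREAD_MUTEX_INITIALIZER;

/* checker side: vecs lock first, then io_err_pathvec_lock
 * (like enqueue_io_err_stat_by_path() called with the vecs lock held) */
static void *checker_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&vecs_lock);
	sleep(1);				/* widen the race window */
	pthread_mutex_lock(&io_err_pathvec_lock);	/* blocks forever */
	puts("checker side got both locks");
	pthread_mutex_unlock(&io_err_pathvec_lock);
	pthread_mutex_unlock(&vecs_lock);
	return NULL;
}

/* io_err_stat side: io_err_pathvec_lock first, then the vecs lock
 * (like the old poll_io_err_stat() call chain), i.e. the opposite order */
static void *io_err_stat_side(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&io_err_pathvec_lock);
	sleep(1);
	pthread_mutex_lock(&vecs_lock);			/* blocks forever */
	puts("io_err_stat side got both locks");
	pthread_mutex_unlock(&vecs_lock);
	pthread_mutex_unlock(&io_err_pathvec_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, checker_side, NULL);
	pthread_create(&t2, NULL, io_err_stat_side, NULL);
	pthread_join(t1, NULL);		/* never returns */
	pthread_join(t2, NULL);
	return 0;
}

The patch removes the second ordering: after the change, the
io_err_stat thread drops io_err_pathvec_lock before end_io_err_stat()
takes the vecs lock, so only the vecs-lock-first order remains.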