On Tue, 2021-01-12 at 17:52 -0600, Benjamin Marzinski wrote: > When the checker thread enqueues paths for the io_err_stat thread to > check, it calls enqueue_io_err_stat_by_path() with the vecs lock > held. > start_io_err_stat_thread() is also called with the vecs lock held. > These two functions both lock io_err_pathvec_lock. When the > io_err_stat > thread updates the paths in vecs->pathvec in poll_io_err_stat(), it > has > the io_err_pathvec_lock held, and then locks the vecs lock. This can > cause an ABBA deadlock. > > To solve this, service_paths() no longer updates the paths in > vecs->pathvec with the io_err_pathvec_lock held. It does this by > moving > the io_err_stat_path from io_err_pathvec to a local vector when it > needs > to update the path. After releasing the io_err_pathvec_lock, it goes > through this temporary vector, updates the paths with the vecs lock > held, and then frees everything. > > This change fixes a bug in service_paths() where elements were being > deleted from io_err_pathvec, without the index being decremented, > causing the loop to skip elements. Also, service_paths() could be > cancelled while holding the io_err_pathvec_lock, so it should have a > cleanup handler. > > Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx> Looks good. Only two nits below. > --- > libmultipath/io_err_stat.c | 55 +++++++++++++++++++++--------------- > -- > 1 file changed, 31 insertions(+), 24 deletions(-) > > diff --git a/libmultipath/io_err_stat.c b/libmultipath/io_err_stat.c > index 4c6f7f08..a222594e 100644 > --- a/libmultipath/io_err_stat.c > +++ b/libmultipath/io_err_stat.c > @@ -385,20 +385,6 @@ recover: > return 0; > } > > -static int delete_io_err_stat_by_addr(struct io_err_stat_path *p) > -{ > - int i; > - > - i = find_slot(io_err_pathvec, p); > - if (i != -1) > - vector_del_slot(io_err_pathvec, i); > - > - destroy_directio_ctx(p); > - free_io_err_stat_path(p); > - > - return 0; > -} > - > static void account_async_io_state(struct io_err_stat_path *pp, int > rc) > { > switch (rc) { > @@ -415,17 +401,26 @@ static void account_async_io_state(struct > io_err_stat_path *pp, int rc) > } > } > > -static int poll_io_err_stat(struct vectors *vecs, struct > io_err_stat_path *pp) > +static int io_err_stat_time_up(struct io_err_stat_path *pp) > { > struct timespec currtime, difftime; > - struct path *path; > - double err_rate; > > if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0) > - return 1; > + return 0; This can't fail. Please change it to get_monotonic_time(). > timespecsub(&currtime, &pp->start_time, &difftime); > if (difftime.tv_sec < pp->total_time) > return 0; > + return 1; > +} > + > +static void end_io_err_stat(struct io_err_stat_path *pp) > +{ > + struct timespec currtime; > + struct path *path; > + double err_rate; > + > + if (clock_gettime(CLOCK_MONOTONIC, &currtime) != 0) > + currtime = pp->start_time; See above. > > io_err_stat_log(4, "%s: check end", pp->devname); > > @@ -464,10 +459,6 @@ static int poll_io_err_stat(struct vectors > *vecs, struct io_err_stat_path *pp) > pp->devname); > } > lock_cleanup_pop(vecs->lock); > - > - delete_io_err_stat_by_addr(pp); > - > - return 0; > } > > static int send_each_async_io(struct dio_ctx *ct, int fd, char *dev) > @@ -639,17 +630,33 @@ static void process_async_ios_event(int > timeout_nsecs, char *dev) > > static void service_paths(void) > { > + struct _vector _pathvec = {0}; > + /* avoid gcc warnings that &_pathvec will never be NULL in > vector ops */ > + vector tmp_pathvec = &_pathvec; > struct io_err_stat_path *pp; > int i; > > pthread_mutex_lock(&io_err_pathvec_lock); > + pthread_cleanup_push(cleanup_unlock, &io_err_pathvec_lock); > vector_foreach_slot(io_err_pathvec, pp, i) { > send_batch_async_ios(pp); > process_async_ios_event(TIMEOUT_NO_IO_NSEC, pp- > >devname); > poll_async_io_timeout(); > - poll_io_err_stat(vecs, pp); > + if (io_err_stat_time_up(pp)) { > + if (!vector_alloc_slot(tmp_pathvec)) > + continue; > + vector_del_slot(io_err_pathvec, i--); > + vector_set_slot(tmp_pathvec, pp); > + } > } > - pthread_mutex_unlock(&io_err_pathvec_lock); > + pthread_cleanup_pop(1); > + vector_foreach_slot_backwards(tmp_pathvec, pp, i) { > + end_io_err_stat(pp); > + vector_del_slot(tmp_pathvec, i); > + destroy_directio_ctx(pp); > + free_io_err_stat_path(pp); > + } > + vector_reset(tmp_pathvec); > } > > static void cleanup_exited(__attribute__((unused)) void *arg) -- Dr. Martin Wilck <mwilck@xxxxxxxx>, Tel. +49 (0)911 74053 2107 SUSE Software Solutions Germany GmbH HRB 36809, AG Nürnberg GF: Felix Imendörffer -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel