On 8/3/21 1:24 PM, Jens Axboe wrote:
> On 8/3/21 1:20 PM, Nadav Amit wrote:
>>
>>
>>> On Aug 3, 2021, at 11:14 AM, Jens Axboe <axboe@xxxxxxxxx> wrote:
>>>
>>> On 8/3/21 12:04 PM, Nadav Amit wrote:
>>>>
>>>>
>>>> Thanks for the quick response.
>>>>
>>>> I tried your version. It works better, but my workload still gets
>>>> stuck occasionally (less frequently though). It is pretty obvious
>>>> that the version you sent still has a race, so I didn't put the
>>>> effort into debugging it.
>>>
>>> All good, thanks for testing! Is it a test case you can share? Would
>>> help with confidence in the final solution.
>>
>> Unfortunately no, since it is an entire WIP project that I am working
>> on (with undetermined license at this point). But I will be happy to
>> test any solution that you provide.
>
> OK no worries, I'll see if I can tighten this up. I don't particularly
> hate your solution, it would just be nice to avoid creating a new worker
> if we can just keep running the current one.
>
> I'll toss something your way in a bit...

How about this? I think this largely stems from the fact that we only do
a partial running decrement on exit. I left the previous checks in place
as well, as they reduce the number of times we need to hit that case.

diff --git a/fs/io-wq.c b/fs/io-wq.c
index cf086b01c6c6..f072995d382b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -35,12 +35,17 @@ enum {
 	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
 };
 
+enum {
+	IO_WORKER_EXITING	= 0,	/* worker is exiting */
+};
+
 /*
  * One for each thread in a wqe pool
  */
 struct io_worker {
 	refcount_t ref;
 	unsigned flags;
+	unsigned long state;
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
@@ -130,6 +135,7 @@ struct io_cb_cancel_data {
 };
 
 static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void io_wqe_dec_running(struct io_worker *worker);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -168,26 +174,21 @@ static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-	unsigned flags;
 
 	if (refcount_dec_and_test(&worker->ref))
 		complete(&worker->ref_done);
 	wait_for_completion(&worker->ref_done);
 
-	preempt_disable();
-	current->flags &= ~PF_IO_WORKER;
-	flags = worker->flags;
-	worker->flags = 0;
-	if (flags & IO_WORKER_F_RUNNING)
-		atomic_dec(&acct->nr_running);
-	worker->flags = 0;
-	preempt_enable();
-
 	raw_spin_lock_irq(&wqe->lock);
-	if (flags & IO_WORKER_F_FREE)
+	if (worker->flags & IO_WORKER_F_FREE)
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
 	acct->nr_workers--;
+	preempt_disable();
+	io_wqe_dec_running(worker);
+	worker->flags = 0;
+	current->flags &= ~PF_IO_WORKER;
+	preempt_enable();
 	raw_spin_unlock_irq(&wqe->lock);
 
 	kfree_rcu(worker, rcu);
@@ -214,15 +215,20 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
-	if (is_a_nulls(n))
-		return false;
-
-	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
-	if (io_worker_get(worker)) {
-		wake_up_process(worker->task);
+	/*
+	 * Iterate free_list and see if we can find an idle worker to
+	 * activate. If a given worker is on the free_list but in the process
+	 * of exiting, keep trying.
+	 */
+	hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+		if (!io_worker_get(worker))
+			continue;
+		if (!test_bit(IO_WORKER_EXITING, &worker->state)) {
+			wake_up_process(worker->task);
+			io_worker_release(worker);
+			return true;
+		}
 		io_worker_release(worker);
-		return true;
 	}
 
 	return false;
@@ -560,8 +566,17 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (!(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED)) {
+			/*
+			 * Someone elevated our refs, which could be trying
+			 * to re-activate for work. Loop one more time for
+			 * that case.
+			 */
+			if (refcount_read(&worker->ref) != 1)
+				continue;
+			set_bit(IO_WORKER_EXITING, &worker->state);
 			break;
+		}
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {

-- 
Jens Axboe
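
To make the pattern in the patch concrete outside the kernel, here is a
minimal userspace sketch of the same idea, using pthreads and C11 atomics
as stand-ins for the kernel primitives. All names here (struct worker,
worker_activate, worker_fn, WORKER_EXITING) are invented for illustration
and only loosely correspond to io_worker, io_wqe_activate_free_worker()
and io_wqe_worker(); this is not the io-wq code itself.

/*
 * Userspace analogue of the scheme above; not kernel code. All names
 * are invented for illustration. Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WORKER_EXITING	1UL

struct worker {
	atomic_int ref;		/* plays the role of io_worker.ref */
	atomic_ulong state;	/* plays the role of io_worker.state */
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool have_work;
};

/* Waker side: take a reference, skip workers already marked as exiting. */
static bool worker_activate(struct worker *w)
{
	atomic_fetch_add(&w->ref, 1);		/* ~io_worker_get() */
	if (atomic_load(&w->state) & WORKER_EXITING) {
		atomic_fetch_sub(&w->ref, 1);	/* ~io_worker_release() */
		return false;			/* caller tries the next worker */
	}
	pthread_mutex_lock(&w->lock);
	w->have_work = true;
	pthread_cond_signal(&w->cond);		/* ~wake_up_process() */
	pthread_mutex_unlock(&w->lock);
	atomic_fetch_sub(&w->ref, 1);
	return true;
}

/* Worker side: idle wait with timeout, loosely following io_wqe_worker(). */
static void *worker_fn(void *arg)
{
	struct worker *w = arg;

	for (;;) {
		struct timespec ts;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;			/* idle timeout */

		pthread_mutex_lock(&w->lock);
		while (!w->have_work) {
			if (pthread_cond_timedwait(&w->cond, &w->lock, &ts))
				break;		/* timed out */
		}
		if (w->have_work) {
			w->have_work = false;
			pthread_mutex_unlock(&w->lock);
			printf("worker: processing work\n");
			continue;
		}
		pthread_mutex_unlock(&w->lock);

		/*
		 * Timed out with no work. If the refcount is elevated,
		 * someone may be in worker_activate() right now, so loop
		 * one more time instead of exiting underneath them.
		 * Otherwise mark ourselves exiting so wakers skip us.
		 */
		if (atomic_load(&w->ref) != 1)
			continue;
		atomic_fetch_or(&w->state, WORKER_EXITING);
		break;
	}
	return NULL;
}

int main(void)
{
	struct worker w = {
		.ref = 1,	/* the worker's own base reference */
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	pthread_create(&t, NULL, worker_fn, &w);
	worker_activate(&w);	/* hand the idle worker one work item */
	pthread_join(t, NULL);	/* worker exits after the idle timeout */
	return 0;
}

As in the patch, this narrows the window rather than serializing
everything: a waker that elevated the reference before the idle timeout
fires keeps the worker alive for one more loop, and a worker that has
already marked itself exiting is skipped instead of woken.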