Re: Race between io_wqe_worker() and io_wqe_wake_worker()

On 8/3/21 1:24 PM, Jens Axboe wrote:
> On 8/3/21 1:20 PM, Nadav Amit wrote:
>>
>>
>>> On Aug 3, 2021, at 11:14 AM, Jens Axboe <axboe@xxxxxxxxx> wrote:
>>>
>>> On 8/3/21 12:04 PM, Nadav Amit wrote:
>>>>
>>>>
>>>> Thanks for the quick response.
>>>>
>>>> I tried your version. It works better, but my workload still gets stuck
>>>> occasionally (less frequently though). It is pretty obvious that the
>>>> version you sent still has a race, so I didn’t put the effort into
>>>> debugging it.
>>>
>>> All good, thanks for testing! Is it a test case you can share? Would
>>> help with confidence in the final solution.
>>
>> Unfortunately no, since it is an entire WIP project that I am working
>> on (with undetermined license at this point). But I will be happy to
>> test any solution that you provide.
> 
> OK, no worries, I'll see if I can tighten this up. I don't particularly
> hate your solution; it would just be nice to avoid creating a new worker
> if we can just keep running the current one.
> 
> I'll toss something your way in a bit...

How about this? I think this largely stems from the fact that we only
do a partial running decrement on exit. I left the previous checks in
place as well, as they reduce the number of times that we need to hit
that case.
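
For illustration, here's a rough sketch of the interleaving I believe
we're racing on (simplified, details approximate):

	io_wqe_worker()				io_wqe_wake_worker()
	---------------				--------------------
	times out while on free_list
						finds the worker on free_list,
						calls wake_up_process() and
						skips creating a new worker
	breaks out of its loop anyway;
	io_worker_exit() does a bare
	atomic_dec(&acct->nr_running)
	without checking for queued work
						the queued work never runs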


diff --git a/fs/io-wq.c b/fs/io-wq.c
index cf086b01c6c6..f072995d382b 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -35,12 +35,17 @@ enum {
 	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
 };
 
+enum {
+	IO_WORKER_EXITING	= 0,	/* worker is exiting */
+};
+
 /*
  * One for each thread in a wqe pool
  */
 struct io_worker {
 	refcount_t ref;
 	unsigned flags;
+	unsigned long state;
 	struct hlist_nulls_node nulls_node;
 	struct list_head all_list;
 	struct task_struct *task;
@@ -130,6 +135,7 @@ struct io_cb_cancel_data {
 };
 
 static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
+static void io_wqe_dec_running(struct io_worker *worker);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -168,26 +174,21 @@ static void io_worker_exit(struct io_worker *worker)
 {
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
-	unsigned flags;
 
 	if (refcount_dec_and_test(&worker->ref))
 		complete(&worker->ref_done);
 	wait_for_completion(&worker->ref_done);
 
-	preempt_disable();
-	current->flags &= ~PF_IO_WORKER;
-	flags = worker->flags;
-	worker->flags = 0;
-	if (flags & IO_WORKER_F_RUNNING)
-		atomic_dec(&acct->nr_running);
-	worker->flags = 0;
-	preempt_enable();
-
 	raw_spin_lock_irq(&wqe->lock);
-	if (flags & IO_WORKER_F_FREE)
+	if (worker->flags & IO_WORKER_F_FREE)
 		hlist_nulls_del_rcu(&worker->nulls_node);
 	list_del_rcu(&worker->all_list);
 	acct->nr_workers--;
+	preempt_disable();
+	io_wqe_dec_running(worker);
+	worker->flags = 0;
+	current->flags &= ~PF_IO_WORKER;
+	preempt_enable();
 	raw_spin_unlock_irq(&wqe->lock);
 
 	kfree_rcu(worker, rcu);
@@ -214,15 +215,20 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	struct hlist_nulls_node *n;
 	struct io_worker *worker;
 
-	n = rcu_dereference(hlist_nulls_first_rcu(&wqe->free_list));
-	if (is_a_nulls(n))
-		return false;
-
-	worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
-	if (io_worker_get(worker)) {
-		wake_up_process(worker->task);
+	/*
+	 * Iterate free_list and see if we can find an idle worker to
+	 * activate. If a given worker is on the free_list but in the process
+	 * of exiting, keep trying.
+	 */
+	hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+		if (!io_worker_get(worker))
+			continue;
+		if (!test_bit(IO_WORKER_EXITING, &worker->state)) {
+			wake_up_process(worker->task);
+			io_worker_release(worker);
+			return true;
+		}
 		io_worker_release(worker);
-		return true;
 	}
 
 	return false;
@@ -560,8 +566,17 @@ static int io_wqe_worker(void *data)
 		if (ret)
 			continue;
 		/* timed out, exit unless we're the fixed worker */
-		if (!(worker->flags & IO_WORKER_F_FIXED))
+		if (!(worker->flags & IO_WORKER_F_FIXED)) {
+			/*
+			 * Someone elevated our refs, which could be trying
+			 * to re-activate for work. Loop one more time for
+			 * that case.
+			 */
+			if (refcount_read(&worker->ref) != 1)
+				continue;
+			set_bit(IO_WORKER_EXITING, &worker->state);
 			break;
+		}
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {

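If it helps to reason about the exit handshake above, here's a rough
userspace analogue (hypothetical names, C11 atomics standing in for
refcount_t and the bitops; a sketch of the idea, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* hypothetical stand-in for struct io_worker */
struct worker {
	atomic_int ref;		/* analogue of worker->ref */
	atomic_bool exiting;	/* analogue of IO_WORKER_EXITING */
};

/*
 * Waker side, mirroring io_wqe_activate_free_worker(): take a
 * reference first, then only wake a worker that has not committed
 * to exiting; otherwise keep scanning for the next candidate.
 */
static bool try_wake(struct worker *w)
{
	atomic_fetch_add(&w->ref, 1);		/* io_worker_get() */
	if (!atomic_load(&w->exiting)) {
		/* wake_up_process(worker->task) would go here */
		atomic_fetch_sub(&w->ref, 1);	/* io_worker_release() */
		return true;
	}
	atomic_fetch_sub(&w->ref, 1);
	return false;				/* try the next worker */
}

/*
 * Worker side, mirroring the timeout path in io_wqe_worker(): if a
 * waker holds a reference, loop once more instead of exiting;
 * otherwise mark ourselves exiting so wakers skip us from now on.
 */
static bool should_exit(struct worker *w)
{
	if (atomic_load(&w->ref) != 1)		/* someone elevated our refs */
		return false;			/* loop one more time */
	atomic_store(&w->exiting, true);	/* set_bit(IO_WORKER_EXITING) */
	return true;
}

int main(void)
{
	struct worker w = { .ref = 1, .exiting = false };

	bool woken = try_wake(&w);		/* no exit in flight: wake works */
	bool exited = should_exit(&w);		/* no refs held: commit to exit */
	bool woken_again = try_wake(&w);	/* exiting set: waker skips us */

	return (woken && exited && !woken_again) ? 0 : 1;
}

The key point is the ordering: the waker takes a reference before
testing the exiting bit, and the worker only sets that bit after
seeing no elevated references.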
-- 
Jens Axboe