Patch "io-wq: remove GFP_ATOMIC allocation off schedule out path" has been added to the 5.14-stable tree

This is a note to let you know that I've just added the patch titled

    io-wq: remove GFP_ATOMIC allocation off schedule out path

to the 5.14-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     io-wq-remove-gfp_atomic-allocation-off-schedule-out-.patch
and it can be found in the queue-5.14 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 502f2173fc850e92cb19743e6b215030158b9f87
Author: Jens Axboe <axboe@xxxxxxxxx>
Date:   Wed Aug 4 08:37:25 2021 -0600

    io-wq: remove GFP_ATOMIC allocation off schedule out path
    
    [ Upstream commit d3e9f732c415cf22faa33d6f195e291ad82dc92e ]
    
    Daniel reports that the v5.14-rc4-rt4 kernel throws a BUG when running
    stress-ng:
    
    | [   90.202543] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:35
    | [   90.202549] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 2047, name: iou-wrk-2041
    | [   90.202555] CPU: 5 PID: 2047 Comm: iou-wrk-2041 Tainted: G        W         5.14.0-rc4-rt4+ #89
    | [   90.202559] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
    | [   90.202561] Call Trace:
    | [   90.202577]  dump_stack_lvl+0x34/0x44
    | [   90.202584]  ___might_sleep.cold+0x87/0x94
    | [   90.202588]  rt_spin_lock+0x19/0x70
    | [   90.202593]  ___slab_alloc+0xcb/0x7d0
    | [   90.202598]  ? newidle_balance.constprop.0+0xf5/0x3b0
    | [   90.202603]  ? dequeue_entity+0xc3/0x290
    | [   90.202605]  ? io_wqe_dec_running.isra.0+0x98/0xe0
    | [   90.202610]  ? pick_next_task_fair+0xb9/0x330
    | [   90.202612]  ? __schedule+0x670/0x1410
    | [   90.202615]  ? io_wqe_dec_running.isra.0+0x98/0xe0
    | [   90.202618]  kmem_cache_alloc_trace+0x79/0x1f0
    | [   90.202621]  io_wqe_dec_running.isra.0+0x98/0xe0
    | [   90.202625]  io_wq_worker_sleeping+0x37/0x50
    | [   90.202628]  schedule+0x30/0xd0
    | [   90.202630]  schedule_timeout+0x8f/0x1a0
    | [   90.202634]  ? __bpf_trace_tick_stop+0x10/0x10
    | [   90.202637]  io_wqe_worker+0xfd/0x320
    | [   90.202641]  ? finish_task_switch.isra.0+0xd3/0x290
    | [   90.202644]  ? io_worker_handle_work+0x670/0x670
    | [   90.202646]  ? io_worker_handle_work+0x670/0x670
    | [   90.202649]  ret_from_fork+0x22/0x30
    
    which is due to the RT kernel not liking a GFP_ATOMIC allocation inside
    a raw spinlock. Besides that not working on RT, doing any kind of
    allocation from inside schedule() is kind of nasty and should be avoided
    if at all possible.
    
    This particular path happens when an io-wq worker goes to sleep, and we
    need a new worker to handle pending work. We currently allocate a small
    data item to hold the information we need to create a new worker, but we
    can instead include this data in the io_worker struct itself and just
    protect it with a single bit lock. We only really need one per worker
    anyway, as we will have run pending work between two sleep cycles.
    
    https://lore.kernel.org/lkml/20210804082418.fbibprcwtzyt5qax@xxxxxxxxxxxxx/
    Reported-by: Daniel Wagner <dwagner@xxxxxxx>
    Tested-by: Daniel Wagner <dwagner@xxxxxxx>
    Acked-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
    Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

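As background for the diff below, here is a minimal user-space sketch (not the kernel code) of the ownership scheme the patch adopts: each worker embeds a single creation-request slot and guards it with one atomic bit, claimed with an acquire test-and-set before the callback is queued and released by the callback with a release clear. All names in the sketch (struct worker, queue_create_request, create_cb) are illustrative and do not appear in io-wq; C11 atomic_flag stands in for the kernel's test_and_set_bit_lock()/clear_bit_unlock() pair, and the callback is invoked directly rather than through task_work_add().

/*
 * Illustrative analogue only -- not io-wq code.  Instead of kmalloc()ing
 * a small request on every schedule-out, the worker owns one embedded
 * slot, and a single atomic bit decides who may use it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct worker {
	atomic_flag create_claimed;	/* plays the role of create_state bit 0 */
	int create_index;		/* data that used to live in the kmalloc'd item */
};

/* Callback that would run later (via task_work in the kernel). */
static void create_cb(struct worker *w)
{
	printf("creating worker for index %d\n", w->create_index);
	/* Release ownership so the worker may queue another request. */
	atomic_flag_clear_explicit(&w->create_claimed, memory_order_release);
}

/* Called from the "worker is going to sleep" path.  No allocation. */
static bool queue_create_request(struct worker *w, int index)
{
	/* Claim the embedded slot; acquire pairs with the release in create_cb(). */
	if (atomic_flag_test_and_set_explicit(&w->create_claimed,
					      memory_order_acquire))
		return false;	/* already claimed: one request in flight per worker */

	w->create_index = index;
	/* The kernel would now hand the embedded callback to task_work_add(). */
	return true;
}

int main(void)
{
	struct worker w = { .create_claimed = ATOMIC_FLAG_INIT };

	printf("first attempt:  %s\n", queue_create_request(&w, 0) ? "queued" : "busy");
	printf("second attempt: %s\n", queue_create_request(&w, 0) ? "queued" : "busy");
	create_cb(&w);		/* callback runs and releases the slot */
	printf("third attempt:  %s\n", queue_create_request(&w, 0) ? "queued" : "busy");
	return 0;
}

One difference from the sketch: the kernel path below first does a plain test_bit() before test_and_set_bit_lock(), so the common already-claimed case is detected without a locked write to the cache line.
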
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 7d2ed8c7dd31..4ce83bb48021 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -51,6 +51,10 @@ struct io_worker {
 
 	struct completion ref_done;
 
+	unsigned long create_state;
+	struct callback_head create_work;
+	int create_index;
+
 	struct rcu_head rcu;
 };
 
@@ -272,24 +276,18 @@ static void io_wqe_inc_running(struct io_worker *worker)
 	atomic_inc(&acct->nr_running);
 }
 
-struct create_worker_data {
-	struct callback_head work;
-	struct io_wqe *wqe;
-	int index;
-};
-
 static void create_worker_cb(struct callback_head *cb)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 	struct io_wq *wq;
 	struct io_wqe *wqe;
 	struct io_wqe_acct *acct;
 	bool do_create = false, first = false;
 
-	cwd = container_of(cb, struct create_worker_data, work);
-	wqe = cwd->wqe;
+	worker = container_of(cb, struct io_worker, create_work);
+	wqe = worker->wqe;
 	wq = wqe->wq;
-	acct = &wqe->acct[cwd->index];
+	acct = &wqe->acct[worker->create_index];
 	raw_spin_lock_irq(&wqe->lock);
 	if (acct->nr_workers < acct->max_workers) {
 		if (!acct->nr_workers)
@@ -299,33 +297,42 @@ static void create_worker_cb(struct callback_head *cb)
 	}
 	raw_spin_unlock_irq(&wqe->lock);
 	if (do_create) {
-		create_io_worker(wq, wqe, cwd->index, first);
+		create_io_worker(wq, wqe, worker->create_index, first);
 	} else {
 		atomic_dec(&acct->nr_running);
 		io_worker_ref_put(wq);
 	}
-	kfree(cwd);
+	clear_bit_unlock(0, &worker->create_state);
+	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
+				   struct io_wqe_acct *acct)
 {
-	struct create_worker_data *cwd;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
 		goto fail;
+	if (!io_worker_get(worker))
+		goto fail;
+	/*
+	 * create_state manages ownership of create_work/index. We should
+	 * only need one entry per worker, as the worker going to sleep
+	 * will trigger the condition, and waking will clear it once it
+	 * runs the task_work.
+	 */
+	if (test_bit(0, &worker->create_state) ||
+	    test_and_set_bit_lock(0, &worker->create_state))
+		goto fail_release;
 
-	cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC);
-	if (cwd) {
-		init_task_work(&cwd->work, create_worker_cb);
-		cwd->wqe = wqe;
-		cwd->index = acct->index;
-		if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL))
-			return;
-
-		kfree(cwd);
-	}
+	init_task_work(&worker->create_work, create_worker_cb);
+	worker->create_index = acct->index;
+	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
+		return;
+	clear_bit_unlock(0, &worker->create_state);
+fail_release:
+	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
@@ -343,7 +350,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, acct);
+		io_queue_worker_create(wqe, worker, acct);
 	}
 }
 
@@ -1004,12 +1011,12 @@ err_wq:
 
 static bool io_task_work_match(struct callback_head *cb, void *data)
 {
-	struct create_worker_data *cwd;
+	struct io_worker *worker;
 
 	if (cb->func != create_worker_cb)
 		return false;
-	cwd = container_of(cb, struct create_worker_data, work);
-	return cwd->wqe->wq == data;
+	worker = container_of(cb, struct io_worker, create_work);
+	return worker->wqe->wq == data;
 }
 
 void io_wq_exit_start(struct io_wq *wq)
@@ -1026,12 +1033,13 @@ static void io_wq_exit_workers(struct io_wq *wq)
 		return;
 
 	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
-		struct create_worker_data *cwd;
+		struct io_worker *worker;
 
-		cwd = container_of(cb, struct create_worker_data, work);
-		atomic_dec(&cwd->wqe->acct[cwd->index].nr_running);
+		worker = container_of(cb, struct io_worker, create_work);
+		atomic_dec(&worker->wqe->acct[worker->create_index].nr_running);
 		io_worker_ref_put(wq);
-		kfree(cwd);
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
 	}
 
 	rcu_read_lock();


