Re: [PATCH 09/18] io-wq: fork worker threads from original task

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 3/4/21 12:46 PM, Linus Torvalds wrote:
> On Thu, Mar 4, 2021 at 11:19 AM Jens Axboe <axboe@xxxxxxxxx> wrote:
>>
>> Took a quick look at this, and I agree that's _much_ better. In fact, it
>> boils down to just calling copy_process() and then having the caller do
>> wake_up_new_task(). So not sure if it's worth adding an
>> create_io_thread() helper, or just make copy_process() available
>> instead. This is ignoring the trace point for now...
> 
> I really don't want to expose copy_process() outside of kernel/fork.c.
> 
> The whole three-phase "copy - setup - activate" model is a really
> really good thing, and it's how we've done things internally almost
> forever, but I really don't want to expose those middle stages to any
> outsiders.
> 
> So I'd really prefer a simple new "create_io_worker()", even if it's
> literally just some four-line function that does
> 
>    p = copy_process(..);
>    if (!IS_ERR(p)) {
>       block all signals in p
>       set PF_IO_WORKER flag
>       wake_up_new_task(p);
>    }
>    return p;
> 
> I very much want that to be inside kernel/fork.c and have all these
> rules about creating new threads localized there.

I agree, here are the two current patches. Just need to add the signal
blocking, which I'd love to do in create_io_thread(), but seems to
require either an allocation or provide a helper to do it in the thread
itself (with an on-stack mask).

Removes code, even with comment added.

 fs/io-wq.c                 | 68 ++++++++++++++++---------------------------------------------
 fs/io-wq.h                 |  2 --
 fs/io_uring.c              | 29 ++++++++++++++------------
 include/linux/sched/task.h |  2 ++
 kernel/fork.c              | 24 ++++++++++++++++++++++
 5 files changed, 59 insertions(+), 66 deletions(-)


-- 
Jens Axboe

>From 396142d9878cc1a02149616c7032b3e647205341 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@xxxxxxxxx>
Date: Thu, 4 Mar 2021 12:21:05 -0700
Subject: [PATCH 1/2] kernel: provide create_io_thread() helper

Provide a generic helper for setting up an io_uring worker. Returns a
task_struct so that the caller can do whatever setup is needed, then call
wake_up_new_task() to kick it into gear.

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
 include/linux/sched/task.h |  2 ++
 kernel/fork.c              | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index c0f71f2e7160..ef02be869cf2 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -31,6 +31,7 @@ struct kernel_clone_args {
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
 	int cgroup;
+	int io_thread;
 	struct cgroup *cgrp;
 	struct css_set *cset;
 };
@@ -82,6 +83,7 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index d66cd1014211..549acc6324f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1940,6 +1940,8 @@ static __latent_entropy struct task_struct *copy_process(
 	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
+	if (args->io_thread)
+		p->flags |= PF_IO_WORKER;
 
 	/*
 	 * This _must_ happen before we call free_task(), i.e. before we jump
@@ -2410,6 +2412,28 @@ struct mm_struct *copy_init_mm(void)
 	return dup_mm(NULL, &init_mm);
 }
 
+/*
+ * This is like kernel_clone(), but shaved down and tailored to just
+ * creating io_uring workers. It returns a created task, or an error pointer.
+ * The returned task is inactive, and the caller must fire it up through
+ * wake_up_new_task(p).
+ */
+struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
+{
+	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
+				CLONE_IO|SIGCHLD;
+	struct kernel_clone_args args = {
+		.flags		= ((lower_32_bits(flags) | CLONE_VM |
+				    CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+		.io_thread	= 1,
+	};
+
+	return copy_process(NULL, 0, node, &args);
+}
+
 /*
  *  Ok, this is the main fork-routine.
  *
-- 
2.30.1

>From 9dee8128025806e74c7fd3915294649dc0b11f5f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@xxxxxxxxx>
Date: Thu, 4 Mar 2021 12:39:36 -0700
Subject: [PATCH 2/2] io_uring: move to using create_io_thread()

This allows us to do task creation and setup without needing to use
completions to try and synchronize with the starting thread.

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
 fs/io-wq.c    | 68 +++++++++++++--------------------------------------
 fs/io-wq.h    |  2 --
 fs/io_uring.c | 29 ++++++++++++----------
 3 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 19f18389ead2..693239ed4de5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -54,7 +54,6 @@ struct io_worker {
 	spinlock_t lock;
 
 	struct completion ref_done;
-	struct completion started;
 
 	struct rcu_head rcu;
 };
@@ -116,7 +115,6 @@ struct io_wq {
 	struct io_wq_hash *hash;
 
 	refcount_t refs;
-	struct completion started;
 	struct completion exited;
 
 	atomic_t worker_refs;
@@ -273,14 +271,6 @@ static void io_wqe_dec_running(struct io_worker *worker)
 		io_wqe_wake_worker(wqe, acct);
 }
 
-static void io_worker_start(struct io_worker *worker)
-{
-	current->flags |= PF_NOFREEZE;
-	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
-	io_wqe_inc_running(worker);
-	complete(&worker->started);
-}
-
 /*
  * Worker will start processing some work. Move it to the busy list, if
  * it's currently on the freelist
@@ -490,8 +480,6 @@ static int io_wqe_worker(void *data)
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 
-	io_worker_start(worker);
-
 	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 loop:
@@ -576,12 +564,6 @@ static int task_thread(void *data, int index)
 	sprintf(buf, "iou-wrk-%d", wq->task_pid);
 	set_task_comm(current, buf);
 
-	current->pf_io_worker = worker;
-	worker->task = current;
-
-	set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node));
-	current->flags |= PF_NO_SETAFFINITY;
-
 	raw_spin_lock_irq(&wqe->lock);
 	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
 	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
@@ -607,25 +589,10 @@ static int task_thread_unbound(void *data)
 	return task_thread(data, IO_WQ_ACCT_UNBOUND);
 }
 
-pid_t io_wq_fork_thread(int (*fn)(void *), void *arg)
-{
-	unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
-				CLONE_IO|SIGCHLD;
-	struct kernel_clone_args args = {
-		.flags		= ((lower_32_bits(flags) | CLONE_VM |
-				    CLONE_UNTRACED) & ~CSIGNAL),
-		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
-		.stack		= (unsigned long)fn,
-		.stack_size	= (unsigned long)arg,
-	};
-
-	return kernel_clone(&args);
-}
-
 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 {
 	struct io_worker *worker;
-	pid_t pid;
+	struct task_struct *tsk;
 
 	__set_current_state(TASK_RUNNING);
 
@@ -638,21 +605,26 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 	worker->wqe = wqe;
 	spin_lock_init(&worker->lock);
 	init_completion(&worker->ref_done);
-	init_completion(&worker->started);
 
 	atomic_inc(&wq->worker_refs);
 
 	if (index == IO_WQ_ACCT_BOUND)
-		pid = io_wq_fork_thread(task_thread_bound, worker);
+		tsk = create_io_thread(task_thread_bound, worker, wqe->node);
 	else
-		pid = io_wq_fork_thread(task_thread_unbound, worker);
-	if (pid < 0) {
+		tsk = create_io_thread(task_thread_unbound, worker, wqe->node);
+	if (IS_ERR(tsk)) {
 		if (atomic_dec_and_test(&wq->worker_refs))
 			complete(&wq->worker_done);
 		kfree(worker);
 		return false;
 	}
-	wait_for_completion(&worker->started);
+	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
+	io_wqe_inc_running(worker);
+	tsk->pf_io_worker = worker;
+	worker->task = tsk;
+	set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node));
+	tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY;
+	wake_up_new_task(tsk);
 	return true;
 }
 
@@ -752,10 +724,6 @@ static int io_wq_manager(void *data)
 
 	sprintf(buf, "iou-mgr-%d", wq->task_pid);
 	set_task_comm(current, buf);
-	current->flags |= PF_IO_WORKER;
-	wq->manager = get_task_struct(current);
-
-	complete(&wq->started);
 
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -815,21 +783,20 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
 
 static int io_wq_fork_manager(struct io_wq *wq)
 {
-	int ret;
+	struct task_struct *tsk;
 
 	if (wq->manager)
 		return 0;
 
 	reinit_completion(&wq->worker_done);
-	current->flags |= PF_IO_WORKER;
-	ret = io_wq_fork_thread(io_wq_manager, wq);
-	current->flags &= ~PF_IO_WORKER;
-	if (ret >= 0) {
-		wait_for_completion(&wq->started);
+	tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE);
+	if (!IS_ERR(tsk)) {
+		wq->manager = get_task_struct(tsk);
+		wake_up_new_task(tsk);
 		return 0;
 	}
 
-	return ret;
+	return PTR_ERR(tsk);
 }
 
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
@@ -1062,7 +1029,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	}
 
 	wq->task_pid = current->pid;
-	init_completion(&wq->started);
 	init_completion(&wq->exited);
 	refcount_set(&wq->refs, 1);
 
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 42f0be64a84d..5fbf7997149e 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -119,8 +119,6 @@ void io_wq_put_and_exit(struct io_wq *wq);
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
 void io_wq_hash_work(struct io_wq_work *work, void *val);
 
-pid_t io_wq_fork_thread(int (*fn)(void *), void *arg);
-
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
 	return work->flags & IO_WQ_WORK_HASHED;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e55369555e5c..04f04ac3c4cf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6677,8 +6677,6 @@ static int io_sq_thread(void *data)
 		set_cpus_allowed_ptr(current, cpu_online_mask);
 	current->flags |= PF_NO_SETAFFINITY;
 
-	complete(&sqd->completion);
-
 	wait_for_completion(&sqd->startup);
 
 	while (!io_sq_thread_should_stop(sqd)) {
@@ -7818,21 +7816,24 @@ void __io_uring_free(struct task_struct *tsk)
 
 static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx)
 {
+	struct task_struct *tsk;
 	int ret;
 
 	clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
 	reinit_completion(&sqd->completion);
 	ctx->sqo_exec = 0;
 	sqd->task_pid = current->pid;
-	current->flags |= PF_IO_WORKER;
-	ret = io_wq_fork_thread(io_sq_thread, sqd);
-	current->flags &= ~PF_IO_WORKER;
-	if (ret < 0) {
+	tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+	if (IS_ERR(tsk)) {
 		sqd->thread = NULL;
 		return ret;
 	}
-	wait_for_completion(&sqd->completion);
-	return io_uring_alloc_task_context(sqd->thread, ctx);
+	sqd->thread = tsk;
+	ret = io_uring_alloc_task_context(tsk, ctx);
+	if (ret)
+		set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+	wake_up_new_task(tsk);
+	return ret;
 }
 
 static int io_sq_offload_create(struct io_ring_ctx *ctx,
@@ -7855,6 +7856,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		fdput(f);
 	}
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		struct task_struct *tsk;
 		struct io_sq_data *sqd;
 
 		ret = -EPERM;
@@ -7896,15 +7898,16 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 		}
 
 		sqd->task_pid = current->pid;
-		current->flags |= PF_IO_WORKER;
-		ret = io_wq_fork_thread(io_sq_thread, sqd);
-		current->flags &= ~PF_IO_WORKER;
-		if (ret < 0) {
+		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+		if (IS_ERR(tsk)) {
 			sqd->thread = NULL;
 			goto err;
 		}
-		wait_for_completion(&sqd->completion);
 		ret = io_uring_alloc_task_context(sqd->thread, ctx);
+		if (ret)
+			set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+		sqd->thread = tsk;
+		wake_up_new_task(tsk);
 		if (ret)
 			goto err;
 	} else if (p->flags & IORING_SETUP_SQ_AFF) {
-- 
2.30.1


[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux