Hi,

> -----Original Message-----
> From: Uday Shankar <ushankar@xxxxxxxxxxxxxxx>
> Sent: Friday, February 7, 2025 6:39 AM
> To: Jens Axboe <axboe@xxxxxxxxx>; Pavel Begunkov <asml.silence@xxxxxxxxx>
> Cc: io-uring@xxxxxxxxxxxxxxx; Uday Shankar <ushankar@xxxxxxxxxxxxxxx>
> Subject: [PATCH] io-wq: backoff when retrying worker creation
>
> When io_uring submission goes async for the first time on a given task,
> we'll try to create a worker thread to handle the submission. Creating
> this worker thread can fail due to various transient conditions, such
> as an outstanding signal in the forking thread, so we have retry logic
> with a limit of 3 retries. However, this retry logic appears to be too
> aggressive/fast - we've observed a thread blowing through the retry
> limit while having the same outstanding signal the whole time. Here's
> an excerpt of some tracing that demonstrates the issue:
>
> First, signal 26 is generated for the process. It ends up getting
> routed to thread 92942.
>
>  0)  cbd-92284      /* signal_generate: sig=26 errno=0 code=-2 comm=psblkdASD pid=92934 grp=1 res=0 */
>
> This causes create_io_thread in the signalled thread to fail with
> ERESTARTNOINTR, and thus a retry is queued.
>
> 13)  task_th-92942  /* io_uring_queue_async_work: ring 000000007325c9ae, request 0000000080c96d8e, user_data 0x0, opcode URING_CMD, flags 0x8240001, normal queue, work 000000006e96dd3f */
> 13)  task_th-92942  io_wq_enqueue() {
> 13)  task_th-92942    _raw_spin_lock();
> 13)  task_th-92942    io_wq_activate_free_worker();
> 13)  task_th-92942    _raw_spin_lock();
> 13)  task_th-92942    create_io_worker() {
> 13)  task_th-92942      __kmalloc_cache_noprof();
> 13)  task_th-92942      __init_swait_queue_head();
> 13)  task_th-92942      kprobe_ftrace_handler() {
> 13)  task_th-92942        get_kprobe();
> 13)  task_th-92942        aggr_pre_handler() {
> 13)  task_th-92942          pre_handler_kretprobe();
> 13)  task_th-92942          /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */
> 13)  task_th-92942        } /* aggr_pre_handler */
> ...
> 13)  task_th-92942      } /* copy_process */
> 13)  task_th-92942    } /* create_io_thread */
> 13)  task_th-92942    kretprobe_rethook_handler() {
> 13)  task_th-92942      /* create_exit: (create_io_worker+0x8a/0x1a0 <- create_io_thread) arg1=0xfffffffffffffdff */
> 13)  task_th-92942    } /* kretprobe_rethook_handler */
> 13)  task_th-92942    queue_work_on() {
> ...
>
> The CPU is then handed to a kworker to process the queued retry:
>
> ------------------------------------------
> 13)  task_th-92942  =>  kworker-54154
> ------------------------------------------
> 13)  kworker-54154  io_workqueue_create() {
> 13)  kworker-54154    io_queue_worker_create() {
> 13)  kworker-54154      task_work_add() {
> 13)  kworker-54154        wake_up_state() {
> 13)  kworker-54154          try_to_wake_up() {
> 13)  kworker-54154            _raw_spin_lock_irqsave();
> 13)  kworker-54154            _raw_spin_unlock_irqrestore();
> 13)  kworker-54154          } /* try_to_wake_up */
> 13)  kworker-54154        } /* wake_up_state */
> 13)  kworker-54154        kick_process();
> 13)  kworker-54154      } /* task_work_add */
> 13)  kworker-54154    } /* io_queue_worker_create */
> 13)  kworker-54154  } /* io_workqueue_create */
>
> And then we immediately switch back to the original task to try
> creating a worker again. This fails, because the original task still
> hasn't handled its signal.
>
> ------------------------------------------
> 13)  kworker-54154  =>  task_th-92942
> ------------------------------------------
> 13)  task_th-92942  create_worker_cont() {
> 13)  task_th-92942    kprobe_ftrace_handler() {
> 13)  task_th-92942      get_kprobe();
> 13)  task_th-92942      aggr_pre_handler() {
> 13)  task_th-92942        pre_handler_kretprobe();
> 13)  task_th-92942        /* create_enter: (create_io_thread+0x0/0x50) fn=0xffffffff8172c0e0 arg=0xffff888996bb69c0 node=-1 */
> 13)  task_th-92942      } /* aggr_pre_handler */
> 13)  task_th-92942    } /* kprobe_ftrace_handler */
> 13)  task_th-92942    create_io_thread() {
> 13)  task_th-92942      copy_process() {
> 13)  task_th-92942        task_active_pid_ns();
> 13)  task_th-92942        _raw_spin_lock_irq();
> 13)  task_th-92942        recalc_sigpending();
> 13)  task_th-92942        _raw_spin_lock_irq();
> 13)  task_th-92942      } /* copy_process */
> 13)  task_th-92942    } /* create_io_thread */
> 13)  task_th-92942    kretprobe_rethook_handler() {
> 13)  task_th-92942      /* create_exit: (create_worker_cont+0x35/0x1b0 <- create_io_thread) arg1=0xfffffffffffffdff */
> 13)  task_th-92942    } /* kretprobe_rethook_handler */
> 13)  task_th-92942    io_worker_release();
> 13)  task_th-92942    queue_work_on() {
> 13)  task_th-92942      clear_pending_if_disabled();
> 13)  task_th-92942      __queue_work() {
> 13)  task_th-92942      } /* __queue_work */
> 13)  task_th-92942    } /* queue_work_on */
> 13)  task_th-92942  } /* create_worker_cont */
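
(A note for readers following the trace: arg1=0xfffffffffffffdff in the
create_exit events above is -513, i.e. -ERESTARTNOINTR. As far as I can
tell, this comes from the pending-signal check near the end of
copy_process(); the sketch below is paraphrased from my reading of
kernel/fork.c, with surrounding code elided, and the exact label name
may differ across kernel versions:

	/*
	 * A signal that became pending while forking aborts the fork;
	 * the caller is expected to handle the signal and then retry.
	 * This is why create_io_thread() keeps failing for as long as
	 * signal 26 stays outstanding.
	 */
	recalc_sigpending();
	spin_unlock_irq(&current->sighand->siglock);
	retval = -ERESTARTNOINTR;
	if (task_sigpending(current))
		goto bad_fork_cancel_cgroup;

This also matches the recalc_sigpending() call visible inside
copy_process() in the trace above.)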
>
> The pattern repeats another couple times until we blow through the
> retry counter, at which point we give up. All outstanding work is
> canceled, and the io_uring command which triggered all this is failed
> with ECANCELED:
>
> 13)  task_th-92942  io_acct_cancel_pending_work() {
> ...
> 13)  task_th-92942  /* io_uring_complete: ring 000000007325c9ae, req 0000000080c96d8e, user_data 0x0, result -125, cflags 0x0 extra1 0 extra2 0 */
>
> Finally, the task gets around to processing its outstanding signal 26,
> but it's too late.
>
> 13)  task_th-92942  /* signal_deliver: sig=26 errno=0 code=-2 sa_handler=59566a0 sa_flags=14000000 */
>
> Try to address this issue by adding a small scaling delay when retrying
> worker creation. This should give the forking thread time to handle its
> signal in the above case. This isn't a particularly satisfying
> solution, as sufficiently paradoxical scheduling would still have us
> hitting the same issue, and I'm open to suggestions for something
> better. But this is likely to prevent this (already rare) issue from
> hitting in practice.
>
> Signed-off-by: Uday Shankar <ushankar@xxxxxxxxxxxxxxx>
> ---
>  io_uring/io-wq.c | 23 ++++++++++++++++++-----
>  1 file changed, 18 insertions(+), 5 deletions(-)
>
> diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
> index f7d328feb7225d809601707e423c86a85ebb1c3c..173c77b70060bbbb2cd6009614c079095fab3e3c 100644
> --- a/io_uring/io-wq.c
> +++ b/io_uring/io-wq.c
> @@ -63,7 +63,7 @@ struct io_worker {
>
>  	union {
>  		struct rcu_head rcu;
> -		struct work_struct work;
> +		struct delayed_work work;
>  	};
>  };
>
> @@ -784,6 +784,18 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
>  	}
>  }
>
> +static void queue_create_worker_retry(struct io_worker *worker)
> +{
> +	/*
> +	 * We only bother retrying because there's a chance that the
> +	 * failure to create a worker is due to some temporary condition
> +	 * in the forking task (e.g. outstanding signal); give the task
> +	 * some time to clear that condition.
> +	 */
> +	schedule_delayed_work(
> +		&worker->work, msecs_to_jiffies(worker->init_retries * 5));
> +}

Why choose (worker->init_retries * 5) as the delay time here? Is it the
estimated time to process the signal?
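
If I am following io_should_retry_thread() correctly, init_retries is
incremented once per failed creation attempt, so with the 3-retry limit
mentioned in the commit message the resulting schedule would be roughly
the following (my own sketch of the arithmetic, not part of the patch):

	/*
	 * attempt 1 fails -> retry after  5 ms (init_retries == 1)
	 * attempt 2 fails -> retry after 10 ms (init_retries == 2)
	 * attempt 3 fails -> retry after 15 ms (init_retries == 3)
	 *
	 * i.e. roughly 30 ms of total grace before the pending work
	 * is cancelled with -ECANCELED.
	 */

Is ~30 ms expected to be enough headroom under heavy scheduling load?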
> + */ > + schedule_delayed_work( > + &worker->work, msecs_to_jiffies(worker->init_retries * 5)); } Why choose (worker->init_retries * 5) as the delay time here? Is it the estimated time to process the signal? > + > static void create_worker_cont(struct callback_head *cb) { > struct io_worker *worker; > @@ -823,12 +835,13 @@ static void create_worker_cont(struct callback_head > *cb) > > /* re-create attempts grab a new worker ref, drop the existing one */ > io_worker_release(worker); > - schedule_work(&worker->work); > + queue_create_worker_retry(worker); > } > > static void io_workqueue_create(struct work_struct *work) { > - struct io_worker *worker = container_of(work, struct io_worker, work); > + struct io_worker *worker = container_of( > + work, struct io_worker, work.work); > struct io_wq_acct *acct = io_wq_get_acct(worker); > > if (!io_queue_worker_create(worker, acct, create_worker_cont)) @@ -866,8 > +879,8 @@ static bool create_io_worker(struct io_wq *wq, struct io_wq_acct > *acct) > kfree(worker); > goto fail; > } else { > - INIT_WORK(&worker->work, io_workqueue_create); > - schedule_work(&worker->work); > + INIT_DELAYED_WORK(&worker->work, io_workqueue_create); > + queue_create_worker_retry(worker); > } > > return true; > > --- > base-commit: ec4ef55172d4539abff470568a4369a6e1c317b8 > change-id: 20250206-wq_retry-1cb7f5c1935d > > Best regards, > -- > Uday Shankar <ushankar@xxxxxxxxxxxxxxx> > --- Li Zetao