Re: [PATCH for-next 01/12] io_uring: infrastructure for retargeting rsrc nodes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/31/22 13:41, Dylan Yudaken wrote:
rsrc node cleanup can be indefinitely delayed when there are long lived
requests. For example if a file is located in the same rsrc node as a long
lived socket with multishot poll, then even if unregistering the file it
will not be closed while the poll request is still active.

Introduce a timer when rsrc node is switched, so that periodically we can
retarget these long lived requests to the newest nodes. That will allow
the old nodes to be cleaned up, freeing resources.

Signed-off-by: Dylan Yudaken <dylany@xxxxxxxx>
---
  include/linux/io_uring_types.h |  2 +
  io_uring/io_uring.c            |  1 +
  io_uring/opdef.h               |  1 +
  io_uring/rsrc.c                | 92 ++++++++++++++++++++++++++++++++++
  io_uring/rsrc.h                |  1 +
  5 files changed, 97 insertions(+)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f5b687a787a3..1d4eff4e632c 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,8 @@ struct io_ring_ctx {
  	struct llist_head		rsrc_put_llist;
  	struct list_head		rsrc_ref_list;
  	spinlock_t			rsrc_ref_lock;
+	struct delayed_work		rsrc_retarget_work;
+	bool				rsrc_retarget_scheduled;
struct list_head io_buffers_pages; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6cc16e39b27f..ea2260359c56 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -320,6 +320,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
  	spin_lock_init(&ctx->rsrc_ref_lock);
  	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
  	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
+	INIT_DELAYED_WORK(&ctx->rsrc_retarget_work, io_rsrc_retarget_work);
  	init_llist_head(&ctx->rsrc_put_llist);
  	init_llist_head(&ctx->work_llist);
  	INIT_LIST_HEAD(&ctx->tctx_list);
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index 3efe06d25473..1b72b14cb5ab 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -37,6 +37,7 @@ struct io_op_def {
  	int (*prep_async)(struct io_kiocb *);
  	void (*cleanup)(struct io_kiocb *);
  	void (*fail)(struct io_kiocb *);
+	bool (*can_retarget_rsrc)(struct io_kiocb *);

side note: need to be split at some moment into 2 tables depending
on hotness, we want better caching for ->issue and ->prep

  };
extern const struct io_op_def io_op_defs[];
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 55d4ab96fb92..106210e0d5d5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -15,6 +15,7 @@
  #include "io_uring.h"
  #include "openclose.h"
  #include "rsrc.h"
+#include "opdef.h"
struct io_rsrc_update {
  	struct file			*file;
@@ -204,6 +205,95 @@ void io_rsrc_put_work(struct work_struct *work)
  	}
  }
+
+static unsigned int io_rsrc_retarget_req(struct io_ring_ctx *ctx,
+					 struct io_kiocb *req)
+	__must_hold(&ctx->uring_lock)
+{
+	if (!req->rsrc_node ||
+	     req->rsrc_node == ctx->rsrc_node)
+		return 0;
+	if (!io_op_defs[req->opcode].can_retarget_rsrc)
+		return 0;
+	if (!(*io_op_defs[req->opcode].can_retarget_rsrc)(req))
+		return 0;

nit, there should be no need to deref fptr.

if (!io_op_defs[req->opcode].can_retarget_rsrc(req)) ...

+
+	io_rsrc_put_node(req->rsrc_node, 1);
+	req->rsrc_node = ctx->rsrc_node;
+	return 1;
+}
+
+static unsigned int io_rsrc_retarget_table(struct io_ring_ctx *ctx,
+				   struct io_hash_table *table)
+{
+	unsigned int nr_buckets = 1U << table->hash_bits;
+	unsigned int refs = 0;
+	struct io_kiocb *req;
+	int i;
+
+	for (i = 0; i < nr_buckets; i++) {
+		struct io_hash_bucket *hb = &table->hbs[i];
+
+		spin_lock(&hb->lock);
+		hlist_for_each_entry(req, &hb->list, hash_node)
+			refs += io_rsrc_retarget_req(ctx, req);
+		spin_unlock(&hb->lock);
+	}
+	return refs;
+}
+
+static void io_rsrc_retarget_schedule(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	percpu_ref_get(&ctx->refs);
+	mod_delayed_work(system_wq, &ctx->rsrc_retarget_work, 60 * HZ);
+	ctx->rsrc_retarget_scheduled = true;
+}
+
+static void __io_rsrc_retarget_work(struct io_ring_ctx *ctx)
+	__must_hold(&ctx->uring_lock)
+{
+	struct io_rsrc_node *node;
+	unsigned int refs;
+	bool any_waiting;
+
+	if (!ctx->rsrc_node)
+		return;
+
+	spin_lock_irq(&ctx->rsrc_ref_lock);
+	any_waiting = false;
+	list_for_each_entry(node, &ctx->rsrc_ref_list, node) {
+		if (!node->done) {
+			any_waiting = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&ctx->rsrc_ref_lock);
+
+	if (!any_waiting)
+		return;
+
+	refs = io_rsrc_retarget_table(ctx, &ctx->cancel_table);
+	refs += io_rsrc_retarget_table(ctx, &ctx->cancel_table_locked);
+
+	ctx->rsrc_cached_refs -= refs;
+	while (unlikely(ctx->rsrc_cached_refs < 0))
+		io_rsrc_refs_refill(ctx);

We can charge ->rsrc_cached_refs after setting up nodes in prep / submission
without underflowing the actual refs because we know that the requests are
not yet submitted and so won't consume refs. This one looks more troublesome


+}
+
+void io_rsrc_retarget_work(struct work_struct *work)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = container_of(work, struct io_ring_ctx, rsrc_retarget_work.work);
+
+	mutex_lock(&ctx->uring_lock);
+	ctx->rsrc_retarget_scheduled = false;
+	__io_rsrc_retarget_work(ctx);
+	mutex_unlock(&ctx->uring_lock);
+	percpu_ref_put(&ctx->refs);
+}
+
  void io_wait_rsrc_data(struct io_rsrc_data *data)
  {
  	if (data && !atomic_dec_and_test(&data->refs))
@@ -285,6 +375,8 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx,
  		atomic_inc(&data_to_kill->refs);
  		percpu_ref_kill(&rsrc_node->refs);
  		ctx->rsrc_node = NULL;
+		if (!ctx->rsrc_retarget_scheduled)
+			io_rsrc_retarget_schedule(ctx);
  	}
if (!ctx->rsrc_node) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 81445a477622..2b94df8fd9e8 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -54,6 +54,7 @@ struct io_mapped_ubuf {
  };
void io_rsrc_put_work(struct work_struct *work);
+void io_rsrc_retarget_work(struct work_struct *work);
  void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
  void io_wait_rsrc_data(struct io_rsrc_data *data);
  void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);

--
Pavel Begunkov



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux