Re: [PATCH v3 1/2] io_uring: avoid whole io_wq_work copy for requests completed inline

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



hi,

On 28/05/2020 12:15, Xiaoguang Wang wrote:
If requests can be submitted and completed inline, we don't need to
initialize whole io_wq_work in io_init_req(), which is an expensive
operation, add a new 'REQ_F_WORK_INITIALIZED' to control whether
io_wq_work is initialized.

It looks nicer. Especially if you'd add a helper as Jens suggested.
Sure, I'll add a helper in V4, thanks.


The other thing, even though I hate treating a part of the fields differently
from others, I don't like ->creds tossing either.

Did you consider trying using only ->work.creds without adding req->creds? like
in the untested incremental below. init_io_work() there is misleading, should be
somehow played around better.
But if we do not add a new req->creds, I think there will be some potential risks.
In current io_uring mainline codes, look at io_kiocb's memory layout
crash> struct -o io_kiocb
struct io_kiocb {
        union {
    [0]     struct file *file;
    [0]     struct io_rw rw;
    [0]     struct io_poll_iocb poll;
    [0]     struct io_accept accept;
    [0]     struct io_sync sync;
    [0]     struct io_cancel cancel;
    [0]     struct io_timeout timeout;
    [0]     struct io_connect connect;
    [0]     struct io_sr_msg sr_msg;
    [0]     struct io_open open;
    [0]     struct io_close close;
    [0]     struct io_files_update files_update;
    [0]     struct io_fadvise fadvise;
    [0]     struct io_madvise madvise;
    [0]     struct io_epoll epoll;
    [0]     struct io_splice splice;
    [0]     struct io_provide_buf pbuf;
        };
   [64] struct io_async_ctx *io;
   [72] int cflags;
   [76] u8 opcode;
   [78] u16 buf_index;
   [80] struct io_ring_ctx *ctx;
   [88] struct list_head list;
  [104] unsigned int flags;
  [108] refcount_t refs;
  [112] struct task_struct *task;
  [120] unsigned long fsize;
  [128] u64 user_data;
  [136] u32 result;
  [140] u32 sequence;
  [144] struct list_head link_list;
  [160] struct list_head inflight_entry;
  [176] struct percpu_ref *fixed_file_refs;
        union {
            struct {
  [184]         struct callback_head task_work;
  [200]         struct hlist_node hash_node;
  [216]         struct async_poll *apoll;
            };
  [184]     struct io_wq_work work;
        };
}
SIZE: 240

struct io_wq_work {
   [0] struct io_wq_work_node list;
   [8] void (*func)(struct io_wq_work **);
  [16] struct files_struct *files;
  [24] struct mm_struct *mm;
  [32] const struct cred *creds;
  [40] struct fs_struct *fs;
  [48] unsigned int flags;
  [52] pid_t task_pid;
}
SIZE: 56

The risk mainly comes from the union:
union {
	/*
	 * Only commands that never go async can use the below fields,
	 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
	 * async armed poll handlers for regular commands. The latter
	 * restore the work, if needed.
	 */
	struct {
		struct callback_head	task_work;
		struct hlist_node	hash_node;
		struct async_poll	*apoll;
	};
	struct io_wq_work	work;
};

1, apoll and creds are at the same memory offset; for the 'async armed poll handlers' case,
apoll will be used, which means creds will be overwritten. In the patch "io_uring: avoid
unnecessary io_wq_work copy for fast poll feature", I use REQ_F_WORK_INITIALIZED
to control whether to do the io_wq_work restore, so your code below will break:

static inline void io_req_work_drop_env(struct io_kiocb *req)
{
	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
	if (req->work.creds) {
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here req->work.creds will be invalid; otherwise I still need to use some space
to record the original req->work.creds and restore the creds afterwards.

		put_cred(req->work.creds);
		req->work.creds = NULL;
	}
	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;

2, For the IORING_OP_POLL_ADD case, the current mainline code uses task_work and hash_node,
32 bytes, which means io_wq_work's members list, func, files and mm would be overwritten,
but creds is not touched, so it is safe for now. But if we later add a new member to
struct {
	struct callback_head	task_work;
	struct hlist_node	hash_node;
	struct async_poll	*apoll;
};
say struct callback_head gains a new member, then our check will break again.

3. IMO, io_wq_work is just meant to describe the running environment needed by reqs that will
be punted to io-wq; from a software design view, reqs that are submitted and completed inline
should not touch this struct at all. The current io_kiocb is 240 bytes, and adding a new
pointer makes it 248 bytes — still 4 cache lines with a 64-byte cache line size.


Regards,
Xiaoguang Wang


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4dd3295d74f6..4086561ce444 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -643,7 +643,6 @@ struct io_kiocb {
  	unsigned int		flags;
  	refcount_t		refs;
  	struct task_struct	*task;
-	const struct cred	*creds;
  	unsigned long		fsize;
  	u64			user_data;
  	u32			result;
@@ -894,8 +893,16 @@ static const struct file_operations io_uring_fops;
  static inline void init_io_work(struct io_kiocb *req,
  			void (*func)(struct io_wq_work **))
  {
-	req->work = (struct io_wq_work){ .func = func };
-	req->flags |= REQ_F_WORK_INITIALIZED;
+	struct io_wq_work *work = &req->work;
+
+	/* work->creds are already initialised by a user */
+	work->list.next = NULL;
+	work->func = func;
+	work->files = NULL;
+	work->mm = NULL;
+	work->fs = NULL;
+	work->flags = REQ_F_WORK_INITIALIZED;
+	work->task_pid = 0;
  }
  struct sock *io_uring_get_socket(struct file *file)
  {
@@ -1019,15 +1026,9 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,
  		mmgrab(current->mm);
  		req->work.mm = current->mm;
  	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();

-	if (!req->work.creds) {
-		if (!req->creds)
-			req->work.creds = get_current_cred();
-		else {
-			req->work.creds = req->creds;
-			req->creds = NULL;
-		}
-	}
  	if (!req->work.fs && def->needs_fs) {
  		spin_lock(&current->fs->lock);
  		if (!current->fs->in_exec) {
@@ -1044,6 +1045,12 @@ static inline void io_req_work_grab_env(struct io_kiocb *req,

  static inline void io_req_work_drop_env(struct io_kiocb *req)
  {
+	/* always init'ed, put before REQ_F_WORK_INITIALIZED check */
+	if (req->work.creds) {
+		put_cred(req->work.creds);
+		req->work.creds = NULL;
+	}
+
  	if (!(req->flags & REQ_F_WORK_INITIALIZED))
  		return;

@@ -1051,10 +1058,6 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
  		mmdrop(req->work.mm);
  		req->work.mm = NULL;
  	}
-	if (req->work.creds) {
-		put_cred(req->work.creds);
-		req->work.creds = NULL;
-	}
  	if (req->work.fs) {
  		struct fs_struct *fs = req->work.fs;

@@ -5901,12 +5904,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct
io_kiocb *req,

  	id = READ_ONCE(sqe->personality);
  	if (id) {
-		req->creds = idr_find(&ctx->personality_idr, id);
-		if (unlikely(!req->creds))
+		req->work.creds = idr_find(&ctx->personality_idr, id);
+		if (unlikely(!req->work.creds))
  			return -EINVAL;
-		get_cred(req->creds);
+		get_cred(req->work.creds);
  	} else
-		req->creds = NULL;
+		req->work.creds = NULL;

  	/* same numerical values with corresponding REQ_F_*, safe to copy */
  	req->flags |= sqe_flags;




[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux