[PATCH RFC 9/9] io_uring: Introduce IORING_OP_EXEC command

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Josh Triplett <josh@xxxxxxxxxxxxxxxx>

This command executes the equivalent of an execveat(2) in a previously
spawned io_uring context, causing the execution to return to a new
program indicated by the SQE.

As an io_uring command, it is special in a few ways, requiring some
quirks.  First, it can only be executed from the spawned context, linked
after the IORING_OP_CLONE command.  In addition, the first successful
IORING_OP_EXEC command will terminate the link chain, causing
further operations to fail with -ECANCELED.

There are a few reasons for the first limitation.  First, it wouldn't
make much sense to execute IORING_OP_EXEC in io-wq, as it would simply
mean "stealing" the worker thread from io_uring.  It would also be
questionable to execute it inline or in task work, as it would terminate
the execution of the ring.  Another technical reason is that we'd
immediately deadlock (fixable), because we'd need to complete the
command and release the reference after returning from the execve, but
the context has already been invalidated by terminating the process.
All in all, considering io_uring's purpose to provide an asynchronous
interface, I'd (Gabriel) like to focus on the simple use-case first,
limiting it to the cloned context for now.

The second limitation is obvious.  We reject further operations on the
link after a successful exec because that is the boundary of the new
program.

There is a very interesting use case that Josh mentioned for this
feature.  One can issue a series of hard-linked IORING_OP_EXEC commands,
one for each $PATH component, to search for the binary and try them in
sequence without returning to userspace.  This is exemplified in
the liburing testcase accompanying the patchset.

Signed-off-by: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
Co-developed-by: Gabriel Krisman Bertazi <krisman@xxxxxxx>
Signed-off-by: Gabriel Krisman Bertazi <krisman@xxxxxxx>
---
 include/uapi/linux/io_uring.h |  2 ++
 io_uring/opdef.c              |  9 ++++++
 io_uring/spawn.c              | 57 ++++++++++++++++++++++++++++++++++-
 io_uring/spawn.h              |  3 ++
 4 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 82d8dae49645..1116ff8b5018 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -73,6 +73,7 @@ struct io_uring_sqe {
 		__u32		futex_flags;
 		__u32		install_fd_flags;
 		__u32		nop_flags;
+		__u32		execve_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -279,6 +280,7 @@ enum io_uring_op {
 	IORING_OP_BIND,
 	IORING_OP_LISTEN,
 	IORING_OP_CLONE,
+	IORING_OP_EXEC,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 1bab2e517e55..8cca077641d5 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -521,6 +521,12 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_clone_prep,
 		.issue			= io_clone,
 	},
+	[IORING_OP_EXEC] = {
+		.audit_skip		= 1,
+		.ignore_creds		= 1,
+		.prep			= io_exec_prep,
+		.issue			= io_exec,
+	},
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -753,6 +759,9 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_CLONE] = {
 		.name			= "CLONE",
 	},
+	[IORING_OP_EXEC] = {
+		.name			= "EXEC",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/spawn.c b/io_uring/spawn.c
index 59d6ccf96f45..d6d649f78906 100644
--- a/io_uring/spawn.c
+++ b/io_uring/spawn.c
@@ -18,6 +18,16 @@ struct io_clone {
 	struct io_kiocb *link;
 };
 
+struct io_exec {
+	struct file *file_unused;
+	const char __user *filename;
+	const char __user *const __user *argv;
+	const char __user *const __user *envp;
+
+	int dfd;
+	u32 flags;
+};
+
 static void fail_link(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
@@ -38,6 +48,7 @@ static int io_uring_spawn_task(void *data)
 	struct io_clone *c = io_kiocb_to_cmd(head, struct io_clone);
 	struct io_ring_ctx *ctx = head->ctx;
 	struct io_kiocb *req, *next;
+	bool return_to_user = false;
 	int err;
 
 	set_task_comm(current, "iou-spawn");
@@ -67,6 +78,15 @@ static int io_uring_spawn_task(void *data)
 				fail_link(next);
 				break;
 			}
+		} else if (req->opcode == IORING_OP_EXEC) {
+			/*
+			 * Don't execute anything after the first
+			 * successful IORING_OP_EXEC.  Cancel the rest
+			 * of the link and allow userspace to return
+			 */
+			fail_link(next);
+			return_to_user = true;
+			break;
 		}
 	}
 
@@ -75,7 +95,9 @@ static int io_uring_spawn_task(void *data)
 
 	mutex_unlock(&ctx->uring_lock);
 
-	force_exit_sig(SIGKILL);
+	/* If there wasn't a successful exec, terminate the thread. */
+	if (!return_to_user)
+		force_exit_sig(SIGKILL);
 	return 0;
 }
 
@@ -138,3 +160,36 @@ int io_clone(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 
 }
+
+int io_exec_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_exec *e = io_kiocb_to_cmd(req, typeof(*e));
+
+	if (unlikely(sqe->buf_index || sqe->len || sqe->file_index))
+		return -EINVAL;
+
+	e->dfd = READ_ONCE(sqe->fd);
+	e->filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	e->argv = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	e->envp = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+	e->flags = READ_ONCE(sqe->execve_flags);
+	return 0;
+}
+
+int io_exec(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_exec *e = io_kiocb_to_cmd(req, typeof(*e));
+	int ret;
+
+	if (!(issue_flags & IO_URING_F_SPAWN))
+		return -EINVAL;
+
+	ret = do_execveat(e->dfd, getname(e->filename),
+			  e->argv, e->envp, e->flags);
+	if (ret < 0) {
+		req_set_fail(req);
+		io_req_set_res(req, ret, 0);
+	}
+	return IOU_OK;
+
+}
diff --git a/io_uring/spawn.h b/io_uring/spawn.h
index 9b7ddb776d1e..93d9f0ae378c 100644
--- a/io_uring/spawn.h
+++ b/io_uring/spawn.h
@@ -8,3 +8,6 @@
 
 int io_clone_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_clone(struct io_kiocb *req, unsigned int issue_flags);
+
+int io_exec_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_exec(struct io_kiocb *req, unsigned int issue_flags);
-- 
2.47.0





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux