On 2021-05-21 17:49, Paul Moore wrote: > WARNING - This is a work in progress and should not be merged > anywhere important. It is almost surely not complete, and while it > probably compiles it likely hasn't been booted and will do terrible > things. You have been warned. > > This patch adds basic auditing to io_uring operations, regardless of > their context. This is accomplished by allocating audit_context > structures for the io-wq worker and io_uring SQPOLL kernel threads > as well as explicitly auditing the io_uring operations in > io_issue_sqe(). The io_uring operations are audited using a new > AUDIT_URINGOP record, an example is shown below: > > % <TODO - insert AUDIT_URINGOP record example> > > Thanks to Richard Guy Briggs for review and feedback. > > Signed-off-by: Paul Moore <paul@xxxxxxxxxxxxxx> > --- > fs/io-wq.c | 4 + > fs/io_uring.c | 11 +++ > include/linux/audit.h | 17 ++++ > include/uapi/linux/audit.h | 1 > kernel/audit.h | 2 + > kernel/auditsc.c | 173 ++++++++++++++++++++++++++++++++++++++++++++ > 6 files changed, 208 insertions(+) > > diff --git a/fs/io-wq.c b/fs/io-wq.c > index 5361a9b4b47b..8af09a3336e0 100644 > --- a/fs/io-wq.c > +++ b/fs/io-wq.c > @@ -16,6 +16,7 @@ > #include <linux/rculist_nulls.h> > #include <linux/cpu.h> > #include <linux/tracehook.h> > +#include <linux/audit.h> > > #include "io-wq.h" > > @@ -535,6 +536,8 @@ static int io_wqe_worker(void *data) > snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); > set_task_comm(current, buf); > > + audit_alloc_kernel(current); > + > while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { > long ret; > > @@ -573,6 +576,7 @@ static int io_wqe_worker(void *data) > raw_spin_unlock_irq(&wqe->lock); > } > > + audit_free(current); > io_worker_exit(worker); > return 0; > } > diff --git a/fs/io_uring.c b/fs/io_uring.c > index e481ac8a757a..e9941d1ad8fd 100644 > --- a/fs/io_uring.c > +++ b/fs/io_uring.c > @@ -78,6 +78,7 @@ > #include <linux/task_work.h> > #include <linux/pagemap.h> > #include 
<linux/io_uring.h> > +#include <linux/audit.h> > > #define CREATE_TRACE_POINTS > #include <trace/events/io_uring.h> > @@ -6105,6 +6106,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) > if (req->work.creds && req->work.creds != current_cred()) > creds = override_creds(req->work.creds); > > + if (req->opcode < IORING_OP_LAST) > + audit_uring_entry(req->opcode); > + > switch (req->opcode) { > case IORING_OP_NOP: > ret = io_nop(req, issue_flags); > @@ -6211,6 +6215,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) > break; > } > > + if (req->opcode < IORING_OP_LAST) > + audit_uring_exit(!ret, ret); > + > if (creds) > revert_creds(creds); > > @@ -6827,6 +6834,8 @@ static int io_sq_thread(void *data) > set_cpus_allowed_ptr(current, cpu_online_mask); > current->flags |= PF_NO_SETAFFINITY; > > + audit_alloc_kernel(current); > + > mutex_lock(&sqd->lock); > /* a user may had exited before the thread started */ > io_run_task_work_head(&sqd->park_task_work); > @@ -6916,6 +6925,8 @@ static int io_sq_thread(void *data) > io_run_task_work_head(&sqd->park_task_work); > mutex_unlock(&sqd->lock); > > + audit_free(current); > + > complete(&sqd->exited); > do_exit(0); > } > diff --git a/include/linux/audit.h b/include/linux/audit.h > index 82b7c1116a85..6a0c013bc7de 100644 > --- a/include/linux/audit.h > +++ b/include/linux/audit.h > @@ -286,7 +286,10 @@ static inline int audit_signal_info(int sig, struct task_struct *t) > /* These are defined in auditsc.c */ > /* Public API */ > extern int audit_alloc(struct task_struct *task); > +extern int audit_alloc_kernel(struct task_struct *task); > extern void __audit_free(struct task_struct *task); > +extern void __audit_uring_entry(u8 op); > +extern void __audit_uring_exit(int success, long code); > extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, > unsigned long a2, unsigned long a3); > extern void __audit_syscall_exit(int ret_success, long ret_value); > 
@@ -323,6 +326,16 @@ static inline void audit_free(struct task_struct *task) > if (unlikely(task->audit_context)) > __audit_free(task); > } > +static inline void audit_uring_entry(u8 op) > +{ > + if (unlikely(audit_context())) > + __audit_uring_entry(op); > +} > +static inline void audit_uring_exit(int success, long code) > +{ > + if (unlikely(audit_context())) > + __audit_uring_exit(success, code); > +} > static inline void audit_syscall_entry(int major, unsigned long a0, > unsigned long a1, unsigned long a2, > unsigned long a3) > @@ -554,6 +567,10 @@ static inline int audit_alloc(struct task_struct *task) > { > return 0; > } > +static inline int audit_alloc_kernel(struct task_struct *task) > +{ > + return 0; > +} > static inline void audit_free(struct task_struct *task) > { } > static inline void audit_syscall_entry(int major, unsigned long a0, > diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h > index cd2d8279a5e4..b26e0c435e8b 100644 > --- a/include/uapi/linux/audit.h > +++ b/include/uapi/linux/audit.h > @@ -118,6 +118,7 @@ > #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ > #define AUDIT_BPF 1334 /* BPF subsystem */ > #define AUDIT_EVENT_LISTENER 1335 /* Task joined multicast read socket */ > +#define AUDIT_URINGOP 1336 /* io_uring operation */ > > #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ > #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ > diff --git a/kernel/audit.h b/kernel/audit.h > index fba180de5912..50de827497ca 100644 > --- a/kernel/audit.h > +++ b/kernel/audit.h > @@ -100,10 +100,12 @@ struct audit_context { > enum { > AUDIT_CTX_UNUSED, /* audit_context is currently unused */ > AUDIT_CTX_SYSCALL, /* in use by syscall */ > + AUDIT_CTX_URING, /* in use by io_uring */ > } context; > enum audit_state state, current_state; > unsigned int serial; /* serial number for record */ > int major; /* syscall number */ > + int uring_op; /* uring operation */ > struct timespec64 ctime; /* time of syscall 
entry */ > unsigned long argv[4]; /* syscall arguments */ > long return_code;/* syscall return code */ > diff --git a/kernel/auditsc.c b/kernel/auditsc.c > index cc89e9f9a753..729849d41631 100644 > --- a/kernel/auditsc.c > +++ b/kernel/auditsc.c > @@ -953,6 +953,7 @@ static void audit_reset_context(struct audit_context *ctx) > ctx->current_state = ctx->state; > ctx->serial = 0; > ctx->major = 0; > + ctx->uring_op = 0; > ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 }; > memset(ctx->argv, 0, sizeof(ctx->argv)); > ctx->return_code = 0; > @@ -1038,6 +1039,31 @@ int audit_alloc(struct task_struct *tsk) > return 0; > } > > +/** > + * audit_alloc_kernel - allocate an audit_context for a kernel task > + * @tsk: the kernel task > + * > + * Similar to the audit_alloc() function, but intended for kernel private > + * threads. Returns zero on success, negative values on failure. > + */ > +int audit_alloc_kernel(struct task_struct *tsk) > +{ > + /* > + * At the moment we are just going to call into audit_alloc() to > + * simplify the code, but there two things to keep in mind with this > + * approach: > + * > + * 1. Filtering internal kernel tasks is a bit laughable in almost all > + * cases, but there is at least one case where there is a benefit: > + * the '-a task,never' case allows the admin to effectively disable > + * task auditing at runtime. > + * > + * 2. The {set,clear}_task_syscall_work() ops likely have zero effect > + * on these internal kernel tasks, but they probably don't hurt either. 
> + */ > + return audit_alloc(tsk); > +} > + > static inline void audit_free_context(struct audit_context *context) > { > /* resetting is extra work, but it is likely just noise */ > @@ -1536,6 +1562,52 @@ static void audit_log_proctitle(void) > audit_log_end(ab); > } > > +/** > + * audit_log_uring - generate a AUDIT_URINGOP record > + * @ctx: the audit context > + */ > +static void audit_log_uring(struct audit_context *ctx) > +{ > + struct audit_buffer *ab; > + const struct cred *cred; > + > + /* > + * TODO: What do we log here? I'm tossing in a few things to start the > + * conversation, but additional thought needs to go into this. > + */ > + > + ab = audit_log_start(ctx, GFP_KERNEL, AUDIT_URINGOP); > + if (!ab) > + return; > + cred = current_cred(); This may need to be req->work.creds. I haven't been following whether the io_uring thread inherits the user task's creds (and, below, comm and exe). > + audit_log_format(ab, "uring_op=%d", ctx->uring_op); arch is stored below in __audit_uring_entry() and never used in the AUDIT_CTX_URING case. Either that assignment can be dropped, or arch can be printed before uring_op, similar to the SYSCALL record. There aren't really any arg[0-3] to print. io_uring_register and io_uring_setup() args are better covered by other records. io_uring_enter() has 6 args and the last two aren't covered by SYSCALL anyway. > + if (ctx->return_valid != AUDITSC_INVALID) > + audit_log_format(ab, " success=%s exit=%ld", > + (ctx->return_valid == AUDITSC_SUCCESS ? 
> + "yes" : "no"), > + ctx->return_code); > + audit_log_format(ab, > + " items=%d" > + " ppid=%d pid=%d auid=%u uid=%u gid=%u" > + " euid=%u suid=%u fsuid=%u" > + " egid=%u sgid=%u fsgid=%u", > + ctx->name_count, > + task_ppid_nr(current), > + task_tgid_nr(current), > + from_kuid(&init_user_ns, audit_get_loginuid(current)), > + from_kuid(&init_user_ns, cred->uid), > + from_kgid(&init_user_ns, cred->gid), > + from_kuid(&init_user_ns, cred->euid), > + from_kuid(&init_user_ns, cred->suid), > + from_kuid(&init_user_ns, cred->fsuid), > + from_kgid(&init_user_ns, cred->egid), > + from_kgid(&init_user_ns, cred->sgid), > + from_kgid(&init_user_ns, cred->fsgid)); The audit session ID is still important and relevant, and it qualifies auid. In keeping with the SYSCALL record format, I think we want to keep ses=audit_get_sessionid(current) in here. I'm pretty sure we also want to keep comm= and exe=, but we may have to reach into req->task to get them. There are two possible values for comm: one from the original task, and a second, "iou-sqp-<pid>", set at the top of io_sq_thread(). I'm reluctant to leave them out now and then have to re-add them in yet another field order later. > + audit_log_task_context(ab); > + audit_log_key(ab, ctx->filterkey); > + audit_log_end(ab); > +} > + > static void audit_log_exit(void) > { > int i, call_panic = 0; > @@ -1571,6 +1643,9 @@ static void audit_log_exit(void) > audit_log_key(ab, context->filterkey); > audit_log_end(ab); > break; > + case AUDIT_CTX_URING: > + audit_log_uring(context); > + break; > default: > BUG(); > break; > @@ -1740,6 +1815,104 @@ static void audit_return_fixup(struct audit_context *ctx, > ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE); > } > > +/** > + * __audit_uring_entry - prepare the kernel task's audit context for io_uring > + * @op: the io_uring opcode > + * > + * This is similar to audit_syscall_entry() but is intended for use by io_uring > + * operations. 
> + */ > +void __audit_uring_entry(u8 op) > +{ > + struct audit_context *ctx = audit_context(); > + > + if (!audit_enabled || !ctx || ctx->state == AUDIT_DISABLED) > + return; > + > + /* > + * NOTE: It's possible that we can be called from the process' context > + * before it returns to userspace, and before audit_syscall_exit() > + * is called. In this case there is not much to do, just record > + * the io_uring details and return. > + */ > + ctx->uring_op = op; > + if (ctx->context == AUDIT_CTX_SYSCALL) > + return; > + > + ctx->dummy = !audit_n_rules; > + if (!ctx->dummy && ctx->state == AUDIT_BUILD_CONTEXT) > + ctx->prio = 0; > + > + ctx->arch = syscall_get_arch(current); > + ctx->context = AUDIT_CTX_URING; > + ctx->current_state = ctx->state; > + ktime_get_coarse_real_ts64(&ctx->ctime); > +} > + > +/** > + * __audit_uring_exit - wrap up the kernel task's audit context after io_uring > + * @success: true/false value to indicate if the operation succeeded or not > + * @code: operation return code > + * > + * This is similar to audit_syscall_exit() but is intended for use by io_uring > + * operations. > + */ > +void __audit_uring_exit(int success, long code) > +{ > + struct audit_context *ctx = audit_context(); > + > + /* > + * TODO: At some point we will likely want to filter on io_uring ops > + * and other things similar to what we do for syscalls, but that > + * is something for another day; just record what we can here. > + */ > + > + if (!ctx || ctx->dummy) > + goto out; > + if (ctx->context == AUDIT_CTX_SYSCALL) { > + /* > + * NOTE: See the note in __audit_uring_entry() about the case > + * where we may be called from process context before we > + * return to userspace via audit_syscall_exit(). In this > + * case we simply emit a URINGOP record and bail, the > + * normal syscall exit handling will take care of > + * everything else. 
> + * It is also worth mentioning that when we are called, > + * the current process creds may differ from the creds > + * used during the normal syscall processing; keep that > + * in mind if/when we move the record generation code. > + */ > + > + /* > + * We need to filter on the syscall info here to decide if we > + * should emit a URINGOP record. I know it seems odd but this > + * solves the problem where users have a filter to block *all* > + * syscall records in the "exit" filter; we want to preserve > + * the behavior here. > + */ > + audit_filter_syscall(current, ctx); > + audit_filter_inodes(current, ctx); > + if (ctx->current_state != AUDIT_RECORD_CONTEXT) > + return; > + > + audit_log_uring(ctx); > + return; > + } > + > + /* this may generate CONFIG_CHANGE records */ > + if (!list_empty(&ctx->killed_trees)) > + audit_kill_trees(ctx); > + > + audit_filter_inodes(current, ctx); > + if (ctx->current_state != AUDIT_RECORD_CONTEXT) > + goto out; > + audit_return_fixup(ctx, success, code); > + audit_log_exit(); > + > +out: > + audit_reset_context(ctx); > +} > + > /** > * __audit_syscall_entry - fill in an audit record at syscall entry > * @major: major syscall type (function) > > -- > Linux-audit mailing list > Linux-audit@xxxxxxxxxx > https://listman.redhat.com/mailman/listinfo/linux-audit - RGB -- Richard Guy Briggs <rgb@xxxxxxxxxx> Sr. S/W Engineer, Kernel Security, Base Operating Systems Remote, Ottawa, Red Hat Canada IRC: rgb, SunRaycer Voice: +1.647.777.2635, Internal: (81) 32635