On Mon, 15 Nov 2021 16:50:37 -0800
Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote:

> diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
> new file mode 100644
> index 000000000000..80d1ce21d713
> --- /dev/null
> +++ b/kernel/trace/trace_events_user.c
> @@ -0,0 +1,1159 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2021, Microsoft Corporation.
> + *
> + * Authors:
> + *   Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
> + */
> +
> +#include <linux/bitmap.h>
> +#include <linux/cdev.h>
> +#include <linux/hashtable.h>
> +#include <linux/list.h>
> +#include <linux/io.h>
> +#include <linux/uio.h>
> +#include <linux/ioctl.h>
> +#include <linux/jhash.h>
> +#include <linux/trace_events.h>
> +#include <linux/tracefs.h>
> +#include <linux/types.h>
> +#include <linux/uaccess.h>
> +#include <uapi/linux/user_events.h>
> +#include "trace.h"
> +#include "trace_dynevent.h"
> +
> +#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
> +
> +#define FIELD_DEPTH_TYPE 0
> +#define FIELD_DEPTH_NAME 1
> +#define FIELD_DEPTH_SIZE 2
> +
> +/*
> + * Limits how many trace_event calls user processes can create:
> + * Must be multiple of PAGE_SIZE.
> + */
> +#define MAX_PAGES 1
> +#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
> +
> +/* Limit how long of an event name plus args within the subsystem. */
> +#define MAX_EVENT_DESC 512
> +#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
> +#define MAX_FIELD_ARRAY_SIZE (2 * PAGE_SIZE)

You really want to allow field sizes to be larger than a page?

> +
> +static char *register_page_data;
> +
> +static DEFINE_MUTEX(reg_mutex);
> +static DEFINE_HASHTABLE(register_table, 4);
> +static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
> +
> +struct user_event {
> +        struct tracepoint tracepoint;
> +        struct trace_event_call call;
> +        struct trace_event_class class;
> +        struct dyn_event devent;
> +        struct hlist_node node;
> +        struct list_head fields;
> +        atomic_t refcnt;
> +        int index;
> +        int flags;
> +};
> +
> +struct user_event_refs {
> +        struct rcu_head rcu;
> +        int count;
> +        struct user_event *events[];
> +};
> +
> +typedef void (*user_event_func_t) (struct user_event *user,
> +                                   void *data, u32 datalen,
> +                                   void *tpdata);
> +
> +static int user_event_parse(char *name, char *args, char *flags,
> +                            struct user_event **newuser);
> +

[..]

> +
> +/*
> + * Validates the user payload and writes via iterator.
> + */
> +static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
> +{
> +        struct user_event_refs *refs;
> +        struct user_event *user = NULL;
> +        struct tracepoint *tp;
> +        ssize_t ret = i->count;
> +        int idx;
> +
> +        if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
> +                return -EFAULT;
> +
> +        rcu_read_lock_sched();
> +
> +        refs = rcu_dereference_sched(file->private_data);
> +

Still need a comment here that states:

        /*
         * The refs->events array is protected by RCU, and new items may be
         * added. But the user retrieved from indexing into the events array
         * shall be immutable while the file is opened.
         */

Or something like that, because once again, I had to figure out how
this all worked. It's not obvious.

> +        if (likely(refs && idx < refs->count))
> +                user = refs->events[idx];
> +
> +        rcu_read_unlock_sched();
> +
> +        if (unlikely(user == NULL))
> +                return -ENOENT;
> +
> +        tp = &user->tracepoint;
> +

Probably also should have a comment here that states that this is racy,
but we don't care. That is, the enabled can be less than 1 after the if
statement, but that means it was just disabled and it's OK if a few
events slip in after it is disabled.
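
Something like:

        /*
         * This check is racy: key.enabled can drop to zero right after
         * it, but that only means the event was just disabled, and a
         * few events slipping in after that is acceptable.
         */

(wording to taste)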
> +        if (likely(atomic_read(&tp->key.enabled) > 0)) {
> +                struct tracepoint_func *probe_func_ptr;
> +                user_event_func_t probe_func;
> +                void *tpdata;
> +                void *kdata;
> +                u32 datalen;
> +
> +                kdata = kmalloc(i->count, GFP_KERNEL);
> +
> +                if (unlikely(!kdata))
> +                        return -ENOMEM;
> +
> +                datalen = copy_from_iter(kdata, i->count, i);
> +
> +                rcu_read_lock_sched();
> +
> +                probe_func_ptr = rcu_dereference_sched(tp->funcs);
> +
> +                if (probe_func_ptr) {
> +                        do {
> +                                probe_func = probe_func_ptr->func;
> +                                tpdata = probe_func_ptr->data;
> +                                probe_func(user, kdata, datalen, tpdata);
> +                        } while ((++probe_func_ptr)->func);
> +                }
> +
> +                rcu_read_unlock_sched();
> +
> +                kfree(kdata);
> +        }
> +
> +        return ret;
> +}
> +
> +static ssize_t user_events_write(struct file *file, const char __user *ubuf,
> +                                 size_t count, loff_t *ppos)
> +{
> +        struct iovec iov;
> +        struct iov_iter i;
> +
> +        if (unlikely(*ppos != 0))
> +                return -EFAULT;
> +
> +        if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
> +                return -EFAULT;
> +
> +        return user_events_write_core(file, &i);
> +}
> +
> +static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
> +{
> +        return user_events_write_core(kp->ki_filp, i);
> +}
> +
> +static int user_events_ref_add(struct file *file, struct user_event *user)
> +{
> +        struct user_event_refs *refs, *new_refs;
> +        int i, size, count = 0;
> +
> +        refs = rcu_dereference_protected(file->private_data,
> +                                         lockdep_is_held(&reg_mutex));
> +
> +        if (refs) {
> +                count = refs->count;
> +
> +                for (i = 0; i < count; ++i)
> +                        if (refs->events[i] == user)
> +                                return i;
> +        }
> +
> +        size = sizeof(*refs) + (sizeof(struct user_event *) * (count + 1));

Use:

        size = struct_size(refs, events, count + 1);
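
struct_size() is in <linux/overflow.h> and works here because events[]
is a flexible array member. It computes the same value as the
open-coded math, but checks for overflow and saturates to SIZE_MAX so
the allocation fails cleanly instead of wrapping. Roughly:

        /* overflow-safe sizeof(*refs) + (count + 1) * sizeof(refs->events[0]) */
        size = struct_size(refs, events, count + 1);
        new_refs = kzalloc(size, GFP_KERNEL);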
> +
> +        new_refs = kzalloc(size, GFP_KERNEL);
> +
> +        if (!new_refs)
> +                return -ENOMEM;
> +
> +        new_refs->count = count + 1;
> +
> +        for (i = 0; i < count; ++i)
> +                new_refs->events[i] = refs->events[i];
> +
> +        new_refs->events[i] = user;
> +
> +        atomic_inc(&user->refcnt);
> +
> +        rcu_assign_pointer(file->private_data, new_refs);
> +
> +        if (refs)
> +                kfree_rcu(refs, rcu);
> +
> +        return i;
> +}
> +
> +static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
> +{
> +        u32 size;
> +        long ret;
> +
> +        ret = get_user(size, &ureg->size);
> +
> +        if (ret)
> +                return ret;
> +
> +        if (size > PAGE_SIZE)
> +                return -E2BIG;
> +
> +        return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
> +}
> +
> +/*
> + * Registers a user_event on behalf of a user process.
> + */
> +static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
> +{
> +        struct user_reg __user *ureg = (struct user_reg __user *)uarg;
> +        struct user_reg reg;
> +        struct user_event *user;
> +        char *name;
> +        long ret;
> +
> +        ret = user_reg_get(ureg, &reg);
> +
> +        if (ret)
> +                return ret;
> +
> +        name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
> +                            MAX_EVENT_DESC);
> +
> +        if (IS_ERR(name)) {
> +                ret = PTR_ERR(name);
> +                return ret;
> +        }
> +
> +        ret = user_event_parse_cmd(name, &user);
> +
> +        if (ret) {
> +                kfree(name);
> +                return ret;
> +        }
> +
> +        ret = user_events_ref_add(file, user);
> +
> +        /* Positive number is index and valid */
> +        if (ret < 0)
> +                return ret;
> +
> +        put_user((u32)ret, &ureg->write_index);
> +        put_user(user->index, &ureg->status_index);
> +
> +        return 0;
> +}
> +
> +/*
> + * Deletes a user_event on behalf of a user process.
> + */
> +static long user_events_ioctl_del(struct file *file, unsigned long uarg)
> +{
> +        void __user *ubuf = (void __user *)uarg;
> +        char *name;
> +        long ret;
> +
> +        name = strndup_user(ubuf, MAX_EVENT_DESC);
> +
> +        if (IS_ERR(name))
> +                return PTR_ERR(name);
> +
> +        ret = delete_user_event(name);
> +
> +        kfree(name);
> +
> +        return ret;
> +}
> +
> +/*
> + * Handles the ioctl from user mode to register or alter operations.
> + */
> +static long user_events_ioctl(struct file *file, unsigned int cmd,
> +                              unsigned long uarg)
> +{
> +        long ret = -ENOTTY;
> +
> +        switch (cmd) {
> +        case DIAG_IOCSREG:
> +                mutex_lock(&reg_mutex);
> +                ret = user_events_ioctl_reg(file, uarg);
> +                mutex_unlock(&reg_mutex);
> +                break;
> +
> +        case DIAG_IOCSDEL:
> +                mutex_lock(&reg_mutex);
> +                ret = user_events_ioctl_del(file, uarg);
> +                mutex_unlock(&reg_mutex);
> +                break;
> +        }
> +
> +        return ret;
> +}
> +
> +/*
> + * Handles the final close of the file from user mode.
> + */
> +static int user_events_release(struct inode *node, struct file *file)
> +{
> +        struct user_event_refs *refs;
> +        struct user_event *user;
> +        int i;
> +
> +        rcu_read_lock_sched();
> +        refs = rcu_dereference_sched(file->private_data);
> +        rcu_read_unlock_sched();

This looks totally wrong. What is RCU protecting here?

> +
> +        if (!refs)
> +                goto out;
> +

What prevents refs->events from changing here? Shouldn't reg_mutex be
held throughout this function?

> +        for (i = 0; i < refs->count; ++i) {
> +                user = refs->events[i];
> +
> +                if (user)
> +                        atomic_dec(&user->refcnt);
> +        }
> +
> +        kfree_rcu(refs, rcu);
> +out:
> +        return 0;
> +}
> +
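
Something like this is closer to what I'd expect. Completely untested,
just to show the shape; it assumes reg_mutex is the lock that
serializes all updaters of file->private_data:

        static int user_events_release(struct inode *node, struct file *file)
        {
                struct user_event_refs *refs;
                int i;

                /*
                 * reg_mutex keeps refs->events stable while we walk it
                 * and serializes against user_events_ref_add()
                 * publishing a new array for this file.
                 */
                mutex_lock(&reg_mutex);

                refs = rcu_dereference_protected(file->private_data,
                                                 lockdep_is_held(&reg_mutex));

                if (refs) {
                        for (i = 0; i < refs->count; ++i) {
                                struct user_event *user = refs->events[i];

                                if (user)
                                        atomic_dec(&user->refcnt);
                        }

                        kfree_rcu(refs, rcu);
                }

                mutex_unlock(&reg_mutex);

                return 0;
        }

-- Steve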