Hi Beau,

On Thu, 16 Dec 2021 09:35:00 -0800
Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx> wrote:

> Minimal support for interacting with dynamic events, trace_event and
> ftrace.

Since the cover mail is not merged into the git history, could you
describe what user_events is here? :)

I have some comments below, but not many.

> Core outline of flow between user process, ioctl and trace_event
> APIs.
> 
> Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
> ---
>  include/uapi/linux/user_events.h |   71 ++
>  kernel/trace/Kconfig             |   14 +
>  kernel/trace/Makefile            |    1 +
>  kernel/trace/trace_events_user.c | 1188 ++++++++++++++++++++++++++++++
>  4 files changed, 1274 insertions(+)
>  create mode 100644 include/uapi/linux/user_events.h
>  create mode 100644 kernel/trace/trace_events_user.c
> 
> diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h
> new file mode 100644
> index 000000000000..f97db05e00c9
> --- /dev/null
> +++ b/include/uapi/linux/user_events.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/*
> + * Copyright (c) 2021, Microsoft Corporation.
> + *
> + * Authors:
> + *   Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
> + */
> +#ifndef _UAPI_LINUX_USER_EVENTS_H
> +#define _UAPI_LINUX_USER_EVENTS_H
> +
> +#include <linux/types.h>
> +#include <linux/ioctl.h>
> +
> +#ifdef __KERNEL__
> +#include <linux/uio.h>
> +#else
> +#include <sys/uio.h>
> +#endif
> +
> +#define USER_EVENTS_SYSTEM "user_events"
> +#define USER_EVENTS_PREFIX "u:"
> +
> +/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
> +#define EVENT_BIT_FTRACE 0
> +#define EVENT_BIT_PERF 1
> +#define EVENT_BIT_OTHER 7
> +
> +#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
> +#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
> +#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
> +
> +/* Create dynamic location entry within a 32-bit value */
> +#define DYN_LOC(offset, size) ((size) << 16 | (offset))
> +
> +/* Use raw iterator for attached BPF program(s), no affect on ftrace/perf */
> +#define FLAG_BPF_ITER (1 << 0)
> +

Can you add a description of struct user_reg (and each of its fields)
here?

> +struct user_reg {
> +	__u32 size;
> +	__u64 name_args;

BTW, this field name is a bit strange. It is indeed "name and
arguments", but actually it is the definition of the event, isn't it?

> +	__u32 status_index;
> +	__u32 write_index;
> +};
> +
> +#define DIAG_IOC_MAGIC '*'
> +#define DIAG_IOCSREG _IOWR(DIAG_IOC_MAGIC, 0, struct user_reg*)
> +#define DIAG_IOCSDEL _IOW(DIAG_IOC_MAGIC, 1, char*)
> +
> +enum {
> +	USER_BPF_DATA_KERNEL,
> +	USER_BPF_DATA_USER,
> +	USER_BPF_DATA_ITER,
> +};
> +
> +struct user_bpf_iter {
> +	__u32 iov_offset;
> +	__u32 nr_segs;
> +	const struct iovec *iov;
> +};
> +
> +struct user_bpf_context {
> +	__u32 data_type;
> +	__u32 data_len;
> +	union {
> +		void *kdata;
> +		void *udata;
> +		struct user_bpf_iter *iter;
> +	};
> +};

Are those BPF-related data structures passed from/to user space?

[...]

> +/*
> + * Parses a register command for user_events
> + * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
> + *
> + * Example event named test with a 20 char msg field with a unsigned int after:

Please quote the words in the example, like:

  Example event named 'test' with a 20 char 'msg' field with an
  'unsigned int id' after:

(is that correct?)

> + * test char[20] msg;unsigned int id
> + *
> + * NOTE: Offsets are from the user data perspective, they are not from the
> + * trace_entry/buffer perspective. We automatically add the common properties
> + * sizes to the offset for the user.
> + */
> +static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
> +{
> +	char *name = raw_command;
> +	char *args = strpbrk(name, " ");
> +	char *flags;
> +
> +	if (args)
> +		*args++ = 0;
> +
> +	flags = strpbrk(name, ":");
> +
> +	if (flags)
> +		*flags++ = 0;
> +

Just a nitpick: what about using strsep()?

	args = raw_command;
	flags = strsep(&args, " ");
	name = strsep(&flags, ":");
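
With that, the whole function could shrink to something like this (an
untested sketch, keeping the existing user_event_parse() call and the
function signature as-is):

static int user_event_parse_cmd(char *raw_command,
				struct user_event **newuser)
{
	char *name, *args, *flags;

	/* the first strsep() leaves "name[:flags]" in 'flags', the second splits it */
	args = raw_command;
	flags = strsep(&args, " ");
	name = strsep(&flags, ":");

	return user_event_parse(name, args, flags, newuser);
}

strsep() writes the NUL terminators itself and leaves the missing
parts NULL, so the explicit strpbrk() and termination steps are not
needed.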
> +	return user_event_parse(name, args, flags, newuser);
> +}
> +

[...]

> +
> +static ssize_t user_status_read(struct file *file, char __user *ubuf,
> +				size_t count, loff_t *ppos)
> +{
> +	/*
> +	 * Delay allocation of seq data until requested, most callers
> +	 * will never read the status file. They will only mmap.
> +	 */

I don't think you need this optimization, since this is not a hot
path, and it causes strange behavior. See below:

> +	if (file->private_data == NULL) {
> +		int ret;
> +
> +		if (*ppos != 0)
> +			return -EINVAL;
> +
> +		ret = single_open(file, user_status_show, NULL);
> +
> +		if (ret)
> +			return ret;

It seems strange to return an open(2)-style failure from read(2).

> +	}
> +
> +	return seq_read(file, ubuf, count, ppos);
> +}
> +
> +static loff_t user_status_seek(struct file *file, loff_t offset, int whence)
> +{
> +	if (file->private_data == NULL)

For example, this means we cannot seek until we have started reading.

So please write this code in the usual way (allocate the seq_file
state in .open and wire the seq_file helpers directly into the
file_operations), unless there is a special reason not to.

> +		return 0;
> +
> +	return seq_lseek(file, offset, whence);
> +}
> +
> +static int user_status_release(struct inode *node, struct file *file)
> +{
> +	if (file->private_data == NULL)
> +		return 0;
> +
> +	return single_release(node, file);
> +}
> +
> +static const struct file_operations user_status_fops = {
> +	.mmap = user_status_mmap,
> +	.read = user_status_read,
> +	.llseek = user_status_seek,
> +	.release = user_status_release,
> +};

Thank you,

-- 
Masami Hiramatsu <mhiramat@xxxxxxxxxx>