Adds the required APIs to libtracefs to create, manage and write out data to
trace events via the user_events kernel mechanism.

Signed-off-by: Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
---
 Makefile                 |   8 +
 include/tracefs-local.h  |  24 ++
 include/tracefs.h        |  60 +++++
 src/Makefile             |   4 +
 src/tracefs-userevents.c | 545 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 641 insertions(+)
 create mode 100644 src/tracefs-userevents.c

diff --git a/Makefile b/Makefile
index 544684c..a4598b4 100644
--- a/Makefile
+++ b/Makefile
@@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall
 CPPFLAGS ?=
 LDFLAGS ?=
 
+USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
+export USEREVENTS_INSTALLED
+ifeq ($(USEREVENTS_INSTALLED), 1)
+CFLAGS += -DUSEREVENTS
+else
+$(warning user_events.h not installed, skipping)
+endif
+
 CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
 export CUNIT_INSTALLED
 
diff --git a/include/tracefs-local.h b/include/tracefs-local.h
index bf157e1..e768cba 100644
--- a/include/tracefs-local.h
+++ b/include/tracefs-local.h
@@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep,
 struct tep_event *get_tep_event(struct tep_handle *tep,
 				const char *system, const char *name);
 
+/* Internal interface for ftrace user events */
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event
+{
+	int write_index;	/* Index returned at register, sent first on write */
+	int status_index;	/* Byte offset of the event status in group->mmap */
+	int iovecs;		/* One vector for write_index plus one per item */
+	int rels;		/* Number of varray/vstring (__rel_loc) items */
+	int len;		/* Total bytes of the fixed size payload */
+	struct tracefs_user_event_group *group;	/* Group used to register */
+	struct tracefs_user_event *next;	/* Next event in the group list */
+};
+
+struct tracefs_user_event_group
+{
+	int fd;			/* Open user_events_data file, used to register/write */
+	int mmap_len;		/* Length in bytes of the status mapping */
+	char *mmap;		/* Read-only mapping of user_events_status */
+	pthread_mutex_t lock;	/* Held while adding to the events list */
+	struct tracefs_user_event *events;	/* Events freed when the group closes */
+};
+
 #endif /* _TRACE_FS_LOCAL_H */
diff --git a/include/tracefs.h b/include/tracefs.h
index 1848ad0..7871dfe 100644
--- a/include/tracefs.h
+++
b/include/tracefs.h
@@ -571,4 +571,64 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name,
 struct tep_event *
 tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth);
 
+/* User events */
+enum tracefs_uevent_type {
+	TRACEFS_UEVENT_END,	/* Marks the last entry of an item array */
+	TRACEFS_UEVENT_u8,	/* Unsigned 8-bit value */
+	TRACEFS_UEVENT_s8,	/* Signed 8-bit value */
+	TRACEFS_UEVENT_u16,	/* Unsigned 16-bit value */
+	TRACEFS_UEVENT_s16,	/* Signed 16-bit value */
+	TRACEFS_UEVENT_u32,	/* Unsigned 32-bit value */
+	TRACEFS_UEVENT_s32,	/* Signed 32-bit value */
+	TRACEFS_UEVENT_u64,	/* Unsigned 64-bit value */
+	TRACEFS_UEVENT_s64,	/* Signed 64-bit value */
+	TRACEFS_UEVENT_string,	/* Fixed size char array, len required at register */
+	TRACEFS_UEVENT_struct,	/* Struct, name is "type name", len required at register */
+	TRACEFS_UEVENT_varray,	/* Variable length u8 array (__rel_loc) */
+	TRACEFS_UEVENT_vstring,	/* Variable length string (__rel_loc) */
+};
+
+enum tracefs_uevent_flags {
+	/* None */
+	TRACEFS_UEVENT_FLAG_NONE = 0,
+
+	/* When BPF is attached, use iterator/no copy */
+	TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0,	/* Appends ":BPF_ITER" to the event name */
+};
+
+struct tracefs_uevent_item {
+	/* Type of item */
+	enum tracefs_uevent_type type;
+
+	/* Length of data, optional during register */
+	int len;
+
+	union {
+		/* Used during write */
+		const void *data;
+
+		/* Used during register */
+		const char *name;
+	};
+};
+
+struct tracefs_user_event;
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_group *tracefs_user_event_group_create(void);
+
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group);
+
+int tracefs_user_event_delete(const char *name);
+
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+			    const char *name, enum tracefs_uevent_flags flags,
+			    struct tracefs_uevent_item *items);
+
+bool tracefs_user_event_test(struct tracefs_user_event *event);
+
+int tracefs_user_event_write(struct tracefs_user_event *event,
+			     struct tracefs_uevent_item *items);
+
 #endif /* _TRACE_FS_H */
diff --git a/src/Makefile b/src/Makefile
index e8afab5..984e8cf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,6 +14,10 @@ OBJS += tracefs-filter.o
 OBJS += tracefs-dynevents.o
 OBJS += tracefs-eprobes.o
 
+ifeq ($(USEREVENTS_INSTALLED), 1)
+OBJS += tracefs-userevents.o
+endif
+
 # Order matters for the the three below
 OBJS +=
sqlhist-lex.o
 OBJS += sqlhist.tab.o
diff --git a/src/tracefs-userevents.c b/src/tracefs-userevents.c
new file mode 100644
index 0000000..4d64fd8
--- /dev/null
+++ b/src/tracefs-userevents.c
@@ -0,0 +1,595 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2022 Microsoft Corporation.
+ *
+ * Authors:
+ *   Beau Belgrave <beaub@xxxxxxxxxxxxxxxxxxx>
+ */
+
+#include <alloca.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <linux/user_events.h>
+
+#include "tracefs.h"
+#include "tracefs-local.h"
+
+#define STAT_FILE "user_events_status"
+#define DATA_FILE "user_events_data"
+
+/* Frees every event on the singly linked list starting at @event */
+static void free_user_events(struct tracefs_user_event *event)
+{
+	struct tracefs_user_event *next;
+
+	while (event) {
+		next = event->next;
+		free(event);
+		event = next;
+	}
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+/*
+ * Appends the registration text for @item to @buf starting at @offset.
+ * A ";" separator is emitted before every field except the first one.
+ * When @len is 0 nothing is stored and only the length is computed.
+ *
+ * Returns the new position, or -1 with errno set on a bad item.
+ */
+static int append_field(struct tracefs_uevent_item *item, char *buf,
+			int len, int offset, int index)
+{
+	int pos = offset;
+
+	/* Every type formats the name, so reject a missing one up front */
+	if (!item->name) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (index != 0)
+		pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+	switch (item->type) {
+	case TRACEFS_UEVENT_u8:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" u8 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s8:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" s8 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u16:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" u16 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s16:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" s16 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u32:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" u32 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s32:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" s32 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u64:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" u64 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s64:
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" s64 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_string:
+		if (item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" char[%d] %s", item->len, item->name);
+		break;
+
+	case TRACEFS_UEVENT_struct:
+		/*
+		 * struct must have 2 strings, do simple check
+		 * in user, kernel will fully validate
+		 */
+		if (!strchr(item->name, ' ')) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		if (item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" struct %s %d", item->name, item->len);
+		break;
+
+	case TRACEFS_UEVENT_varray:
+		/* Variable length array */
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" __rel_loc u8[] %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_vstring:
+		/* Variable length string */
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				" __rel_loc char[] %s", item->name);
+		break;
+
+	default:
+		/* Unknown */
+		errno = ENOENT;
+		return -1;
+	}
+
+	return pos;
+}
+
+/*
+ * Builds the "name[:BPF_ITER] field; field ..." command that registers an
+ * event. Call with @len of 0 to size the command, then again with a buffer
+ * of the returned size to fill it in.
+ *
+ * Returns the bytes needed including the NUL terminator, or -1 on error.
+ */
+static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags,
+			  struct tracefs_uevent_item *items, char *buf, int len)
+{
+	int pos = 0;
+	int index = 0;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", name);
+
+	if (flags & TRACEFS_UEVENT_FLAG_bpf_iter)
+		pos += snprintf(buf + pos, LEN_OR_ZERO, ":BPF_ITER");
+
+	while (items->type != TRACEFS_UEVENT_END) {
+		pos = append_field(items, buf, len, pos, index++);
+
+		if (pos < 0)
+			return pos;
+
+		items++;
+	}
+
+	return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+/*
+ * Caches on @event the payload layout described by @item: the size of the
+ * fixed data, the number of __rel_loc entries and the iovecs required.
+ *
+ * Returns 0 on success, or -1 with errno set on an unknown item type.
+ */
+static int get_write_counts(struct tracefs_user_event *event,
+			    struct tracefs_uevent_item *item)
+{
+	event->rels = 0;
+	event->len = 0;
+
+	/* Start at 1, need iovec for write_index */
+	event->iovecs = 1;
+
+	while (item->type != TRACEFS_UEVENT_END) {
+		switch (item->type) {
+		case TRACEFS_UEVENT_u8:
+		case TRACEFS_UEVENT_s8:
+			event->len += sizeof(__u8);
+			break;
+
+		case TRACEFS_UEVENT_u16:
+		case TRACEFS_UEVENT_s16:
+			event->len += sizeof(__u16);
+			break;
+
+		case TRACEFS_UEVENT_u32:
+		case TRACEFS_UEVENT_s32:
+			event->len += sizeof(__u32);
+			break;
+
+		case TRACEFS_UEVENT_u64:
+		case TRACEFS_UEVENT_s64:
+			event->len += sizeof(__u64);
+			break;
+
+		case TRACEFS_UEVENT_string:
+		case TRACEFS_UEVENT_struct:
+			event->len += item->len;
+			break;
+
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Requires a rel loc entry */
+			event->len += sizeof(__u32);
+			event->rels++;
+			break;
+
+		default:
+			/* Unknown */
+			errno = ENOENT;
+			return -1;
+		}
+
+		event->iovecs++;
+		item++;
+	}
+
+	return 0;
+}
+
+/**
+ * tracefs_user_event_group_create - Create a new group to use for user events
+ *
+ * Returns a pointer to a group to use for user events. The pointer is valid until
+ * tracefs_user_event_group_close() is called. In case of an error NULL is
+ * returned.
+ */
+struct tracefs_user_event_group *tracefs_user_event_group_create(void)
+{
+	int stat, write, page_size, i, ret;
+	struct tracefs_user_event_group *group;
+
+	stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR);
+
+	if (stat < 0)
+		return NULL;
+
+	write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+
+	if (write < 0)
+		goto put_stat;
+
+	group = malloc(sizeof(*group));
+
+	if (!group)
+		goto put_write;
+
+	/* pthread functions return the error number and leave errno alone */
+	ret = pthread_mutex_init(&group->lock, NULL);
+
+	if (ret) {
+		errno = ret;
+		goto put_group;
+	}
+
+	/* Scale up to 16-bit max user events a page at a time */
+	page_size = sysconf(_SC_PAGESIZE);
+	group->mmap_len = page_size;
+
+	for (i = 0; i < 16; ++i) {
+		group->mmap = mmap(NULL, group->mmap_len,
+				   PROT_READ, MAP_SHARED, stat, 0);
+
+		if (group->mmap == MAP_FAILED && errno == EINVAL) {
+			/* Increase by page size and try again */
+			group->mmap_len += page_size;
+			continue;
+		}
+
+		break;
+	}
+
+	if (group->mmap == MAP_FAILED)
+		goto put_lock;
+
+	group->fd = write;
+	group->events = NULL;
+
+	/* Status fd no longer needed */
+	close(stat);
+
+	return group;
+
+put_lock:
+	pthread_mutex_destroy(&group->lock);
+put_group:
+	free(group);
+put_write:
+	close(write);
+put_stat:
+	close(stat);
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_delete - Deletes a user event from the system
+ * @name: Name of the event to delete
+ *
+ * Deletes the event from the system if it is not used.
+ */
+int tracefs_user_event_delete(const char *name)
+{
+	int ret, write;
+
+	write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+
+	if (write < 0)
+		return write;
+
+	ret = ioctl(write, DIAG_IOCSDEL, name);
+
+	close(write);
+
+	return ret;
+}
+
+/**
+ * tracefs_user_event_group_close - Closes a group containing user events
+ * @group: Group to close
+ *
+ * Closes a group and all the user events within it. Any user event that has
+ * been added to the group is no longer valid and cannot be used.
+ */
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group)
+{
+	if (!group)
+		return;
+
+	if (group->mmap != MAP_FAILED)
+		munmap(group->mmap, group->mmap_len);
+
+	if (group->fd != -1)
+		close(group->fd);
+
+	free_user_events(group->events);
+	pthread_mutex_destroy(&group->lock);
+	free(group);
+}
+
+/**
+ * tracefs_user_event_register - Registers a user event with the system
+ * @group: Group to add the user event to
+ * @name: Name of the event to register
+ * @flags: Flags to use
+ * @items: Array of items that the event contains
+ *
+ * Allocates and registers a user event with the system. The user event will be
+ * added to the @group. The lifetime of the event is bound to the @group. When
+ * the @group is closed via tracefs_user_event_group_close() the event will no
+ * longer exist and should not be used.
+ *
+ * The @items are processed in order and the final item type must be set to
+ * TRACEFS_UEVENT_END to mark the last item. Each item must have the type
+ * and name defined. The string and struct type also require the len to be set
+ * for the item.
+ *
+ * Return a pointer to a user event on success, or NULL or error.
+ *
+ * errno will be set to EINVAL if @group or @name is null or unexpected @items.
+ */
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+			    const char *name, enum tracefs_uevent_flags flags,
+			    struct tracefs_uevent_item *items)
+{
+	struct tracefs_user_event *event = NULL;
+	struct user_reg reg = {0};
+	char *cmd = NULL;
+	int len;
+
+	if (!group || !name || !items) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	/* Determine length of cmd */
+	len = create_reg_cmd(name, flags, items, cmd, 0);
+
+	if (len < 0) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	/* Allocate and fill cmd */
+	cmd = malloc(len);
+
+	if (!cmd)
+		return NULL;
+
+	create_reg_cmd(name, flags, items, cmd, len);
+
+	event = malloc(sizeof(*event));
+
+	if (!event)
+		goto put_cmd;
+
+	reg.size = sizeof(reg);
+	reg.name_args = (__u64)(uintptr_t)cmd;
+
+	/* Register event with kernel */
+	if (ioctl(group->fd, DIAG_IOCSREG, &reg) == -1)
+		goto put_event;
+
+	/* Sanity check bounds returned */
+	if (reg.status_index >= group->mmap_len) {
+		errno = EINVAL;
+		goto put_event;
+	}
+
+	if (get_write_counts(event, items))
+		goto put_event;
+
+	event->write_index = reg.write_index;
+	event->status_index = reg.status_index;
+	event->group = group;
+
+	/* Add event into the group under lock */
+	pthread_mutex_lock(&group->lock);
+	event->next = group->events;
+	group->events = event;
+	pthread_mutex_unlock(&group->lock);
+
+	free(cmd);
+
+	return event;
+put_event:
+	free(event);
+put_cmd:
+	free(cmd);
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_test - Tests if an event is currently enabled
+ * @event: User event to test
+ *
+ * Tests if the @event is valid and currently enabled on the system.
+ *
+ * Return true if enabled, false otherwise.
+ */
+bool tracefs_user_event_test(struct tracefs_user_event *event)
+{
+	return event && event->group->mmap[event->status_index] != 0;
+}
+
+/**
+ * tracefs_user_event_write - Writes data out to an event
+ * @event: User event to write data about
+ * @items: Items to write for the event
+ *
+ * Writes out items for the event. Callers should check if the cost of writing
+ * should be performed by calling tracefs_user_event_test(). Items are checked
+ * to ensure they fit within the described items during register. Each item
+ * must specify the length of the item being written.
+ *
+ * Return the number of bytes written or -1 upon error.
+ *
+ * errno will be set to EINVAL if @event or @items is null, @items contains
+ * an item with a length of less than or equal to 0, or @items contains fewer
+ * items than previously registered for the event.
+ * errno will be set to E2BIG if @items contains more items than previously
+ * registered for the event.
+ */
+int tracefs_user_event_write(struct tracefs_user_event *event,
+			     struct tracefs_uevent_item *items)
+{
+	struct iovec *head, *io, *relio, *io_end, *static_end;
+	__u32 *rel, *rel_end;
+	int len, rel_offset, data_offset, used;
+
+	if (!event || !items) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	head = io = alloca(sizeof(*io) * (event->iovecs + event->rels));
+	rel = alloca(sizeof(*rel) * event->rels);
+
+	io_end = head + (event->iovecs + event->rels);
+	static_end = head + event->iovecs;
+	rel_end = rel + event->rels;
+
+	/* Relative offset starts at end of static data */
+	relio = static_end;
+	rel_offset = event->len;
+	data_offset = 0;
+
+	/* Write index must be first */
+	io->iov_base = &event->write_index;
+	io->iov_len = sizeof(event->write_index);
+	io++;
+	used = 1;
+
+	while (items->type != TRACEFS_UEVENT_END) {
+		len = items->len;
+
+		if (len <= 0)
+			goto bad_length;
+
+		/* Fixed vectors must not spill into the __rel_loc data */
+		if (io >= static_end)
+			goto bad_count;
+
+		switch (items->type) {
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Dual vectors */
+			used += 2;
+
+			if (rel >= rel_end || relio >= io_end)
+				goto bad_count;
+
+			/* __rel_loc types */
+			relio->iov_base = (void *)items->data;
+			relio->iov_len = len;
+			relio++;
+
+			io->iov_base = (void *)rel;
+			io->iov_len = sizeof(*rel);
+			io++;
+			rel_offset -= sizeof(*rel);
+
+			/* Fill in rel loc data */
+			*rel = DYN_LOC(rel_offset + data_offset, len);
+			data_offset += len;
+			rel++;
+
+			break;
+
+		default:
+			/* Single vector */
+			used++;
+
+			/* Direct types */
+			io->iov_base = (void *)items->data;
+			io->iov_len = len;
+			io++;
+			rel_offset -= len;
+
+			break;
+		}
+
+		items++;
+	}
+
+	/* Fewer items than registered would leave unset vectors in the write */
+	if (io != static_end)
+		goto bad_missing;
+
+	return writev(event->group->fd, head, used);
+
+bad_length:
+	fprintf(stderr, "Bad user_event item length at index %d\n",
+		used - 1);
+	errno = EINVAL;
+	return -1;
+
+bad_missing:
+	fprintf(stderr, "Too few user_event items passed\n");
+	errno = EINVAL;
+	return -1;
+
+bad_count:
+	fprintf(stderr, "Too many user_event items passed\n");
+	errno = E2BIG;
+	return -1;
+}
-- 
2.17.1