This patch adds a PERF_COUNT_SW_USERSPACE_EVENT type, which can be generated by the user with the PERF_EVENT_IOC_USERSPACE ioctl command, which injects an event of said type into the perf buffer. The ioctl takes a pointer to struct perf_event_userspace as an argument. The structure begins with a 32-bit integer type value, which determines the meaning of the following content (a 32-bit size followed by size bytes of data). Type 0 is defined as a zero-terminated string, other types are defined by userspace (the perf tool will contain a list of known values with reference implementations of data content parsers). Possible use cases for this feature: - "perf_printf" like mechanism to add logging messages to one's perf session; an example implementation: int perf_printf(int perf_fd, const char *fmt, ...) { struct perf_event_userspace *event; int size; va_list ap; int err; va_start(ap, fmt); size = vsnprintf(NULL, 0, fmt, ap) + 1; event = malloc(sizeof(*event) + size); if (!event) { va_end(ap); return -1; } event->type = 0; event->size = size; vsnprintf(event->data, size, fmt, ap); va_end(ap); err = ioctl(perf_fd, PERF_EVENT_IOC_USERSPACE, event); free(event); return err < 0 ? err : size - 1; } - "perf_printf" used by the perf trace tool, where certain calls of the traced process are intercepted (e.g. using LD_PRELOAD) and treated as logging requests, with their output redirected into the perf buffer - synchronisation of performance data generated in user space with the perf stream coming from the kernel. For example, the marker can be inserted by a JIT engine after it has generated a portion of the code, but before the code is executed for the first time, allowing the post-processor to pick the correct debugging information. - another example is a system profiling tool taking data from sources other than just perf, which generates a marker at the beginning and at the end of the session (also possibly periodically during the session) to synchronise kernel timestamps with clock values obtained in userspace (gtod or raw_monotonic). 
Signed-off-by: Pawel Moll <pawel.moll@xxxxxxx> --- include/linux/perf_event.h | 8 +++++ include/uapi/linux/perf_event.h | 34 ++++++++++++++++++++- kernel/events/core.c | 68 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 28b73b2..d904d31 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -64,6 +64,12 @@ struct perf_raw_record { void *data; }; +struct perf_userspace_entry { + u32 type; + u32 size; + u8 data[0]; +}; + /* * branch stack layout: * nr: number of taken branches stored in entries[] @@ -604,6 +610,8 @@ struct perf_sample_data { u64 txn; /* Raw monotonic timestamp, for userspace time correlation */ u64 clock_raw_monotonic; + /* Userspace-originating event */ + struct perf_userspace_entry *user_entry; }; static inline void perf_sample_data_init(struct perf_sample_data *data, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e5a75c5..37604ae 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -110,6 +110,7 @@ enum perf_sw_ids { PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, PERF_COUNT_SW_EMULATION_FAULTS = 8, PERF_COUNT_SW_DUMMY = 9, + PERF_COUNT_SW_USERSPACE_EVENT = 10, PERF_COUNT_SW_MAX, /* non-ABI */ }; @@ -138,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_CLOCK_RAW_MONOTONIC = 1U << 18, + PERF_SAMPLE_USERSPACE_EVENT = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -337,6 +339,15 @@ struct perf_event_attr { __u32 __reserved_2; }; +/* + * Userspace-originating event to be generated with PERF_EVENT_IOC_USERSPACE + */ +struct perf_event_userspace { + __u32 type; + __u32 size; + __u8 data[0]; +}; + #define perf_flags(attr) (*(&(attr)->read_format + 1)) /* @@ -350,6 +361,8 @@ struct perf_event_attr { #define 
PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_USERSPACE _IOR('$', 8, \ + struct perf_event_userspace *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, @@ -688,6 +701,25 @@ enum perf_event_type { * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 clock_raw_monotonic; } && PERF_SAMPLE_CLOCK_RAW_MONOTONIC + * + * # + * # Contents of USERSPACE_EVENT sample data depend on its type. + * # + * # Type 0 means that the data is a zero-terminated string that + * # can be printf-ed in the normal way. + * # + * # Meaning of other type values depends on the userspace + * # and the perf tool code contains a list of those with + * # reference implementations of parsers. + * # + * # Overall size of the sample (including type and size fields) + * # is always aligned to 8 bytes by adding padding after + * # the data. + * # + * { u32 type; + * u32 size; + * char data[size]; + * char __padding[] } && PERF_SAMPLE_USERSPACE_EVENT * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index f6df547..11bf1be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3655,6 +3655,8 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_sw_userspace_entry(struct perf_event *event, + struct perf_event_userspace __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -3709,6 +3711,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_USERSPACE: + return perf_sw_userspace_entry(event, + (struct 
perf_event_userspace __user *)arg); + default: return -ENOTTY; } @@ -3728,6 +3734,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, switch (_IOC_NR(cmd)) { case _IOC_NR(PERF_EVENT_IOC_SET_FILTER): case _IOC_NR(PERF_EVENT_IOC_ID): + case _IOC_NR(PERF_EVENT_IOC_USERSPACE): /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */ if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) { cmd &= ~IOCSIZE_MASK; @@ -4727,6 +4734,16 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC) perf_output_put(handle, data->clock_raw_monotonic); + if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) { + int size = data->user_entry->size; + int padding = ALIGN(size, sizeof(u64)) - size; + + perf_output_put(handle, data->user_entry->type); + perf_output_put(handle, size); + __output_copy(handle, data->user_entry->data, size); + perf_output_skip(handle, padding); + }; + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4834,6 +4851,24 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) { + int size = data->user_entry->size; + + /* + * Type 0 means zero-terminated string; + * make sure it is terminated + */ + if (!data->user_entry->type) + data->user_entry->data[size - 1] = '\0'; + + /* + * The sample consist of 'type' and 'size' u32 fields + * followed with data and padding aligning it to 8 bytes. 
+ */ + header->size += sizeof(u32) + sizeof(u32) + + ALIGN(size, sizeof(u64)); + } } static void perf_event_output(struct perf_event *event, @@ -5961,6 +5996,39 @@ static struct pmu perf_swevent = { .event_idx = perf_swevent_event_idx, }; +static int perf_sw_userspace_entry(struct perf_event *event, + struct perf_event_userspace __user *arg) +{ + u32 size; + struct perf_sample_data data; + struct pt_regs *regs = current_pt_regs(); + struct perf_userspace_entry *entry; + + if (!arg) + return -EINVAL; + + if (!static_key_false(&perf_swevent_enabled[ + PERF_COUNT_SW_USERSPACE_EVENT])) + return 0; + + BUILD_BUG_ON(sizeof(size) != sizeof(arg->size)); + if (copy_from_user(&size, &arg->size, sizeof(size)) != 0) + return -EFAULT; + + BUILD_BUG_ON(sizeof(*arg) != sizeof(*entry)); + entry = memdup_user(arg, sizeof(*arg) + size); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + perf_sample_data_init(&data, 0, 0); + data.user_entry = entry; + perf_event_output(event, &data, regs); + + kfree(entry); + + return 0; +} + #ifdef CONFIG_EVENT_TRACING static int perf_tp_filter_match(struct perf_event *event, -- 1.9.1 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html