[PATCH 2/2] perf: Userspace software event and ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds a PERF_COUNT_SW_USERSPACE_EVENT type,
which can be generated by userspace with the
PERF_EVENT_IOC_USERSPACE ioctl command, which injects an
event of said type into the perf buffer.

The ioctl takes a pointer to struct perf_event_userspace
as an argument. The structure begins with a 32-bit
integer type value, which determines the meaning of the
following content (size/data pair). Type 0 is defined
as a zero-terminated string; other types are defined by
userspace (the perf tool will contain a list of
known values with reference implementations of data
content parsers).

Possible use cases for this feature:

- "perf_printf" like mechanism to add logging messages
  to one's perf session; an example implementation:

	/*
	 * Format a message and inject it into the perf buffer as a type 0
	 * (zero-terminated string) userspace event on perf_fd.
	 *
	 * Returns the number of characters in the message (excluding the
	 * terminating '\0') on success, or a negative value on failure.
	 */
	int perf_printf(int perf_fd, const char *fmt, ...)
	{
	        struct perf_event_userspace *event;
	        int size;
	        va_list ap;
	        int err;

	        /* First pass only measures the formatted length */
	        va_start(ap, fmt);
	        size = vsnprintf(NULL, 0, fmt, ap);
	        va_end(ap);

	        if (size < 0)
	                return -1;
	        size++; /* room for the terminating '\0' */

	        event = malloc(sizeof(*event) + size);
	        if (!event)
	                return -1;

	        event->type = 0;
	        event->size = size;

	        /*
	         * A va_list is indeterminate once vsnprintf() has consumed
	         * it, so it must be restarted before the second pass.
	         */
	        va_start(ap, fmt);
	        vsnprintf((char *)event->data, size, fmt, ap);
	        va_end(ap);

	        err = ioctl(perf_fd, PERF_EVENT_IOC_USERSPACE, event);

	        free(event);

	        return err < 0 ? err : size - 1;
	}

- "perf_printf" used by the perf trace tool,
  where certain calls of the traced process are intercepted
  (e.g. using LD_PRELOAD) and treated as logging
  requests, with its output redirected into the
  perf buffer

- synchronisation of performance data generated in
  user space with the perf stream coming from the kernel.
  For example, the marker can be inserted by a JIT engine
  after it has generated a portion of the code, but before
  the code is executed for the first time, allowing the
  post-processor to pick the correct debugging
  information.

- another example is a system profiling tool taking data
  from sources other than just perf, which generates a marker
  at the beginning and at the end of the session
  (also possibly periodically during the session) to
  synchronise kernel timestamps with clock values
  obtained in userspace (gtod or raw_monotonic).

Signed-off-by: Pawel Moll <pawel.moll@xxxxxxx>
---
 include/linux/perf_event.h      |  8 +++++
 include/uapi/linux/perf_event.h | 34 ++++++++++++++++++++-
 kernel/events/core.c            | 68 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 28b73b2..d904d31 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -64,6 +64,12 @@ struct perf_raw_record {
 	void				*data;
 };
 
+struct perf_userspace_entry {	/* kernel copy of struct perf_event_userspace */
+	u32				type;	/* 0: zero-terminated string */
+	u32				size;	/* number of bytes in data[] */
+	u8				data[0];
+};
+
 /*
  * branch stack layout:
  *  nr: number of taken branches stored in entries[]
@@ -604,6 +610,8 @@ struct perf_sample_data {
 	u64				txn;
 	/* Raw monotonic timestamp, for userspace time correlation */
 	u64				clock_raw_monotonic;
+	/* Userspace-originating event */
+	struct perf_userspace_entry	*user_entry;
 };
 
 static inline void perf_sample_data_init(struct perf_sample_data *data,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5a75c5..37604ae 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -110,6 +110,7 @@ enum perf_sw_ids {
 	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 	PERF_COUNT_SW_DUMMY			= 9,
+	PERF_COUNT_SW_USERSPACE_EVENT		= 10,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
@@ -138,8 +139,9 @@ enum perf_event_sample_format {
 	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
 	PERF_SAMPLE_TRANSACTION			= 1U << 17,
 	PERF_SAMPLE_CLOCK_RAW_MONOTONIC		= 1U << 18,
+	PERF_SAMPLE_USERSPACE_EVENT		= 1U << 19,
 
-	PERF_SAMPLE_MAX = 1U << 19,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
 };
 
 /*
@@ -337,6 +339,15 @@ struct perf_event_attr {
 	__u32	__reserved_2;
 };
 
+/*
+ * Userspace-originating event to be generated with PERF_EVENT_IOC_USERSPACE
+ */
+struct perf_event_userspace {
+	__u32	type;		/* 0: zero-terminated string; others defined by userspace */
+	__u32	size;		/* number of bytes in data[] */
+	__u8	data[0];	/* payload, interpreted according to type */
+};
+
 #define perf_flags(attr)	(*(&(attr)->read_format + 1))
 
 /*
@@ -350,6 +361,8 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
+#define PERF_EVENT_IOC_USERSPACE	_IOR('$', 8, \
+						struct perf_event_userspace *)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -688,6 +701,25 @@ enum perf_event_type {
 	 *	{ u64			data_src; } && PERF_SAMPLE_DATA_SRC
 	 *	{ u64			transaction; } && PERF_SAMPLE_TRANSACTION
 	 *	{ u64			clock_raw_monotonic; } && PERF_SAMPLE_CLOCK_RAW_MONOTONIC
+	 *
+	 *	#
+	 *	# Contents of USERSPACE_EVENT sample data depend on its type.
+	 *	#
+	 *	# Type 0 means that the data is a zero-terminated string that
+	 *	# can be printf-ed in the normal way.
+	 *	#
+	 *	# Meaning of other type values depends on the userspace
+	 *	# and the perf tool code contains a list of those with
+	 *	# reference implementations of parsers.
+	 *	#
+	 *	# Overall size of the sample (including type and size fields)
+	 *	# is always aligned to 8 bytes by adding padding after
+	 *	# the data.
+	 *	#
+	 *	{ u32			type;
+	 *	  u32			size;
+	 *	  char			data[size];
+	 *	  char                  __padding[] } && PERF_SAMPLE_USERSPACE_EVENT
 	 * };
 	 */
 	PERF_RECORD_SAMPLE			= 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f6df547..11bf1be 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3655,6 +3655,8 @@ static inline int perf_fget_light(int fd, struct fd *p)
 static int perf_event_set_output(struct perf_event *event,
 				 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+static int perf_sw_userspace_entry(struct perf_event *event,
+	       struct perf_event_userspace __user *arg);
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
@@ -3709,6 +3711,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_EVENT_IOC_SET_FILTER:
 		return perf_event_set_filter(event, (void __user *)arg);
 
+	case PERF_EVENT_IOC_USERSPACE:
+		return perf_sw_userspace_entry(event,
+				(struct perf_event_userspace __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -3728,6 +3734,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
 	switch (_IOC_NR(cmd)) {
 	case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
 	case _IOC_NR(PERF_EVENT_IOC_ID):
+	case _IOC_NR(PERF_EVENT_IOC_USERSPACE):
 		/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
 		if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
 			cmd &= ~IOCSIZE_MASK;
@@ -4727,6 +4734,16 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_CLOCK_RAW_MONOTONIC)
 		perf_output_put(handle, data->clock_raw_monotonic);
 
+	if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+		int size = data->user_entry->size;
+		int padding = ALIGN(size, sizeof(u64)) - size;
+
+		perf_output_put(handle, data->user_entry->type);
+		perf_output_put(handle, size);
+		__output_copy(handle, data->user_entry->data, size);
+		perf_output_skip(handle, padding);
+	};
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -4834,6 +4851,24 @@ void perf_prepare_sample(struct perf_event_header *header,
 		data->stack_user_size = stack_size;
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_USERSPACE_EVENT) {
+		int size = data->user_entry->size;
+
+		/*
+		 * Type 0 means zero-terminated string;
+		 * make sure it is terminated
+		 */
+		if (!data->user_entry->type)
+			data->user_entry->data[size - 1] = '\0';
+
+		/*
+		 * The sample consist of 'type' and 'size' u32 fields
+		 * followed with data and padding aligning it to 8 bytes.
+		 */
+		header->size += sizeof(u32) + sizeof(u32) +
+				ALIGN(size, sizeof(u64));
+	}
 }
 
 static void perf_event_output(struct perf_event *event,
@@ -5961,6 +5996,39 @@ static struct pmu perf_swevent = {
 	.event_idx	= perf_swevent_event_idx,
 };
 
+/*
+ * Emit the user entry at @arg as a sample on @event; returns 0 or -errno.
+ */
+static int perf_sw_userspace_entry(struct perf_event *event,
+	       struct perf_event_userspace __user *arg)
+{
+	struct perf_userspace_entry *entry;
+	struct perf_sample_data data;
+	u32 size;
+
+	if (!arg)
+		return -EINVAL;
+	if (!static_key_false(&perf_swevent_enabled[
+				PERF_COUNT_SW_USERSPACE_EVENT]))
+		return 0;
+
+	BUILD_BUG_ON(sizeof(size) != sizeof(arg->size));
+	BUILD_BUG_ON(sizeof(*arg) != sizeof(*entry));
+	if (copy_from_user(&size, &arg->size, sizeof(size)) != 0)
+		return -EFAULT;
+	entry = memdup_user(arg, sizeof(*arg) + size);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+	/* @arg may change between the two reads; keep the size we allocated */
+	entry->size = size;
+
+	perf_sample_data_init(&data, 0, 0);
+	data.user_entry = entry;
+	perf_event_output(event, &data, current_pt_regs());
+	kfree(entry);
+	return 0;
+}
+
 #ifdef CONFIG_EVENT_TRACING
 
 static int perf_tp_filter_match(struct perf_event *event,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux