From: Sourab Gupta <sourab.gupta@xxxxxxxxx> This patch introduces a framework to capture OA counter reports associated with the Render command stream. We can then associate the reports captured through this mechanism with their corresponding context IDs. This can be further extended to associate any other metadata information with the corresponding samples (since the association with the Render command stream gives us the ability to capture this information while inserting the corresponding capture commands into the command stream). The OA reports generated in this way are associated with a corresponding workload, and thus can be used to delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots. There may be use cases wherein we need more than the periodic OA capture mode which is supported currently. This mode is primarily used for two use cases: - Ability to capture system-wide metrics, along with the ability to map the reports back to individual contexts (particularly for HSW). - Ability to inject tags for work, into the reports. This provides visibility into the multiple stages of work within a single context. The userspace will be able to distinguish between the periodic and CS based OA reports by virtue of the source_info sample field. The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA counters, and is inserted at BB boundaries. The data thus captured will be stored in a separate buffer, which will be different from the buffer used otherwise for the periodic OA capture mode. The metadata information pertaining to a snapshot is maintained in a list, which also has offsets into the gem buffer object per captured snapshot. In order to track whether the gpu has completed processing the node, a field pertaining to the corresponding gem request is added, which is tracked for completion of the command. 
Both periodic and RCS based reports are associated with a single stream (corresponding to render engine), and it is expected to have the samples in the sequential order according to their timestamps. Now, since these reports are collected in separate buffers, these are merge sorted at the time of forwarding to userspace during the read call. v2: Aligining with the non-perf interface (custom drm ioctl based). Also, few related patches are squashed together for better readability Signed-off-by: Sourab Gupta <sourab.gupta@xxxxxxxxx> Signed-off-by: Robert Bragg <robert@xxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 88 ++- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 888 ++++++++++++++++++++++++----- include/uapi/drm/i915_drm.h | 15 + 4 files changed, 861 insertions(+), 134 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index ef104ff5..0b70052 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1972,6 +1972,23 @@ struct i915_perf_stream_ops { * The stream will always be disabled before this is called. */ void (*destroy)(struct i915_perf_stream *stream); + + /* + * @command_stream_hook: Emit the commands in the command streamer + * for a particular gpu engine. + * + * The commands are inserted to capture the perf sample data at + * specific points during workload execution, such as before and after + * the batch buffer. 
+ */ + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *request); +}; + +enum i915_perf_stream_state { + I915_PERF_STREAM_DISABLED, + I915_PERF_STREAM_ENABLE_IN_PROGRESS, + I915_PERF_STREAM_ENABLED, }; /** @@ -1989,6 +2006,10 @@ struct i915_perf_stream { struct list_head link; /** + * @engine: GPU engine associated with this particular stream + */ + enum intel_engine_id engine; + /** * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*` * properties given when opening a stream, representing the contents * of a single sample as read() by userspace. @@ -2009,11 +2030,25 @@ struct i915_perf_stream { struct i915_gem_context *ctx; /** - * @enabled: Whether the stream is currently enabled, considering - * whether the stream was opened in a disabled state and based - * on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls. + * @state: Current stream state, which can be either disabled, enabled, + * or enable_in_progress, while considering whether the stream was + * opened in a disabled state and based on `I915_PERF_IOCTL_ENABLE` and + * `I915_PERF_IOCTL_DISABLE` calls. */ - bool enabled; + enum i915_perf_stream_state state; + + /** + * @cs_mode: Whether command stream based perf sample collection is + * enabled for this stream + */ + bool cs_mode; + + /** + * @last_request: Contains an RCU guarded pointer to the last request + * associated with the stream, when command stream mode is enabled for + * the stream. 
+ */ + struct i915_gem_active last_request; /** * @ops: The callbacks providing the implementation of this specific @@ -2074,7 +2109,8 @@ struct i915_oa_ops { int (*read)(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset); + size_t *offset, + u32 ts); /** * @oa_buffer_check: Check for OA buffer data + update tail @@ -2093,6 +2129,36 @@ struct i915_oa_ops { bool (*oa_buffer_check)(struct drm_i915_private *dev_priv); }; +/* + * i915_perf_cs_sample - Sample element to hold info about a single perf + * sample data associated with a particular GPU command stream. + */ +struct i915_perf_cs_sample { + /** + * @link: Links the sample into ``&drm_i915_private->perf.cs_samples`` + */ + struct list_head link; + + /** + * @request: Gem request associated with the sample. The commands to + * capture the perf metrics are inserted into the command streamer in + * context of this request. + */ + struct drm_i915_gem_request *request; + + /** + * @offset: Offset into ``&drm_i915_private->perf.command_stream_buf`` + * where the perf metrics will be collected, when the commands inserted + * into the command stream are executed by GPU. 
+ */ + u32 offset; + + /** + * @ctx_id: Context ID associated with this perf sample + */ + u32 ctx_id; +}; + struct intel_cdclk_state { unsigned int cdclk, vco, ref; }; @@ -2421,6 +2487,8 @@ struct drm_i915_private { struct ctl_table_header *sysctl_header; struct mutex lock; + + struct mutex streams_lock; struct list_head streams; spinlock_t hook_lock; @@ -2532,6 +2600,15 @@ struct drm_i915_private { const struct i915_oa_format *oa_formats; int n_builtin_sets; } oa; + + /* Command stream based perf data buffer */ + struct { + struct i915_vma *vma; + u8 *vaddr; + } command_stream_buf; + + struct list_head cs_samples; + spinlock_t sample_lock; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ @@ -3593,6 +3670,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine, void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct i915_gem_context *ctx, uint32_t *reg_state); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index aa75ea2..7af32c97 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1441,12 +1441,16 @@ static void eb_export_fence(struct drm_i915_gem_object *obj, if (exec_len == 0) exec_len = params->batch->size - params->args_batch_start_offset; + i915_perf_command_stream_hook(params->request); + ret = params->engine->emit_bb_start(params->request, exec_start, exec_len, params->dispatch_flags); if (ret) return ret; + i915_perf_command_stream_hook(params->request); + i915_gem_execbuffer_move_to_active(vmas, params->request); return 0; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 540c5b2..3321dad 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -285,6 +285,12 
@@ #define OAREPORT_REASON_CTX_SWITCH (1<<3) #define OAREPORT_REASON_CLK_RATIO (1<<5) +/* Data common to periodic and RCS based OA samples */ +struct oa_sample_data { + u32 source; + u32 ctx_id; + const u8 *report; +}; /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate * @@ -323,8 +329,19 @@ [I915_OA_FORMAT_C4_B8] = { 7, 64 }, }; +/* Duplicated from similar static enum in i915_gem_execbuffer.c */ +#define I915_USER_RINGS (4) +static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { + [I915_EXEC_DEFAULT] = RCS, + [I915_EXEC_RENDER] = RCS, + [I915_EXEC_BLT] = BCS, + [I915_EXEC_BSD] = VCS, + [I915_EXEC_VEBOX] = VECS +}; + #define SAMPLE_OA_REPORT (1<<0) #define SAMPLE_OA_SOURCE_INFO (1<<1) +#define SAMPLE_CTX_ID (1<<2) /** * struct perf_open_properties - for validated properties given to open a stream @@ -335,6 +352,9 @@ * @oa_format: An OA unit HW report format * @oa_periodic: Whether to enable periodic OA unit sampling * @oa_period_exponent: The OA unit sampling period is derived from this + * @cs_mode: Whether the stream is configured to enable collection of metrics + * associated with command stream of a particular GPU engine + * @engine: The GPU engine associated with the stream in case cs_mode is enabled * * As read_properties_unlocked() enumerates and validates the properties given * to open a stream of metrics the configuration is built up in the structure @@ -351,9 +371,218 @@ struct perf_open_properties { int oa_format; bool oa_periodic; int oa_period_exponent; + + /* Command stream mode */ + bool cs_mode; + enum intel_engine_id engine; }; /** + * i915_perf_command_stream_hook - Insert the commands to capture metrics into the + * command stream of a GPU engine. + * @request: request in whose context the metrics are being collected. + * + * The function provides a hook through which the commands to capture perf + * metrics, are inserted into the command stream of a GPU engine. 
+ */ +void i915_perf_command_stream_hook(struct drm_i915_gem_request *request) +{ + struct intel_engine_cs *engine = request->engine; + struct drm_i915_private *dev_priv = engine->i915; + struct i915_perf_stream *stream; + + if (!dev_priv->perf.initialized) + return; + + mutex_lock(&dev_priv->perf.streams_lock); + list_for_each_entry(stream, &dev_priv->perf.streams, link) { + if ((stream->state == I915_PERF_STREAM_ENABLED) && + stream->cs_mode) + stream->ops->command_stream_hook(stream, request); + } + mutex_unlock(&dev_priv->perf.streams_lock); +} + +/** + * release_perf_samples - Release old perf samples to make space for new + * sample data. + * @dev_priv: i915 device private + * @target_size: Space required to be freed up. + * + * We also dereference the associated request before deleting the sample. + * Also, no need to check whether the commands associated with old samples + * have been completed. This is because these sample entries are anyways going + * to be replaced by a new sample, and gpu will eventually overwrite the buffer + * contents, when the request associated with new sample completes. + */ +static void release_perf_samples(struct drm_i915_private *dev_priv, + u32 target_size) +{ + struct i915_perf_cs_sample *sample, *next; + u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; + u32 size = 0; + + list_for_each_entry_safe + (sample, next, &dev_priv->perf.cs_samples, link) { + + size += sample_size; + i915_gem_request_put(sample->request); + list_del(&sample->link); + kfree(sample); + + if (size >= target_size) + break; + } +} + +/** + * insert_perf_sample - Insert a perf sample entry to the sample list. + * @dev_priv: i915 device private + * @sample: perf CS sample to be inserted into the list + * + * This function never fails, since it always manages to insert the sample. + * If the space is exhausted in the buffer, it will remove the older + * entries in order to make space. 
+ */ +static void insert_perf_sample(struct drm_i915_private *dev_priv, + struct i915_perf_cs_sample *sample) +{ + struct i915_perf_cs_sample *first, *last; + int max_offset = dev_priv->perf.command_stream_buf.vma->obj->base.size; + u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; + + spin_lock(&dev_priv->perf.sample_lock); + if (list_empty(&dev_priv->perf.cs_samples)) { + sample->offset = 0; + list_add_tail(&sample->link, &dev_priv->perf.cs_samples); + spin_unlock(&dev_priv->perf.sample_lock); + return; + } + + first = list_first_entry(&dev_priv->perf.cs_samples,typeof(*first), + link); + last = list_last_entry(&dev_priv->perf.cs_samples, typeof(*last), + link); + + if (last->offset >= first->offset) { + /* Sufficient space available at the end of buffer? */ + if (last->offset + 2*sample_size < max_offset) + sample->offset = last->offset + sample_size; + /* + * Wraparound condition. Is sufficient space available at + * beginning of buffer? + */ + else if (sample_size < first->offset) + sample->offset = 0; + /* Insufficient space. Overwrite existing old entries */ + else { + u32 target_size = sample_size - first->offset; + + release_perf_samples(dev_priv, target_size); + sample->offset = 0; + } + } else { + /* Sufficient space available? */ + if (last->offset + 2*sample_size < first->offset) + sample->offset = last->offset + sample_size; + /* Insufficient space. Overwrite existing old entries */ + else { + u32 target_size = sample_size - + (first->offset - last->offset - + sample_size); + + release_perf_samples(dev_priv, target_size); + sample->offset = last->offset + sample_size; + } + } + list_add_tail(&sample->link, &dev_priv->perf.cs_samples); + spin_unlock(&dev_priv->perf.sample_lock); +} + +/** + * i915_perf_command_stream_hook_oa - Insert the commands to capture OA reports + * metrics into the render command stream + * @stream: An i915-perf stream opened for OA metrics + * @request: request in whose context the metrics are being collected. 
+ * + */ +static void i915_perf_command_stream_hook_oa(struct i915_perf_stream *stream, + struct drm_i915_gem_request *request) +{ + struct drm_i915_private *dev_priv = request->i915; + struct i915_gem_context *ctx = request->ctx; + struct i915_perf_cs_sample *sample; + u32 addr = 0; + u32 cmd, *cs; + + sample = kzalloc(sizeof(*sample), GFP_KERNEL); + if (sample == NULL) { + DRM_ERROR("Perf sample alloc failed\n"); + return; + } + + cs = intel_ring_begin(request, 4); + if (IS_ERR(cs)) { + kfree(sample); + return; + } + + sample->ctx_id = ctx->hw_id; + i915_gem_request_assign(&sample->request, request); + + insert_perf_sample(dev_priv, sample); + + addr = dev_priv->perf.command_stream_buf.vma->node.start + + sample->offset; + + if (WARN_ON(addr & 0x3f)) { + DRM_ERROR("OA buffer address not aligned to 64 byte"); + return; + } + + cmd = MI_REPORT_PERF_COUNT | (1<<0); + if (INTEL_GEN(dev_priv) >= 8) + cmd |= (2<<0); + + *cs++ = cmd; + *cs++ = addr | MI_REPORT_PERF_COUNT_GGTT; + *cs++ = request->global_seqno; + + if (INTEL_GEN(dev_priv) >= 8) + *cs++ = 0; + else + *cs++ = MI_NOOP; + + intel_ring_advance(request, cs); + + i915_gem_active_set(&stream->last_request, request); + i915_vma_move_to_active(dev_priv->perf.command_stream_buf.vma, request, + EXEC_OBJECT_WRITE); +} + +/** + * i915_oa_rcs_release_samples - Release the perf command stream samples + * @dev_priv: i915 device private + * + * Note: The associated requests should be completed before releasing the + * references here. 
+ */ +static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv) +{ + struct i915_perf_cs_sample *entry, *next; + + list_for_each_entry_safe + (entry, next, &dev_priv->perf.cs_samples, link) { + i915_gem_request_put(entry->request); + + spin_lock(&dev_priv->perf.sample_lock); + list_del(&entry->link); + spin_unlock(&dev_priv->perf.sample_lock); + kfree(entry); + } +} + +/** * gen8_oa_buffer_check_unlocked - check for data and update tail ptr state * @dev_priv: i915 device instance * @@ -626,7 +855,8 @@ static int append_oa_status(struct i915_perf_stream *stream, * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf - * @report: A single OA report to (optionally) include as part of the sample + * @data: OA sample data, which contains (optionally) OA report with other + * sample metadata. * * The contents of a sample are configured through `DRM_I915_PERF_PROP_SAMPLE_*` * properties when opening a stream, tracked as `stream->sample_flags`. 
This @@ -641,7 +871,7 @@ static int append_oa_sample(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset, - const u8 *report) + const struct oa_sample_data *data) { struct drm_i915_private *dev_priv = stream->dev_priv; int report_size = dev_priv->perf.oa.oa_buffer.format_size; @@ -661,17 +891,21 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += sizeof(header); if (sample_flags & SAMPLE_OA_SOURCE_INFO) { - enum drm_i915_perf_oa_event_source source = - I915_PERF_OA_EVENT_SOURCE_PERIODIC; + if (copy_to_user(buf, &data->source, 4)) + return -EFAULT; + buf += 4; + } - if (copy_to_user(buf, &source, 4)) + if (sample_flags & SAMPLE_CTX_ID) { + if (copy_to_user(buf, &data->ctx_id, 4)) return -EFAULT; buf += 4; } if (sample_flags & SAMPLE_OA_REPORT) { - if (copy_to_user(buf, report, report_size)) + if (copy_to_user(buf, data->report, report_size)) return -EFAULT; + buf += report_size; } (*offset) += header.size; @@ -680,11 +914,47 @@ static int append_oa_sample(struct i915_perf_stream *stream, } /** + * append_oa_buffer_sample - Copies single periodic OA report into userspace + * read() buffer. + * @stream: An i915-perf stream opened for OA metrics + * @buf: destination buffer given by userspace + * @count: the number of bytes userspace wants to read + * @offset: (inout): the current position for writing into @buf + * @report: A single OA report to (optionally) include as part of the sample + * + * Returns: 0 on success, negative error code on failure. 
+ */ +static int append_oa_buffer_sample(struct i915_perf_stream *stream, + char __user *buf, size_t count, + size_t *offset, const u8 *report) +{ + u32 sample_flags = stream->sample_flags; + struct oa_sample_data data = { 0 }; + + if (sample_flags & SAMPLE_OA_SOURCE_INFO) + data.source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + /* + * FIXME: append_oa_buffer_sample: read ctx ID from report and map + * that to a intel_context::hw_id" + */ + if (sample_flags & SAMPLE_CTX_ID) + data.ctx_id = 0; + + if (sample_flags & SAMPLE_OA_REPORT) + data.report = report; + + return append_oa_sample(stream, buf, count, offset, &data); +} + + +/** * Copies all buffered OA reports into userspace read() buffer. * @stream: An i915-perf stream opened for OA metrics * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf + * @ts: copy OA reports till this timestamp * * Notably any error condition resulting in a short read (-%ENOSPC or * -%EFAULT) will be returned even though one or more records may @@ -702,7 +972,8 @@ static int append_oa_sample(struct i915_perf_stream *stream, static int gen8_append_oa_reports(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset) + size_t *offset, + u32 ts) { struct drm_i915_private *dev_priv = stream->dev_priv; int report_size = dev_priv->perf.oa.oa_buffer.format_size; @@ -716,7 +987,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, u32 taken; int ret = 0; - if (WARN_ON(!stream->enabled)) + if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED)) return -EIO; spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags); @@ -759,6 +1030,11 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, u32 *report32 = (void *)report; u32 ctx_id; u32 reason; + u32 report_ts = report32[1]; + + /* Report timestamp should not exceed the given ts */ + if (report_ts > ts) + break; /* All the 
report sizes factor neatly into the buffer * size so we never expect to see a report split @@ -846,8 +1122,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, report32[2] = INVALID_CTX_ID; } - ret = append_oa_sample(stream, buf, count, offset, - report); + ret = append_oa_buffer_sample(stream, buf, count, + offset, report); if (ret) break; @@ -886,6 +1162,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf + * @ts: copy OA reports till this timestamp * * Checks OA unit status registers and if necessary appends corresponding * status records for userspace (such as for a buffer full condition) and then @@ -903,7 +1180,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, static int gen8_oa_read(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset) + size_t *offset, + u32 ts) { struct drm_i915_private *dev_priv = stream->dev_priv; u32 oastatus; @@ -953,7 +1231,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream, } } - return gen8_append_oa_reports(stream, buf, count, offset); + return gen8_append_oa_reports(stream, buf, count, offset, ts); } /** @@ -962,6 +1240,7 @@ static int gen8_oa_read(struct i915_perf_stream *stream, * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf + * @ts: copy OA reports till this timestamp * * Notably any error condition resulting in a short read (-%ENOSPC or * -%EFAULT) will be returned even though one or more records may @@ -979,7 +1258,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream, static int gen7_append_oa_reports(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset) + size_t *offset, + u32 ts) { struct drm_i915_private *dev_priv 
= stream->dev_priv; int report_size = dev_priv->perf.oa.oa_buffer.format_size; @@ -993,7 +1273,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, u32 taken; int ret = 0; - if (WARN_ON(!stream->enabled)) + if (WARN_ON(stream->state != I915_PERF_STREAM_ENABLED)) return -EIO; spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags); @@ -1060,7 +1340,11 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, continue; } - ret = append_oa_sample(stream, buf, count, offset, report); + /* Report timestamp should not exceed the given ts */ + if (report32[1] > ts) + break; + + ret = append_oa_buffer_sample(stream, buf, count, offset, report); if (ret) break; @@ -1099,6 +1383,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, * @buf: destination buffer given by userspace * @count: the number of bytes userspace wants to read * @offset: (inout): the current position for writing into @buf + * @ts: copy OA reports till this timestamp * * Checks Gen 7 specific OA unit status registers and if necessary appends * corresponding status records for userspace (such as for a buffer full @@ -1112,7 +1397,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, static int gen7_oa_read(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset) + size_t *offset, + u32 ts) { struct drm_i915_private *dev_priv = stream->dev_priv; u32 oastatus1; @@ -1174,7 +1460,159 @@ static int gen7_oa_read(struct i915_perf_stream *stream, GEN7_OASTATUS1_REPORT_LOST; } - return gen7_append_oa_reports(stream, buf, count, offset); + return gen7_append_oa_reports(stream, buf, count, offset, ts); +} + +/** + * append_oa_rcs_sample - Copies single RCS based OA report into userspace + * read() buffer. 
+ * @stream: An i915-perf stream opened for OA metrics + * @buf: destination buffer given by userspace + * @count: the number of bytes userspace wants to read + * @offset: (inout): the current position for writing into @buf + * @node: Sample data associated with RCS based OA report + * + * Returns: 0 on success, negative error code on failure. + */ +static int append_oa_rcs_sample(struct i915_perf_stream *stream, + char __user *buf, + size_t count, + size_t *offset, + struct i915_perf_cs_sample *node) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + struct oa_sample_data data = { 0 }; + const u8 *report = dev_priv->perf.command_stream_buf.vaddr + + node->offset; + u32 sample_flags = stream->sample_flags; + u32 report_ts; + int ret; + + /* First, append the periodic OA samples having lower timestamps */ + report_ts = *(u32 *)(report + 4); + ret = dev_priv->perf.oa.ops.read(stream, buf, count, offset, report_ts); + if (ret) + return ret; + + if (sample_flags & SAMPLE_OA_SOURCE_INFO) + data.source = I915_PERF_OA_EVENT_SOURCE_RCS; + + if (sample_flags & SAMPLE_CTX_ID) + data.ctx_id = node->ctx_id; + + if (sample_flags & SAMPLE_OA_REPORT) + data.report = report; + + return append_oa_sample(stream, buf, count, offset, &data); +} + +/** + * oa_rcs_append_reports: Copies all comand stream based OA reports into + * userspace read() buffer. + * @stream: An i915-perf stream opened for OA metrics + * @buf: destination buffer given by userspace + * @count: the number of bytes userspace wants to read + * @offset: (inout): the current position for writing into @buf + * + * Notably any error condition resulting in a short read (-%ENOSPC or + * -%EFAULT) will be returned even though one or more records may + * have been successfully copied. In this case it's up to the caller + * to decide if the error should be squashed before returning to + * userspace. + * + * Returns: 0 on success, negative error code on failure. 
+ */ +static int oa_rcs_append_reports(struct i915_perf_stream *stream, + char __user *buf, + size_t count, + size_t *offset) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + struct i915_perf_cs_sample *entry, *next; + LIST_HEAD(free_list); + int ret = 0; + + spin_lock(&dev_priv->perf.sample_lock); + if (list_empty(&dev_priv->perf.cs_samples)) { + spin_unlock(&dev_priv->perf.sample_lock); + return 0; + } + list_for_each_entry_safe(entry, next, + &dev_priv->perf.cs_samples, link) { + if (!i915_gem_request_completed(entry->request)) + break; + list_move_tail(&entry->link, &free_list); + } + spin_unlock(&dev_priv->perf.sample_lock); + + if (list_empty(&free_list)) + return 0; + + list_for_each_entry_safe(entry, next, &free_list, link) { + ret = append_oa_rcs_sample(stream, buf, count, offset, entry); + if (ret) + break; + + list_del(&entry->link); + i915_gem_request_put(entry->request); + kfree(entry); + } + + /* Don't discard remaining entries, keep them for next read */ + spin_lock(&dev_priv->perf.sample_lock); + list_splice(&free_list, &dev_priv->perf.cs_samples); + spin_unlock(&dev_priv->perf.sample_lock); + + return ret; +} + +/* + * command_stream_buf_is_empty - Checks whether the command stream buffer + * associated with the stream has data available. + * @stream: An i915-perf stream opened for OA metrics + * + * Returns: true if atleast one request associated with command stream is + * completed, else returns false. 
+ */ +static bool command_stream_buf_is_empty(struct i915_perf_stream *stream) + +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + struct i915_perf_cs_sample *entry = NULL; + struct drm_i915_gem_request *request = NULL; + + spin_lock(&dev_priv->perf.sample_lock); + entry = list_first_entry_or_null(&dev_priv->perf.cs_samples, + struct i915_perf_cs_sample, link); + if (entry) + request = entry->request; + spin_unlock(&dev_priv->perf.sample_lock); + + if (!entry) + return true; + else if (!i915_gem_request_completed(request)) + return true; + else + return false; +} + +/** + * stream_have_data_unlocked - Checks whether the stream has data available + * @stream: An i915-perf stream opened for OA metrics + * + * For command stream based streams, check if the command stream buffer has + * atleast one sample available, if not return false, irrespective of periodic + * oa buffer having the data or not. + */ + +static bool stream_have_data_unlocked(struct i915_perf_stream *stream) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (stream->cs_mode) + return !command_stream_buf_is_empty(stream); + else + return dev_priv->perf.oa.ops.oa_buffer_check(dev_priv); } /** @@ -1199,8 +1637,22 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream) if (!dev_priv->perf.oa.periodic) return -EIO; + if (stream->cs_mode) { + int ret; + + /* + * Wait for the last submitted 'active' request. 
+ */ + ret = i915_gem_active_wait(&stream->last_request, + I915_WAIT_INTERRUPTIBLE); + if (ret) { + DRM_ERROR("Failed to wait for stream active request\n"); + return ret; + } + } + return wait_event_interruptible(dev_priv->perf.oa.poll_wq, - dev_priv->perf.oa.ops.oa_buffer_check(dev_priv)); + stream_have_data_unlocked(stream)); } /** @@ -1241,7 +1693,12 @@ static int i915_oa_read(struct i915_perf_stream *stream, { struct drm_i915_private *dev_priv = stream->dev_priv; - return dev_priv->perf.oa.ops.read(stream, buf, count, offset); + + if (stream->cs_mode) + return oa_rcs_append_reports(stream, buf, count, offset); + else + return dev_priv->perf.oa.ops.read(stream, buf, count, offset, + U32_MAX); } /** @@ -1315,6 +1772,21 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream) } static void +free_command_stream_buf(struct drm_i915_private *dev_priv) +{ + mutex_lock(&dev_priv->drm.struct_mutex); + + i915_gem_object_unpin_map(dev_priv->perf.command_stream_buf.vma->obj); + i915_vma_unpin(dev_priv->perf.command_stream_buf.vma); + i915_gem_object_put(dev_priv->perf.command_stream_buf.vma->obj); + + dev_priv->perf.command_stream_buf.vma = NULL; + dev_priv->perf.command_stream_buf.vaddr = NULL; + + mutex_unlock(&dev_priv->drm.struct_mutex); +} + +static void free_oa_buffer(struct drm_i915_private *i915) { mutex_lock(&i915->drm.struct_mutex); @@ -1333,7 +1805,8 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - BUG_ON(stream != dev_priv->perf.oa.exclusive_stream); + if (WARN_ON(stream != dev_priv->perf.oa.exclusive_stream)) + return; dev_priv->perf.oa.ops.disable_metric_set(dev_priv); @@ -1345,6 +1818,9 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) if (stream->ctx) oa_put_render_ctx_id(stream); + if (stream->cs_mode) + free_command_stream_buf(dev_priv); + dev_priv->perf.oa.exclusive_stream = NULL; } @@ -1446,25 +1922,24 @@ static void 
gen8_init_oa_buffer(struct drm_i915_private *dev_priv) dev_priv->perf.oa.pollin = false; } -static int alloc_oa_buffer(struct drm_i915_private *dev_priv) +static int alloc_obj(struct drm_i915_private *dev_priv, + struct i915_vma **vma, u8 **vaddr) { struct drm_i915_gem_object *bo; - struct i915_vma *vma; int ret; - if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma)) - return -ENODEV; + intel_runtime_pm_get(dev_priv); ret = i915_mutex_lock_interruptible(&dev_priv->drm); if (ret) - return ret; + goto out; BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE); BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M); bo = i915_gem_object_create(dev_priv, OA_BUFFER_SIZE); if (IS_ERR(bo)) { - DRM_ERROR("Failed to allocate OA buffer\n"); + DRM_ERROR("Failed to allocate i915 perf obj\n"); ret = PTR_ERR(bo); goto unlock; } @@ -1474,42 +1949,83 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv) goto err_unref; /* PreHSW required 512K alignment, HSW requires 16M */ - vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); + *vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0); + if (IS_ERR(*vma)) { + ret = PTR_ERR(*vma); goto err_unref; } - dev_priv->perf.oa.oa_buffer.vma = vma; - dev_priv->perf.oa.oa_buffer.vaddr = - i915_gem_object_pin_map(bo, I915_MAP_WB); - if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) { - ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr); + *vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB); + if (IS_ERR(*vaddr)) { + ret = PTR_ERR(*vaddr); goto err_unpin; } - dev_priv->perf.oa.ops.init_oa_buffer(dev_priv); - - DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p\n", - i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma), - dev_priv->perf.oa.oa_buffer.vaddr); - goto unlock; err_unpin: - __i915_vma_unpin(vma); + i915_vma_unpin(*vma); err_unref: i915_gem_object_put(bo); - dev_priv->perf.oa.oa_buffer.vaddr = NULL; - dev_priv->perf.oa.oa_buffer.vma = NULL; - unlock: 
mutex_unlock(&dev_priv->drm.struct_mutex); +out: + intel_runtime_pm_put(dev_priv); return ret; } +static int alloc_oa_buffer(struct drm_i915_private *dev_priv) +{ + struct i915_vma *vma; + u8 *vaddr; + int ret; + + if (WARN_ON(dev_priv->perf.oa.oa_buffer.vma)) + return -ENODEV; + + ret = alloc_obj(dev_priv, &vma, &vaddr); + if (ret) + return ret; + + dev_priv->perf.oa.oa_buffer.vma = vma; + dev_priv->perf.oa.oa_buffer.vaddr = vaddr; + + dev_priv->perf.oa.ops.init_oa_buffer(dev_priv); + + DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p", + i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma), + dev_priv->perf.oa.oa_buffer.vaddr); + return 0; +} + +static int alloc_command_stream_buf(struct drm_i915_private *dev_priv) +{ + struct i915_vma *vma; + u8 *vaddr; + int ret; + + if (WARN_ON(dev_priv->perf.command_stream_buf.vma)) + return -ENODEV; + + ret = alloc_obj(dev_priv, &vma, &vaddr); + if (ret) + return ret; + + dev_priv->perf.command_stream_buf.vma = vma; + dev_priv->perf.command_stream_buf.vaddr = vaddr; + if (WARN_ON(!list_empty(&dev_priv->perf.cs_samples))) + INIT_LIST_HEAD(&dev_priv->perf.cs_samples); + + DRM_DEBUG_DRIVER( + "command stream buf initialized, gtt offset = 0x%x, vaddr = %p", + i915_ggtt_offset(dev_priv->perf.command_stream_buf.vma), + dev_priv->perf.command_stream_buf.vaddr); + + return 0; +} + static void config_oa_regs(struct drm_i915_private *dev_priv, const struct i915_oa_reg *regs, int n_regs) @@ -1848,7 +2364,8 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.oa.exclusive_stream->enabled) { + if (dev_priv->perf.oa.exclusive_stream->state != + I915_PERF_STREAM_DISABLED) { struct i915_gem_context *ctx = dev_priv->perf.oa.exclusive_stream->ctx; u32 ctx_id = dev_priv->perf.oa.specific_ctx_id; @@ -1954,10 +2471,20 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = 
stream->dev_priv; - dev_priv->perf.oa.ops.oa_disable(dev_priv); - if (dev_priv->perf.oa.periodic) hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer); + + if (stream->cs_mode) { + /* + * Wait for the last submitted 'active' request, before freeing + * the requests associated with the stream. + */ + i915_gem_active_wait(&stream->last_request, + I915_WAIT_INTERRUPTIBLE); + i915_oa_rcs_release_samples(dev_priv); + } + + dev_priv->perf.oa.ops.oa_disable(dev_priv); } static const struct i915_perf_stream_ops i915_oa_stream_ops = { @@ -1967,6 +2494,7 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream) .wait_unlocked = i915_oa_wait_unlocked, .poll_wait = i915_oa_poll_wait, .read = i915_oa_read, + .command_stream_hook = i915_perf_command_stream_hook_oa, }; /** @@ -1992,28 +2520,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct perf_open_properties *props) { struct drm_i915_private *dev_priv = stream->dev_priv; - int format_size; + bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT | + SAMPLE_OA_SOURCE_INFO); + bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; int ret; - /* If the sysfs metrics/ directory wasn't registered for some - * reason then don't let userspace try their luck with config - * IDs - */ - if (!dev_priv->perf.metrics_kobj) { - DRM_DEBUG("OA metrics weren't advertised via sysfs\n"); - return -EINVAL; - } - - if (!(props->sample_flags & SAMPLE_OA_REPORT)) { - DRM_DEBUG("Only OA report sampling supported\n"); - return -EINVAL; - } - - if (!dev_priv->perf.oa.ops.init_oa_buffer) { - DRM_DEBUG("OA unit not supported\n"); - return -ENODEV; - } - /* To avoid the complexity of having to accurately filter * counter reports and marshal to the appropriate client * we currently only allow exclusive access @@ -2023,80 +2534,154 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, return -EBUSY; } - if (!props->metrics_set) { - DRM_DEBUG("OA metric set not specified\n"); - return -EINVAL; 
- } - - if (!props->oa_format) { - DRM_DEBUG("OA report format not specified\n"); - return -EINVAL; + if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) { + if (IS_HASWELL(dev_priv)) { + DRM_ERROR( + "On HSW, context ID sampling only supported via command stream"); + return -EINVAL; + } else if (!i915.enable_execlists) { + DRM_ERROR( + "On Gen8+ without execlists, context ID sampling only supported via command stream"); + return -EINVAL; + } } stream->sample_size = sizeof(struct drm_i915_perf_record_header); - format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; + if (require_oa_unit) { + int format_size; - stream->sample_flags |= SAMPLE_OA_REPORT; - stream->sample_size += format_size; + /* If the sysfs metrics/ directory wasn't registered for some + * reason then don't let userspace try their luck with config + * IDs + */ + if (!dev_priv->perf.metrics_kobj) { + DRM_DEBUG("OA metrics weren't advertised via sysfs\n"); + return -EINVAL; + } - if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { - stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; - stream->sample_size += 4; - } + if (!dev_priv->perf.oa.ops.init_oa_buffer) { + DRM_DEBUG("OA unit not supported\n"); + return -ENODEV; + } - dev_priv->perf.oa.oa_buffer.format_size = format_size; - if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0)) - return -EINVAL; + if (!props->metrics_set) { + DRM_DEBUG("OA metric set not specified\n"); + return -EINVAL; + } - dev_priv->perf.oa.oa_buffer.format = - dev_priv->perf.oa.oa_formats[props->oa_format].format; + if (!props->oa_format) { + DRM_DEBUG("OA report format not specified\n"); + return -EINVAL; + } - dev_priv->perf.oa.metrics_set = props->metrics_set; + if (props->cs_mode && (props->engine != RCS)) { + DRM_ERROR( + "Command stream OA metrics only available via Render CS\n"); + return -EINVAL; + } + stream->engine = RCS; - dev_priv->perf.oa.periodic = props->oa_periodic; - if (dev_priv->perf.oa.periodic) - dev_priv->perf.oa.period_exponent = 
props->oa_period_exponent; + format_size = + dev_priv->perf.oa.oa_formats[props->oa_format].size; + + if (props->sample_flags & SAMPLE_OA_REPORT) { + stream->sample_flags |= SAMPLE_OA_REPORT; + stream->sample_size += format_size; + } + + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + if (!(props->sample_flags & SAMPLE_OA_REPORT)) { + DRM_ERROR( + "OA source type can't be sampled without OA report"); + return -EINVAL; + } + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } + + dev_priv->perf.oa.oa_buffer.format_size = format_size; + if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0)) + return -EINVAL; + + dev_priv->perf.oa.oa_buffer.format = + dev_priv->perf.oa.oa_formats[props->oa_format].format; + + dev_priv->perf.oa.metrics_set = props->metrics_set; + + dev_priv->perf.oa.periodic = props->oa_periodic; + if (dev_priv->perf.oa.periodic) + dev_priv->perf.oa.period_exponent = props->oa_period_exponent; + + if (stream->ctx) { + ret = oa_get_render_ctx_id(stream); + if (ret) + return ret; + } - if (stream->ctx) { - ret = oa_get_render_ctx_id(stream); + ret = alloc_oa_buffer(dev_priv); if (ret) - return ret; + goto err_oa_buf_alloc; + + /* PRM - observability performance counters: + * + * OACONTROL, performance counter enable, note: + * + * "When this bit is set, in order to have coherent counts, + * RC6 power state and trunk clock gating must be disabled. + * This can be achieved by programming MMIO registers as + * 0xA094=0 and 0xA090[31]=1" + * + * In our case we are expecting that taking pm + FORCEWAKE + * references will effectively disable RC6. 
+ */ + intel_runtime_pm_get(dev_priv); + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); + + ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv); + if (ret) + goto err_enable; + } - ret = alloc_oa_buffer(dev_priv); - if (ret) - goto err_oa_buf_alloc; + if (props->sample_flags & SAMPLE_CTX_ID) { + stream->sample_flags |= SAMPLE_CTX_ID; + stream->sample_size += 4; + } - /* PRM - observability performance counters: - * - * OACONTROL, performance counter enable, note: - * - * "When this bit is set, in order to have coherent counts, - * RC6 power state and trunk clock gating must be disabled. - * This can be achieved by programming MMIO registers as - * 0xA094=0 and 0xA090[31]=1" - * - * In our case we are expecting that taking pm + FORCEWAKE - * references will effectively disable RC6. - */ - intel_runtime_pm_get(dev_priv); - intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); + if (props->cs_mode) { + if (!cs_sample_data) { + DRM_ERROR( + "Stream engine given without requesting any CS data to sample"); + ret = -EINVAL; + goto err_enable; + } - ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv); - if (ret) - goto err_enable; + if (!(props->sample_flags & SAMPLE_CTX_ID)) { + DRM_ERROR( + "Stream engine given without requesting any CS specific property"); + ret = -EINVAL; + goto err_enable; + } + stream->cs_mode = true; + ret = alloc_command_stream_buf(dev_priv); + if (ret) + goto err_enable; - stream->ops = &i915_oa_stream_ops; + init_request_active(&stream->last_request, NULL); + } + stream->ops = &i915_oa_stream_ops; dev_priv->perf.oa.exclusive_stream = stream; return 0; err_enable: - intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); - intel_runtime_pm_put(dev_priv); - free_oa_buffer(dev_priv); + if (require_oa_unit) { + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + intel_runtime_pm_put(dev_priv); + free_oa_buffer(dev_priv); + } err_oa_buf_alloc: if (stream->ctx) @@ -2265,7 +2850,7 @@ static ssize_t i915_perf_read(struct file *file, * 
disabled stream as an error. In particular it might otherwise lead * to a deadlock for blocking file descriptors... */ - if (!stream->enabled) + if (stream->state == I915_PERF_STREAM_DISABLED) return -EIO; if (!(file->f_flags & O_NONBLOCK)) { @@ -2312,13 +2897,21 @@ static ssize_t i915_perf_read(struct file *file, static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) { + struct i915_perf_stream *stream; struct drm_i915_private *dev_priv = container_of(hrtimer, typeof(*dev_priv), perf.oa.poll_check_timer); - if (dev_priv->perf.oa.ops.oa_buffer_check(dev_priv)) { - dev_priv->perf.oa.pollin = true; - wake_up(&dev_priv->perf.oa.poll_wq); + /* No need to protect the streams list here, since the hrtimer is + * disabled before the stream is removed from list, and currently a + * single exclusive_stream is supported. + * XXX: revisit this when multiple concurrent streams are supported. + */ + list_for_each_entry(stream, &dev_priv->perf.streams, link) { + if (stream_have_data_unlocked(stream)) { + dev_priv->perf.oa.pollin = true; + wake_up(&dev_priv->perf.oa.poll_wq); + } } hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD)); @@ -2401,14 +2994,16 @@ static unsigned int i915_perf_poll(struct file *file, poll_table *wait) */ static void i915_perf_enable_locked(struct i915_perf_stream *stream) { - if (stream->enabled) + if (stream->state != I915_PERF_STREAM_DISABLED) return; /* Allow stream->ops->enable() to refer to this */ - stream->enabled = true; + stream->state = I915_PERF_STREAM_ENABLE_IN_PROGRESS; if (stream->ops->enable) stream->ops->enable(stream); + + stream->state = I915_PERF_STREAM_ENABLED; } /** @@ -2427,11 +3022,11 @@ static void i915_perf_enable_locked(struct i915_perf_stream *stream) */ static void i915_perf_disable_locked(struct i915_perf_stream *stream) { - if (!stream->enabled) + if (stream->state != I915_PERF_STREAM_ENABLED) return; /* Allow stream->ops->disable() to refer to this */ - stream->enabled = false; + stream->state = 
I915_PERF_STREAM_DISABLED; if (stream->ops->disable) stream->ops->disable(stream); @@ -2503,14 +3098,18 @@ static long i915_perf_ioctl(struct file *file, */ static void i915_perf_destroy_locked(struct i915_perf_stream *stream) { - if (stream->enabled) + struct drm_i915_private *dev_priv = stream->dev_priv; + + mutex_lock(&dev_priv->perf.streams_lock); + list_del(&stream->link); + mutex_unlock(&dev_priv->perf.streams_lock); + + if (stream->state == I915_PERF_STREAM_ENABLED) i915_perf_disable_locked(stream); if (stream->ops->destroy) stream->ops->destroy(stream); - list_del(&stream->link); - if (stream->ctx) i915_gem_context_put_unlocked(stream->ctx); @@ -2673,7 +3272,9 @@ static int i915_perf_release(struct inode *inode, struct file *file) goto err_alloc; } + mutex_lock(&dev_priv->perf.streams_lock); list_add(&stream->link, &dev_priv->perf.streams); + mutex_unlock(&dev_priv->perf.streams_lock); if (param->flags & I915_PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; @@ -2692,7 +3293,9 @@ static int i915_perf_release(struct inode *inode, struct file *file) return stream_fd; err_open: + mutex_lock(&dev_priv->perf.streams_lock); list_del(&stream->link); + mutex_unlock(&dev_priv->perf.streams_lock); if (stream->ops->destroy) stream->ops->destroy(stream); err_alloc: @@ -2832,6 +3435,30 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE: props->sample_flags |= SAMPLE_OA_SOURCE_INFO; break; + case DRM_I915_PERF_PROP_ENGINE: { + unsigned int user_ring_id = + value & I915_EXEC_RING_MASK; + enum intel_engine_id engine; + + if (user_ring_id > I915_USER_RINGS) + return -EINVAL; + + /* XXX: Currently only RCS is supported. 
+ * Remove this check when support for other + * engines is added + */ + engine = user_ring_map[user_ring_id]; + if (engine != RCS) + return -EINVAL; + + props->cs_mode = true; + props->engine = engine; + } + break; + case DRM_I915_PERF_PROP_SAMPLE_CTX_ID: + props->sample_flags |= SAMPLE_CTX_ID; + break; + default: MISSING_CASE(id); DRM_DEBUG("Unknown i915 perf property ID\n"); @@ -3140,8 +3767,11 @@ void i915_perf_init(struct drm_i915_private *dev_priv) init_waitqueue_head(&dev_priv->perf.oa.poll_wq); INIT_LIST_HEAD(&dev_priv->perf.streams); + INIT_LIST_HEAD(&dev_priv->perf.cs_samples); mutex_init(&dev_priv->perf.lock); + mutex_init(&dev_priv->perf.streams_lock); spin_lock_init(&dev_priv->perf.hook_lock); + spin_lock_init(&dev_priv->perf.sample_lock); spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock); dev_priv->perf.sysctl_header = register_sysctl_table(dev_root); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index c597e36..4cbefc8 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1314,6 +1314,7 @@ enum drm_i915_oa_format { enum drm_i915_perf_oa_event_source { I915_PERF_OA_EVENT_SOURCE_PERIODIC, + I915_PERF_OA_EVENT_SOURCE_RCS, I915_PERF_OA_EVENT_SOURCE_MAX /* non-ABI */ }; @@ -1359,6 +1360,19 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE, + /** + * The value of this property specifies the GPU engine for which + * the samples need to be collected. Specifying this property also + * implies the command stream based sample collection. + */ + DRM_I915_PERF_PROP_ENGINE, + + /** + * The value of this property set to 1 requests inclusion of context ID + * in the perf sample data. 
+ */ + DRM_I915_PERF_PROP_SAMPLE_CTX_ID, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; @@ -1425,6 +1439,7 @@ enum drm_i915_perf_record_type { * struct drm_i915_perf_record_header header; * * { u32 source_info; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE + * { u32 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID * { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA * }; */ -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx