[PATCH 11/14] drm/i915: Add support for collecting timestamps on all gpu engines

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Sourab Gupta <sourab.gupta@xxxxxxxxx>

With this patch, for RCS, timestamps and OA reports can be collected
together, and provided to userspace in separate sample fields. For other
engines, the capability to collect timestamps is added.

Note that still only a single stream instance can be opened at any
particular time, though that stream may now be opened for any gpu engine
for collection of timestamp samples.

So, this patch doesn't add the support to open multiple concurrent streams,
as yet.

v2: Patching the offsets for TS capture similar to OA.

Testcase: igt/intel_perf_dapc/perf-ts
Signed-off-by: Sourab Gupta <sourab.gupta@xxxxxxxxx>
Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.h         |  18 ++-
 drivers/gpu/drm/i915/i915_gem_request.h |   2 +
 drivers/gpu/drm/i915/i915_perf.c        | 223 +++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h         |   2 +
 include/uapi/drm/i915_drm.h             |   7 +
 5 files changed, 230 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8bd8c0a..2d5f20a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2260,12 +2260,24 @@ struct i915_perf_cs_sample {
 	struct drm_i915_gem_request *request;
 
 	/**
-	 * @oa_offset: Offset into ``&stream->cs_buffer``
-	 * where the perf metrics will be collected, when the commands inserted
-	 * into the command stream are executed by GPU.
+	 * @oa_offset: Offset into ``&stream->cs_buffer``
+	 * where the OA report will be collected (if the stream is configured
+	 * for collection of OA samples).
 	 */
 	u32 oa_offset;
 
+	/**
+	 * @ts_offset: Offset into ``&stream->cs_buffer``
+	 * where the timestamps will be collected (if the stream is configured
+	 * for collection of timestamp data).
+	 */
+	u32 ts_offset;
+
+	/**
+	 * @size: buffer size corresponding to this perf sample
+	 */
+	u32 size;
+
 	/* Is this sample prior to request start or post request end */
 	enum request_sample_id id;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index a2535c6..691a0eb 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -200,6 +200,8 @@ struct drm_i915_gem_request {
 	u32 *post_oa_offset;
 	u64 pid;
 	u32 tag;
+	u32 *pre_ts_offset;
+	u32 *post_ts_offset;
 };
 
 extern const struct dma_fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index aad8b23..8243246 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -292,12 +292,17 @@
 #define OAREPORT_REASON_CTX_SWITCH     (1<<3)
 #define OAREPORT_REASON_CLK_RATIO      (1<<5)
 
-/* Data common to periodic and RCS based OA samples */
+#define OA_ADDR_ALIGN 64
+#define TS_ADDR_ALIGN 8
+#define I915_PERF_TS_SAMPLE_SIZE 8
+
+/* Data common to perf samples (periodic OA / CS based OA / Timestamps) */
 struct i915_perf_sample_data {
 	u64 source;
 	u64 ctx_id;
 	u64 pid;
 	u64 tag;
+	u64 ts; /* copied to userspace when SAMPLE_TS is set */
 	const u8 *report;
 };
 
@@ -355,6 +360,7 @@ struct i915_perf_sample_data {
 #define SAMPLE_CTX_ID	      (1<<2)
 #define SAMPLE_PID	      (1<<3)
 #define SAMPLE_TAG	      (1<<4)
+#define SAMPLE_TS	      (1<<5)
 
 /**
  * struct perf_open_properties - for validated properties given to open a stream
@@ -498,6 +504,86 @@ static int i915_emit_oa_report_capture(struct drm_i915_gem_request *request,
 }
 
 /**
+ * i915_emit_ts_capture - Insert the commands to capture timestamp
+ * data into the GPU command stream. Returns 0 or intel_ring_begin()'s error.
+ * @request: request in whose context the timestamps are being collected.
+ * @preallocate: true adds len to request->reserved_space, false subtracts it.
+ */
+static int i915_emit_ts_capture(struct drm_i915_gem_request *request,
+				bool preallocate)
+{
+	struct drm_i915_private *dev_priv = request->i915;
+	u32 cmd, len = 6, *cs; /* len matches the 6 dwords emitted below */
+
+	if (preallocate)
+		request->reserved_space += len;
+	else
+		request->reserved_space -= len;
+
+	cs = intel_ring_begin(request, 6); /* NOTE(review): could pass len */
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	if (request->engine->id == RCS) {
+		if (INTEL_GEN(dev_priv) >= 8)
+			cmd = GFX_OP_PIPE_CONTROL(6);
+		else
+			cmd = GFX_OP_PIPE_CONTROL(5);
+
+		*cs++ = cmd;
+		*cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
+				PIPE_CONTROL_TIMESTAMP_WRITE;
+		/*
+		 * Save the address in the ringbuffer where the timestamp write
+		 * address is patched in by i915_perf_stream_patch_sample_ts().
+		 */
+		if (preallocate)
+			request->pre_ts_offset = cs++;
+		else
+			request->post_ts_offset = cs++;
+
+		*cs++ = 0;
+		*cs++ = 0;
+
+		if (INTEL_GEN(dev_priv) >= 8)
+			*cs++ = 0;
+		else
+			*cs++ = MI_NOOP;
+	} else {
+		uint32_t cmd; /* NOTE(review): shadows the outer cmd */
+
+		cmd = MI_FLUSH_DW + 1;
+		if (INTEL_GEN(dev_priv) >= 8)
+			cmd += 1;
+
+		cmd |= MI_FLUSH_DW_OP_STAMP;
+
+		*cs++ = cmd;
+		/*
+		 * Save the address in the ringbuffer where the timestamp write
+		 * address is patched in by i915_perf_stream_patch_sample_ts().
+		 */
+		if (preallocate)
+			request->pre_ts_offset = cs++;
+		else
+			request->post_ts_offset = cs++;
+
+		*cs++ = 0;
+		*cs++ = 0;
+
+		if (INTEL_GEN(dev_priv) >= 8)
+			*cs++ = 0;
+		else
+			*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+	}
+
+	intel_ring_advance(request, cs);
+
+	return 0;
+}
+
+/**
  * i915_perf_stream_emit_sample_capture - Insert the commands to capture perf
  * metrics into the GPU command stream
  * @stream: Stream to which this request corresponds.
@@ -519,6 +605,15 @@ static void i915_perf_stream_emit_sample_capture(
 		ret = i915_emit_oa_report_capture(request, preallocate);
 		if (ret)
 			DRM_ERROR("Emit of OA capture commands failed\n");
+	} else if (stream->sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: TS data can be derived from the OA report anyway, so
+		 * there is no need to capture it separately for the RCS engine
+		 * when OA report capture has already been emitted.
+		 */
+		ret = i915_emit_ts_capture(request, preallocate);
+		if (ret)
+			DRM_ERROR("Emit of TS capture commands failed\n");
 	}
 
 	if (stream->sample_flags & SAMPLE_PID)
@@ -615,6 +710,34 @@ static void i915_perf_stream_patch_sample_oa(struct i915_perf_stream *stream,
 	}
 }
 
+static void i915_perf_stream_patch_sample_ts(struct i915_perf_stream *stream,
+					struct drm_i915_gem_request *request,
+					struct i915_perf_cs_sample *sample)
+{
+	u32 ts_addr = stream->cs_buffer.vma->node.start + sample->ts_offset;
+
+	switch (sample->id) { /* choose the ring dword saved at emit time */
+	case PRE_REQUEST_SAMPLE_ID: /* slot kept in request->pre_ts_offset */
+		if (request->engine->id == RCS) /* slot sits in a PIPE_CONTROL */
+			*request->pre_ts_offset = ts_addr |
+						  PIPE_CONTROL_GLOBAL_GTT;
+		else /* other engines emitted MI_FLUSH_DW */
+			*request->pre_ts_offset = ts_addr |
+						  MI_FLUSH_DW_USE_GTT;
+		break;
+	case POST_REQUEST_SAMPLE_ID: /* slot kept in request->post_ts_offset */
+		if (request->engine->id == RCS)
+			*request->post_ts_offset = ts_addr |
+						   PIPE_CONTROL_GLOBAL_GTT;
+		else
+			*request->post_ts_offset = ts_addr |
+						   MI_FLUSH_DW_USE_GTT;
+		break;
+	default: /* unknown sample id: nothing is patched, only logged */
+		DRM_ERROR("Invalid sample being patched\n");
+	}
+}
+
 /**
  * i915_perf_stream_patch_request - Assign free sample. If none available,
  * remove one. Patch offset of the perf sample address with the one from
@@ -650,6 +773,10 @@ static void i915_perf_stream_patch_request(struct i915_perf_stream *stream,
 		    (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE))
 			i915_perf_stream_patch_sample_oa(stream, request,
 							 sample);
+		else if (stream->sample_flags & SAMPLE_TS)
+			i915_perf_stream_patch_sample_ts(stream, request,
+							 sample);
+
 		spin_unlock_irqrestore(&stream->samples_lock, flags);
 		sample_id++;
 	}
@@ -976,6 +1103,12 @@ static int append_perf_sample(struct i915_perf_stream *stream,
 		buf += 8;
 	}
 
+	if (sample_flags & SAMPLE_TS) {
+		if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE))
+			return -EFAULT;
+		buf += I915_PERF_TS_SAMPLE_SIZE;
+	}
+
 	if (sample_flags & SAMPLE_OA_REPORT) {
 		if (copy_to_user(buf, data->report, report_size))
 			return -EFAULT;
@@ -1019,6 +1152,12 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream,
 	if (sample_flags & SAMPLE_TAG)
 		data.tag = stream->last_tag;
 
+	/* TODO: Derive timestamp from OA report,
+	 * after scaling with the ts base
+	 */
+	if (sample_flags & SAMPLE_TS)
+		data.ts = 0;
+
 	if (sample_flags & SAMPLE_OA_REPORT)
 		data.report = report;
 
@@ -1643,6 +1782,19 @@ static int append_cs_buffer_sample(struct i915_perf_stream *stream,
 			stream->last_tag = INVALID_TAG;
 	}
 
+	if (sample_flags & SAMPLE_TS) {
+		/* For RCS, if OA samples are also being collected, derive the
+		 * timestamp from OA report, after scaling with the TS base.
+		 * Else, forward the timestamp collected via command stream.
+		 */
+		/* TODO: derive the timestamp from OA report */
+		if (sample_flags & SAMPLE_OA_REPORT)
+			data.ts = 0;
+		else
+			data.ts = *(u64 *) (stream->cs_buffer.vaddr +
+					   node->ts_offset);
+	}
+
 	return append_perf_sample(stream, buf, count, offset, &data);
 }
 
@@ -2257,11 +2409,21 @@ static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
 
 static int init_perf_samples(struct i915_perf_stream *stream)
 {
+	struct drm_i915_private *dev_priv = stream->dev_priv;
 	struct i915_perf_cs_sample *sample;
 	u32 sample_size = 0;
 	u32 offset = 0;
 
-	sample_size = stream->dev_priv->perf.oa.oa_buffer.format_size;
+	if (stream->sample_flags & SAMPLE_OA_REPORT)
+		sample_size += dev_priv->perf.oa.oa_buffer.format_size;
+	else if (stream->sample_flags & SAMPLE_TS) {
+		/*
+		 * XXX: TS data can be derived from the OA report anyway, so
+		 * there is no need to capture it separately for the RCS engine
+		 * when OA reports are already being captured.
+		 */
+		sample_size += I915_PERF_TS_SAMPLE_SIZE;
+	}
 
 	while ((offset + sample_size) < stream->cs_buffer.vma->size) {
 		sample = kzalloc(sizeof(*sample), GFP_KERNEL);
@@ -2269,9 +2431,22 @@ static int init_perf_samples(struct i915_perf_stream *stream)
 			DRM_ERROR("Perf sample alloc failed\n");
 			return -ENOMEM;
 		}
-		sample->oa_offset = offset;
+		if (stream->sample_flags & SAMPLE_OA_REPORT) {
+			sample->oa_offset = offset;
+			/* Ensure 64 byte alignment of oa_offset */
+			sample->oa_offset = ALIGN(sample->oa_offset,
+						  OA_ADDR_ALIGN);
+			offset = sample->oa_offset +
+				 dev_priv->perf.oa.oa_buffer.format_size;
+		} else if (stream->sample_flags & SAMPLE_TS) {
+			sample->ts_offset = offset;
+			/* Ensure 8 byte alignment of ts_offset */
+			sample->ts_offset = ALIGN(sample->ts_offset,
+						  TS_ADDR_ALIGN);
+			offset = sample->ts_offset + I915_PERF_TS_SAMPLE_SIZE;
+		}
+
 		list_add_tail(&sample->link, &stream->free_samples);
-		offset += sample_size;
 	}
 
 	return 0;
@@ -2862,7 +3037,8 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream,
 	int format_size, idx;
 	bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT |
 						      SAMPLE_OA_SOURCE);
-	bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT;
+	bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT |
+						     SAMPLE_TS);
 	bool require_cs_mode = props->sample_flags & (SAMPLE_PID |
 						      SAMPLE_TAG);
 	struct i915_perf_stream *curr_stream;
@@ -3026,8 +3202,22 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream,
 			require_cs_mode = true;
 	}
 
+	if (props->sample_flags & SAMPLE_TS) {
+		stream->sample_flags |= SAMPLE_TS;
+		stream->sample_size += I915_PERF_TS_SAMPLE_SIZE;
+
+		/*
+		 * NB: it's meaningful to request SAMPLE_TS with just CS
+		 * mode or periodic OA mode sampling but we don't allow
+		 * SAMPLE_TS without either mode
+		 */
+		if (!require_oa_unit)
+			require_cs_mode = true;
+	}
+
 	if (require_cs_mode && !props->cs_mode) {
-		DRM_ERROR("PID/TAG sampling requires a ring to be specified");
+		DRM_ERROR("PID/TAG/TS sampling requires engine "
+			  "to be specified");
 		ret = -EINVAL;
 		goto err_enable;
 	}
@@ -3043,11 +3233,12 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream,
 
 		/*
 		 * The only time we should allow enabling CS mode if it's not
-		 * strictly required, is if SAMPLE_CTX_ID has been requested
-		 * as it's usable with periodic OA or CS sampling.
+		 * strictly required, is if SAMPLE_CTX_ID/SAMPLE_TS has been
+		 * requested as they're usable with periodic OA or CS sampling.
 		 */
 		if (!require_cs_mode &&
-		    !(props->sample_flags & SAMPLE_CTX_ID)) {
+		    !(props->sample_flags & (SAMPLE_CTX_ID | SAMPLE_TS))) {
+
 			DRM_ERROR("Stream engine given without requesting any "
 				  "CS specific property\n");
 			ret = -EINVAL;
@@ -3770,21 +3961,12 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_ENGINE: {
 				unsigned int user_ring_id =
 					value & I915_EXEC_RING_MASK;
-				enum intel_engine_id engine;
 
 				if (user_ring_id > I915_USER_RINGS)
 					return -EINVAL;
 
-				/* XXX: Currently only RCS is supported.
-				 * Remove this check when support for other
-				 * engines is added
-				 */
-				engine = user_ring_map[user_ring_id];
-				if (engine != RCS)
-					return -EINVAL;
-
 				props->cs_mode = true;
-				props->engine = engine;
+				props->engine = user_ring_map[user_ring_id];
 			}
 			break;
 		case DRM_I915_PERF_PROP_SAMPLE_CTX_ID:
@@ -3796,6 +3978,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv,
 		case DRM_I915_PERF_PROP_SAMPLE_TAG:
 			props->sample_flags |= SAMPLE_TAG;
 			break;
+		case DRM_I915_PERF_PROP_SAMPLE_TS:
+			props->sample_flags |= SAMPLE_TS;
+			break;
 		case DRM_I915_PERF_PROP_MAX:
 			MISSING_CASE(id);
 			return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index c718c2f..a24d391 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -547,6 +547,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
 #define   MI_FLUSH_DW_OP_STOREDW	(1<<14)
+#define   MI_FLUSH_DW_OP_STAMP		(3<<14)
 #define   MI_FLUSH_DW_OP_MASK		(3<<14)
 #define   MI_FLUSH_DW_NOTIFY		(1<<8)
 #define   MI_INVALIDATE_BSD		(1<<7)
@@ -630,6 +631,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define   PIPE_CONTROL_TLB_INVALIDATE			(1<<18)
 #define   PIPE_CONTROL_MEDIA_STATE_CLEAR		(1<<16)
 #define   PIPE_CONTROL_QW_WRITE				(1<<14)
+#define   PIPE_CONTROL_TIMESTAMP_WRITE			(3<<14)
 #define   PIPE_CONTROL_POST_SYNC_OP_MASK                (3<<14)
 #define   PIPE_CONTROL_DEPTH_STALL			(1<<13)
 #define   PIPE_CONTROL_WRITE_FLUSH			(1<<12)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 71c102e..257418b 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1459,6 +1459,12 @@ enum drm_i915_perf_property_id {
 	 */
 	DRM_I915_PERF_PROP_SAMPLE_TAG,
 
+	/**
+	 * The value of this property set to 1 requests inclusion of timestamp
+	 * in the perf sample data.
+	 */
+	DRM_I915_PERF_PROP_SAMPLE_TS,
+
 	DRM_I915_PERF_PROP_MAX /* non-ABI */
 };
 
@@ -1528,6 +1534,7 @@ enum drm_i915_perf_record_type {
 	 *     { u64 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID
 	 *     { u64 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID
 	 *     { u64 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG
+	 *     { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS
 	 *     { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA
 	 * };
 	 */
-- 
1.9.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux