[PATCH 3/3] intel_perf_counters: Add support for Sandybridge.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



While the Sandybridge PRM doesn't have any documentation on the GPU's
performance counters, a lot of information can be gleaned from the older
Ironlake PRM.  Oddly, none of the information documented there actually
appears to apply to Ironlake.  However, it apparently works just great
on Sandybridge.

Since this information has all been publicly available on the internet
for around three years, we can use it.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 tools/intel_perf_counters.c | 146 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)

diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c
index fd268b1..b528361 100644
--- a/tools/intel_perf_counters.c
+++ b/tools/intel_perf_counters.c
@@ -22,9 +22,21 @@
  *
  * Authors:
  *    Eric Anholt <eric at anholt.net>
+ *    Kenneth Graunke <kenneth at whitecape.org>
+ *
+ * While documentation for performance counters is suspiciously missing from the
+ * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM.
+ *
+ * A lot of the Ironlake PRM actually unintentionally documents Sandybridge
+ * due to mistakes made when updating the documentation for Gen6+.  Many of
+ * these mislabeled sections carried forward to the public documentation.
+ *
+ * The Ironlake PRMs have been publicly available since 2010 and are online at:
+ * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family
  */
 
 #include <unistd.h>
+#include <stdbool.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <err.h>
@@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = {
 	"cycles any EU is stalled for math",
 };
 
+#define GEN6_COUNTER_COUNT 29 /* aggregating counters A0-A28 */
+
+/**
+ * Sandybridge report layout, Counter Select = 001 (see buffer_index below):
+ * A0   A1   A2   A3   A4   TIMESTAMP RPT_ID
+ * A5   A6   A7   A8   A9   A10  A11  A12
+ * A13  A14  A15  A16  A17  A18  A19  A20
+ * A21  A22  A23  A24  A25  A26  A27  A28
+ */
+const int gen6_counter_format = 1; /* OACONTROL counter select value */
+
+/**
+ * Names for aggregating counters A0-A28.
+ *
+ * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28),
+ * it only lists the names for 28 of them; one is missing.  However, careful
+ * examination reveals a pattern: there are five GS counters (Active, Stall,
+ * Core Stall, # threads loaded, and ready but not running time).  There are
+ * also five PS counters, in the same order.  But there are only four VS
+ * counters listed - the number of VS threads loaded is missing.  Presumably,
+ * it exists and is counter 5, and the rest are shifted over one place.
+ */
+const char *gen6_counter_names[GEN6_COUNTER_COUNT] = {
+	[0]  = "Aggregated Core Array Active",
+	[1]  = "Aggregated Core Array Stalled",
+	[2]  = "Vertex Shader Active Time",
+	[3]  = "Vertex Shader Stall Time",
+	[4]  = "Vertex Shader Stall Time - Core Stall",
+	[5]  = "# VS threads loaded", /* inferred; the PRM omits this name */
+	[6]  = "Vertex Shader Ready but not running time",
+	[7]  = "Geometry Shader Active Time",
+	[8]  = "Geometry Shader Stall Time",
+	[9]  = "Geometry Shader Stall Time - Core Stall",
+	[10] = "# GS threads loaded",
+	[11] = "Geometry Shader ready but not running Time",
+	[12] = "Pixel Shader Active Time",
+	[13] = "Pixel Shader Stall Time",
+	[14] = "Pixel Shader Stall Time - Core Stall",
+	[15] = "# PS threads loaded",
+	[16] = "Pixel Shader ready but not running Time",
+	[17] = "Early Z Test Pixels Passing",
+	[18] = "Early Z Test Pixels Failing",
+	[19] = "Early Stencil Test Pixels Passing",
+	[20] = "Early Stencil Test Pixels Failing",
+	[21] = "Pixel Kill Count",
+	[22] = "Alpha Test Pixels Failed",
+	[23] = "Post PS Stencil Pixels Failed",
+	[24] = "Post PS Z buffer Pixels Failed",
+	[25] = "Pixels/samples Written in the frame buffer",
+	[26] = "GPU Busy",
+	[27] = "CL active and not stalled",
+	[28] = "SF active and stalled",
+};
+
 int have_totals = 0;
 uint32_t *totals;
 uint32_t *last_counter;
@@ -85,6 +151,20 @@ struct intel_batchbuffer *batch;
 #define MI_COUNTER_ADDRESS_GTT	(1 << 0)
 /* DW2: report ID */
 
+/**
+ * According to the Sandybridge PRM, Volume 1, Part 1, page 48,
+ * MI_REPORT_PERF_COUNT is now opcode 0x28.  The Ironlake PRM, Volume 1,
+ * Part 3 details how it works.
+ */
+/* DW0 */
+#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23)
+/* DW1 and DW2 use the same layout as the definitions above */
+
+/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */
+#define OACONTROL                       0x2360 /* register offset */
+# define OACONTROL_COUNTER_SELECT_SHIFT 2 /* position of the counter format */
+# define PERFORMANCE_COUNTER_ENABLE     (1 << 0)
+
 static void
 gen5_get_counters(void)
 {
@@ -124,6 +204,45 @@ gen5_get_counters(void)
 	drm_intel_bo_unreference(stats_bo);
 }
 
+static void
+gen6_get_counters(void)
+{
+	int i;
+	drm_intel_bo *stats_bo;
+	uint32_t *stats_result;
+
+	/* Maps counter i (see gen6_counter_names) to its dword in stats_bo */
+	static const int buffer_index[GEN6_COUNTER_COUNT] =
+	{
+		7,   6,  5,  4,  3,			/* A0-A4 */
+		15, 14, 13, 12, 11, 10,  9,  8,	/* A5-A12 */
+		23, 22, 21, 20, 19, 18, 17, 16,	/* A13-A20 */
+		31, 30, 29, 28, 27, 26, 25, 24,	/* A21-A28 */
+	};
+
+	stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096);
+	/* Emit MI_REPORT_PERF_COUNT with stats_bo as its target buffer */
+	BEGIN_BATCH(3);
+	OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2)); /* DW0 */
+	OUT_RELOC(stats_bo,
+		  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+		  MI_COUNTER_ADDRESS_GTT);
+	OUT_BATCH(0); /* DW2: report ID */
+	ADVANCE_BATCH();
+
+	intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER);
+	/* Add each counter's change since the last sample to totals[] */
+	drm_intel_bo_map(stats_bo, 0);
+	stats_result = stats_bo->virtual;
+	for (i = 0; i < GEN6_COUNTER_COUNT; i++) {
+		totals[i] += stats_result[buffer_index[i]] - last_counter[i];
+		last_counter[i] = stats_result[buffer_index[i]];
+	}
+
+	drm_intel_bo_unmap(stats_bo);
+	drm_intel_bo_unreference(stats_bo);
+}
+
 #define STATS_CHECK_FREQUENCY	100
 #define STATS_REPORT_FREQUENCY	2
 
@@ -131,6 +250,7 @@ int
 main(int argc, char **argv)
 {
 	uint32_t devid;
+	int counter_format;
 	int counter_count;
 	const char **counter_name;
 	void (*get_counters)(void);
@@ -138,6 +258,7 @@ main(int argc, char **argv)
 	char clear_screen[] = {0x1b, '[', 'H',
 			       0x1b, '[', 'J',
 			       0x0};
+	bool oacontrol = true;
 	int fd;
 	int l;
 
@@ -152,10 +273,27 @@ main(int argc, char **argv)
 		counter_name = gen5_counter_names;
 		counter_count = GEN5_COUNTER_COUNT;
 		get_counters = gen5_get_counters;
+		oacontrol = false;
+	} else if (IS_GEN6(devid)) {
+		counter_name = gen6_counter_names;
+		counter_count = GEN6_COUNTER_COUNT;
+		counter_format = gen6_counter_format;
+		get_counters = gen6_get_counters;
 	} else {
 		printf("This tool is not yet supported on your platform.\n");
 		abort();
 	}
+
+	if (oacontrol) {
+		/* Forcewake */
+		intel_register_access_init(intel_get_pci_device(), 0);
+
+		/* Enable performance counters */
+		intel_register_write(OACONTROL,
+			counter_format << OACONTROL_COUNTER_SELECT_SHIFT |
+			PERFORMANCE_COUNTER_ENABLE);
+	}
+
 	totals = calloc(counter_count, sizeof(uint32_t));
 	last_counter = calloc(counter_count, sizeof(uint32_t));
 
@@ -180,6 +318,14 @@ main(int argc, char **argv)
 		}
 	}
 
+	if (oacontrol) {
+		/* Disable performance counters */
+		intel_register_write(OACONTROL, 0);
+
+		/* Tear down register access (pairs with the init above) */
+		intel_register_access_fini();
+	}
+
 	free(totals);
 	free(last_counter);
 
-- 
1.8.2



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux