On Tue, Mar 26, 2013 at 10:06:39PM -0700, Kenneth Graunke wrote: > While the Sandybridge PRM doesn't have any documentation on the GPU's > performance counters, a lot of information can be gleaned from the older > Ironlake PRM. Oddly, none of the information documented there actually > appears to apply to Ironlake. However, it apparently works just great > on Sandybridge. > > Since this information has all been publicly available on the internet > for around three years, we can use it. > > Signed-off-by: Kenneth Graunke <kenneth at whitecape.org> Merged, thanks for the patches. -Daniel > --- > tools/intel_perf_counters.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 146 insertions(+) > > diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c > index fd268b1..b528361 100644 > --- a/tools/intel_perf_counters.c > +++ b/tools/intel_perf_counters.c > @@ -22,9 +22,21 @@ > * > * Authors: > * Eric Anholt <eric at anholt.net> > + * Kenneth Graunke <kenneth at whitecape.org> > + * > + * While documentation for performance counters is suspiciously missing from the > + * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM. > + * > + * A lot of the Ironlake PRM actually unintentionally documents Sandybridge > + * due to mistakes made when updating the documentation for Gen6+. Many of > + * these mislabeled sections carried forward to the public documentation. > + * > + * The Ironlake PRMs have been publicly available since 2010 and are online at: > + * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family > */ > > #include <unistd.h> > +#include <stdbool.h> > #include <stdlib.h> > #include <stdio.h> > #include <err.h> > @@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = { > "cycles any EU is stalled for math", > }; > > +#define GEN6_COUNTER_COUNT 29 > + > +/** > + * Sandybridge: Counter Select = 001 > + * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID > + * A5 A6 A7 A8 A9 A10 A11 A12 > + * A13 A14 A15 A16 A17 A18 A19 A20 > + * A21 A22 A23 A24 A25 A26 A27 A28 > + */ > +const int gen6_counter_format = 1; > + > +/** > + * Names for aggregating counters A0-A28. > + * > + * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28), > + * it only lists the names for 28 of them; one is missing. However, careful > + * examination reveals a pattern: there are five GS counters (Active, Stall, > + * Core Stall, # threads loaded, and ready but not running time). There are > + * also five PS counters, in the same order. But there are only four VS > + * counters listed - the number of VS threads loaded is missing. Presumably, > + * it exists and is counter 5, and the rest are shifted over one place. > + */ > +const char *gen6_counter_names[GEN6_COUNTER_COUNT] = { > + [0] = "Aggregated Core Array Active", > + [1] = "Aggregated Core Array Stalled", > + [2] = "Vertex Shader Active Time", > + [3] = "Vertex Shader Stall Time", > + [4] = "Vertex Shader Stall Time - Core Stall", > + [5] = "# VS threads loaded", > + [6] = "Vertex Shader Ready but not running time", > + [7] = "Geometry Shader Active Time", > + [8] = "Geometry Shader Stall Time", > + [9] = "Geometry Shader Stall Time - Core Stall", > + [10] = "# GS threads loaded", > + [11] = "Geometry Shader ready but not running Time", > + [12] = "Pixel Shader Active Time", > + [13] = "Pixel Shader Stall Time", > + [14] = "Pixel Shader Stall Time - Core Stall", > + [15] = "# PS threads loaded", > + [16] = "Pixel Shader ready but not running Time", > + [17] = "Early Z Test Pixels Passing", > + [18] = "Early Z Test Pixels Failing", > + [19] = "Early Stencil Test Pixels Passing", > + [20] = "Early Stencil Test Pixels Failing", > + [21] = "Pixel Kill Count", > + [22] = "Alpha Test Pixels Failed", > + [23] = "Post PS Stencil Pixels Failed", > + [24] = "Post PS Z buffer Pixels Failed", > + [25] = "Pixels/samples Written in the frame buffer", > + [26] = "GPU Busy", > + [27] = "CL active and not stalled", > + [28] = "SF active and stalled", > +}; > + > int have_totals = 0; > uint32_t *totals; > uint32_t *last_counter; > @@ -85,6 +151,20 @@ struct intel_batchbuffer *batch; > #define MI_COUNTER_ADDRESS_GTT (1 << 0) > /* DW2: report ID */ > > +/** > + * According to the Sandybridge PRM, Volume 1, Part 1, page 48, > + * MI_REPORT_PERF_COUNT is now opcode 0x28. The Ironlake PRM, Volume 1, > + * Part 3 details how it works. > + */ > +/* DW0 */ > +#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23) > +/* DW1 and 2 are the same as above */ > + > +/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */ > +#define OACONTROL 0x2360 > +# define OACONTROL_COUNTER_SELECT_SHIFT 2 > +# define PERFORMANCE_COUNTER_ENABLE (1 << 0) > + > static void > gen5_get_counters(void) > { > @@ -124,6 +204,45 @@ gen5_get_counters(void) > drm_intel_bo_unreference(stats_bo); > } > > +static void > +gen6_get_counters(void) > +{ > + int i; > + drm_intel_bo *stats_bo; > + uint32_t *stats_result; > + > + /* Map from counter names to their index in the buffer object */ > + static const int buffer_index[GEN6_COUNTER_COUNT] = > + { > + 7, 6, 5, 4, 3, > + 15, 14, 13, 12, 11, 10, 9, 8, > + 23, 22, 21, 20, 19, 18, 17, 16, > + 31, 30, 29, 28, 27, 26, 25, 24, > + }; > + > + stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096); > + > + BEGIN_BATCH(3); > + OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2)); > + OUT_RELOC(stats_bo, > + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, > + MI_COUNTER_ADDRESS_GTT); > + OUT_BATCH(0); > + ADVANCE_BATCH(); > + > + intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER); > + > + drm_intel_bo_map(stats_bo, 0); > + stats_result = stats_bo->virtual; > + for (i = 0; i < GEN6_COUNTER_COUNT; i++) { > + totals[i] += stats_result[buffer_index[i]] - last_counter[i]; > + last_counter[i] = stats_result[buffer_index[i]]; > + } > + > + drm_intel_bo_unmap(stats_bo); > + drm_intel_bo_unreference(stats_bo); > +} > + > #define STATS_CHECK_FREQUENCY 100 > #define STATS_REPORT_FREQUENCY 2 > > @@ -131,6 +250,7 @@ int > main(int argc, char **argv) > { > uint32_t devid; > + int counter_format; > int counter_count; > const char **counter_name; > void (*get_counters)(void); > @@ -138,6 +258,7 @@ main(int argc, char **argv) > char clear_screen[] = {0x1b, '[', 'H', > 0x1b, '[', 'J', > 0x0}; > + bool oacontrol = true; > int fd; > int l; > > @@ -152,10 +273,27 @@ main(int argc, char **argv) > counter_name = gen5_counter_names; > counter_count = GEN5_COUNTER_COUNT; > get_counters = gen5_get_counters; > + oacontrol = false; > + } else if (IS_GEN6(devid)) { > + counter_name = gen6_counter_names; > + counter_count = GEN6_COUNTER_COUNT; > + counter_format = gen6_counter_format; > + get_counters = gen6_get_counters; > } else { > printf("This tool is not yet supported on your platform.\n"); > abort(); > } > + > + if (oacontrol) { > + /* Forcewake */ > + intel_register_access_init(intel_get_pci_device(), 0); > + > + /* Enable performance counters */ > + intel_register_write(OACONTROL, > + counter_format << OACONTROL_COUNTER_SELECT_SHIFT | > + PERFORMANCE_COUNTER_ENABLE); > + } > + > totals = calloc(counter_count, sizeof(uint32_t)); > last_counter = calloc(counter_count, sizeof(uint32_t)); > > @@ -180,6 +318,14 @@ main(int argc, char **argv) > } > } > > + if (oacontrol) { > + /* Disable performance counters */ > + intel_register_write(OACONTROL, 0); > + > + /* Forcewake */ > + intel_register_access_fini(); > + } > + > free(totals); > free(last_counter); > > -- > 1.8.2 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Daniel Vetter Software Engineer, Intel Corporation +41 (0) 79 365 57 48 - http://blog.ffwll.ch