+ }
+
+ return total_len;
+}
+
+
/* CAP_SYS_ADMIN is required to open system wide metrics, unless the system
* control parameter dev.i915.perf_stream_paranoid == 0 */
static void
@@ -1362,6 +1530,70 @@ print_reports(uint32_t *oa_report0, uint32_t *oa_report1, int fmt)
}
}
+/* Debug function, only useful when reports don't make sense. */
+#if 0
+static void
+print_report(uint32_t *report, int fmt)
+{
+ igt_debug("TIMESTAMP: %"PRIu32"\n", report[1]);
+
+ if (IS_HASWELL(devid) && oa_formats[fmt].n_c == 0) {
+ igt_debug("CLOCK = N/A\n");
+ } else {
+ uint32_t clock = read_report_ticks(report, fmt);
+
+ igt_debug("CLOCK: %"PRIu32"\n", clock);
+ }
+
+ if (intel_gen(devid) >= 8) {
+ uint32_t slice_freq, unslice_freq;
+ const char *reason = gen8_read_report_reason(report);
+
+ gen8_read_report_clock_ratios(report, &slice_freq, &unslice_freq);
+
+ igt_debug("SLICE CLK: %umhz\n", slice_freq);
+ igt_debug("UNSLICE CLK: %umhz\n", unslice_freq);
+ igt_debug("REASON: \"%s\"\n", reason);
+ igt_debug("CTX ID: %"PRIu32"/%"PRIx32"\n", report[2], report[2]);
+ }
+
+ /* Gen8+ has some 40bit A counters... */
+ for (int j = 0; j < oa_formats[fmt].n_a40; j++) {
+ uint64_t value = gen8_read_40bit_a_counter(report, fmt, j);
+
+ if (undefined_a_counters[j])
+ continue;
+
+ igt_debug("A%d: %"PRIu64"\n", j, value);
+ }
+
+ for (int j = 0; j < oa_formats[fmt].n_a; j++) {
+ uint32_t *a = (uint32_t *)(((uint8_t *)report) +
+ oa_formats[fmt].a_off);
+ int a_id = oa_formats[fmt].first_a + j;
+
+ if (undefined_a_counters[a_id])
+ continue;
+
+ igt_debug("A%d: %"PRIu32"\n", a_id, a[j]);
+ }
+
+ for (int j = 0; j < oa_formats[fmt].n_b; j++) {
+ uint32_t *b = (uint32_t *)(((uint8_t *)report) +
+ oa_formats[fmt].b_off);
+
+ igt_debug("B%d: %"PRIu32"\n", j, b[j]);
+ }
+
+ for (int j = 0; j < oa_formats[fmt].n_c; j++) {
+ uint32_t *c = (uint32_t *)(((uint8_t *)report) +
+ oa_formats[fmt].c_off);
+
+ igt_debug("C%d: %"PRIu32"\n", j, c[j]);
+ }
+}
+#endif
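
[Note: print_report() leans on gen8_read_40bit_a_counter(), which is not part of this hunk. For reference, a plausible sketch of how such a helper assembles gen8's split layout; the a40_low_off/a40_high_off fields are assumptions about the oa_formats[] table, not shown here:

    static uint64_t
    gen8_read_40bit_a_counter(uint32_t *report, int fmt, int a_id)
    {
            /* Gen8 packs each 40bit A counter as a 32bit low dword in one
             * array plus a single high byte per counter in another. */
            uint32_t *a40_low = (uint32_t *)(((uint8_t *)report) +
                                             oa_formats[fmt].a40_low_off);
            uint8_t *a40_high = ((uint8_t *)report) +
                                oa_formats[fmt].a40_high_off;

            return ((uint64_t)a40_high[a_id] << 32) | a40_low[a_id];
    }
]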
+
static void
test_oa_formats(void)
{
@@ -2486,14 +2718,8 @@ test_mi_rpc(void)
}
static void
-scratch_buf_init(drm_intel_bufmgr *bufmgr,
- struct igt_buf *buf,
- int width, int height,
- uint32_t color)
+scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
{
- size_t stride = width * 4;
- size_t size = stride * height;
- drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
int ret;
ret = drm_intel_bo_map(bo, true /* writable */);
@@ -2503,6 +2729,19 @@ scratch_buf_init(drm_intel_bufmgr *bufmgr,
((uint32_t *)bo->virtual)[i] = color;
drm_intel_bo_unmap(bo);
+}
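
[Note: the hunk boundary above elides the middle of the new helper (the map-return assert and the loop header sit in unchanged context). Assembled from the fragments, scratch_buf_memset() plausibly reads:

    static void
    scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
    {
            int ret;

            /* Map writable, fill every 32bit pixel, then unmap. */
            ret = drm_intel_bo_map(bo, true /* writable */);
            igt_assert_eq(ret, 0);

            for (int i = 0; i < width * height; i++)
                    ((uint32_t *)bo->virtual)[i] = color;

            drm_intel_bo_unmap(bo);
    }
]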
+
+static void
+scratch_buf_init(drm_intel_bufmgr *bufmgr,
+ struct igt_buf *buf,
+ int width, int height,
+ uint32_t color)
+{
+ size_t stride = width * 4;
+ size_t size = stride * height;
+ drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
+
+ scratch_buf_memset(bo, width, height, color);
buf->bo = bo;
buf->stride = stride;
@@ -2521,14 +2760,25 @@ emit_stall_timestamp_and_rpc(struct intel_batchbuffer *batch,
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_WRITE_TIMESTAMP);
- BEGIN_BATCH(5, 1);
- OUT_BATCH(GFX_OP_PIPE_CONTROL | (5 - 2));
- OUT_BATCH(pipe_ctl_flags);
- OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
- timestamp_offset);
- OUT_BATCH(0); /* imm lower */
- OUT_BATCH(0); /* imm upper */
- ADVANCE_BATCH();
+ if (intel_gen(devid) >= 8) {
+ BEGIN_BATCH(5, 1);
+ OUT_BATCH(GFX_OP_PIPE_CONTROL | (6 - 2));
+ OUT_BATCH(pipe_ctl_flags);
+ OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ timestamp_offset);
+ OUT_BATCH(0); /* imm lower */
+ OUT_BATCH(0); /* imm upper */
+ ADVANCE_BATCH();
+ } else {
+ BEGIN_BATCH(5, 1);
+ OUT_BATCH(GFX_OP_PIPE_CONTROL | (5 - 2));
+ OUT_BATCH(pipe_ctl_flags);
+ OUT_RELOC(dst, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ timestamp_offset);
+ OUT_BATCH(0); /* imm lower */
+ OUT_BATCH(0); /* imm upper */
+ ADVANCE_BATCH();
+ }
emit_report_perf_count(batch, dst, report_dst_offset, report_id);
}
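
[Note: the gen8+ branch is not off by one. The header claims a 6-dword PIPE_CONTROL via (6 - 2) while the batch still opens with BEGIN_BATCH(5, 1) because, on gen8+, relocations carry 64-bit addresses: BEGIN_BATCH(n, r) reserves n + r dwords and OUT_RELOC emits two (an assumption about IGT's intel_batchbuffer macros, consistent with how this hunk balances). The resulting layout, as an illustrative comment:

    /* gen8+ PIPE_CONTROL, 6 dwords total:
     *   dw0: GFX_OP_PIPE_CONTROL | (6 - 2)
     *   dw1: pipe_ctl_flags (CS stall | RT flush | write timestamp)
     *   dw2: address low   \ one OUT_RELOC, emitted as
     *   dw3: address high  / two dwords on gen8+
     *   dw4: imm lower
     *   dw5: imm upper
     */
]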
@@ -2574,7 +2824,7 @@ hsw_test_single_ctx_counters(void)
drm_intel_bufmgr *bufmgr;
drm_intel_context *context0, *context1;
struct intel_batchbuffer *batch;
- struct igt_buf src, dst;
+ struct igt_buf src[3], dst[3];
drm_intel_bo *bo;
uint32_t *report0_32, *report1_32;
uint64_t timestamp0_64, timestamp1_64;
@@ -2592,8 +2842,10 @@ hsw_test_single_ctx_counters(void)
bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
drm_intel_bufmgr_gem_enable_reuse(bufmgr);
- scratch_buf_init(bufmgr, &src, width, height, 0xff0000ff);
- scratch_buf_init(bufmgr, &dst, width, height, 0x00ff00ff);
+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
+ scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
+ scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
+ }
batch = intel_batchbuffer_alloc(bufmgr, devid);
@@ -2627,14 +2879,19 @@ hsw_test_single_ctx_counters(void)
*/
render_copy(batch,
context0,
- &src, 0, 0, width, height,
- &dst, 0, 0);
+ &src[0], 0, 0, width, height,
+ &dst[0], 0, 0);
ret = drm_intel_gem_context_get_id(context0, &ctx_id);
igt_assert_eq(ret, 0);
igt_assert_neq(ctx_id, 0xffffffff);
properties[1] = ctx_id;
+ intel_batchbuffer_flush_with_context(batch, context0);
+
+ scratch_buf_memset(src[0].bo, width, height, 0xff0000ff);
+ scratch_buf_memset(dst[0].bo, width, height, 0x00ff00ff);
+
igt_debug("opening i915-perf stream\n");
+ stream_fd = __perf_open(drm_fd, &param);
@@ -2661,8 +2918,8 @@ hsw_test_single_ctx_counters(void)
render_copy(batch,
context0,
- &src, 0, 0, width, height,
- &dst, 0, 0);
+ &src[0], 0, 0, width, height,
+ &dst[0], 0, 0);
/* Another redundant flush to clarify batch bo is free to reuse */
intel_batchbuffer_flush_with_context(batch, context0);
@@ -2673,13 +2930,13 @@ hsw_test_single_ctx_counters(void)
*/
render_copy(batch,
context1,
- &src, 0, 0, width, height,
- &dst, 0, 0);
+ &src[1], 0, 0, width, height,
+ &dst[1], 0, 0);
render_copy(batch,
context1,
- &src, 0, 0, width, height,
- &dst, 0, 0);
+ &src[2], 0, 0, width, height,
+ &dst[2], 0, 0);
/* And another */
intel_batchbuffer_flush_with_context(batch, context1);
@@ -2708,6 +2965,7 @@ hsw_test_single_ctx_counters(void)
/* A40 == N samples written to all render targets */
n_samples_written = report1_32[43] - report0_32[43];
+
igt_debug("n samples written = %d\n", n_samples_written);
igt_assert_eq(n_samples_written, width * height);
@@ -2742,8 +3000,10 @@ hsw_test_single_ctx_counters(void)
(delta_oa32_ns - delta_ts64_ns);
igt_assert(delta_delta <= 320);
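
[Note: the 320ns bound presumably allows a few ticks of slack between the two timestamp paths, e.g. four ticks of Haswell's 80ns command-streamer timestamp granularity, though the commit doesn't spell out the derivation.]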
- drm_intel_bo_unreference(src.bo);
- drm_intel_bo_unreference(dst.bo);
+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
+ drm_intel_bo_unreference(src[i].bo);
+ drm_intel_bo_unreference(dst[i].bo);
+ }
drm_intel_bo_unmap(bo);
drm_intel_bo_unreference(bo);
@@ -2757,6 +3017,452 @@ hsw_test_single_ctx_counters(void)
igt_waitchildren();
}
+/* Tests the INTEL_performance_query use case where an unprivileged process
+ * should be able to configure the OA unit for per-context metrics (for a
+ * context associated with that process' drm file descriptor) and the counters
+ * should only relate to that specific context.
+ *
+ * For Gen8+, although reports read via i915 perf can be filtered for a
+ * single context, the counters themselves always progress as global/system-wide
+ * counters affected by all contexts. To support the INTEL_performance_query
+ * use case on Gen8+ it's necessary to combine OABUFFER and
+ * MI_REPORT_PERF_COUNT reports so that counter normalisation can take into
+ * account context-switch reports and factor out any counter progression not
+ * associated with the current context.
+ */
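
[Note: the normalisation described above reduces to walking OA reports in order and only crediting counter deltas to the context under test. A minimal sketch of the idea; accumulate_reports() and the use of report dword 2 as the HW context ID mirror this test's conventions, but the helper below is illustrative, not the test's actual code:

    /* Credit the delta between two successive OA reports to our context
     * only if the interval started while our context was running. */
    static void
    accumulate_ctx_delta(struct accumulator *acc, uint32_t ctx_id,
                         uint32_t *prev, uint32_t *cur)
    {
            if (prev[2] == ctx_id) /* dword 2: HW context ID on gen8+ */
                    accumulate_reports(acc, prev, cur);
    }
]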
+static void
+gen8_test_single_ctx_render_target_writes_a_counter(void)
+{
+ int oa_exponent = max_oa_exponent_for_period_lte(1000000);
+ uint64_t properties[] = {
+ DRM_I915_PERF_PROP_CTX_HANDLE, UINT64_MAX, /* updated below */
+
+ /* Note: we have to specify at least one sample property even
+ * though we aren't interested in samples in this case
+ */
+ DRM_I915_PERF_PROP_SAMPLE_OA, true,
+
+ /* OA unit configuration */
+ DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
+ DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
+ DRM_I915_PERF_PROP_OA_EXPONENT, oa_exponent,
+ };
+ struct drm_i915_perf_open_param param = {
+ .flags = I915_PERF_FLAG_FD_CLOEXEC,
+ .num_properties = ARRAY_SIZE(properties) / 2,
+ .properties_ptr = to_user_pointer(properties),
+ };
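
[Note: max_oa_exponent_for_period_lte() above picks the largest OA exponent whose sampling period still fits under 1ms; the OA unit's period doubles with each exponent step. A sketch assuming a timestamp_frequency global in Hz (the real helper sits outside this hunk):

    /* period_ns(exp) = (2 << exp) * 1e9 / timestamp_frequency */
    static int
    max_oa_exponent_for_period_lte(uint64_t period_ns)
    {
            for (int exp = 0; exp < 30; exp++) {
                    uint64_t oa_period_ns = 1000000000ULL * (2ULL << exp) /
                                            timestamp_frequency;

                    if (oa_period_ns > period_ns)
                            return exp > 0 ? exp - 1 : 0;
            }
            igt_assert(!"unreachable");
            return -1;
    }
]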
+ size_t format_size = oa_formats[test_oa_format].size;
+ size_t sample_size = (sizeof(struct drm_i915_perf_record_header) +
+ format_size);
+ int max_reports = (16 * 1024 * 1024) / format_size;
+ int buf_size = sample_size * max_reports * 1.5;
+ int child_ret;
+ uint8_t *buf = malloc(buf_size);
+ ssize_t len;
+ struct igt_helper_process child = {};
+
+ /* should be default, but just to be sure... */
+ write_u64_file("/proc/sys/dev/i915/perf_stream_paranoid", 1);
+
+ do {
+
+ igt_fork_helper(&child) {
+ struct drm_i915_perf_record_header *header;
+ drm_intel_bufmgr *bufmgr;
+ drm_intel_context *context0, *context1;
+ struct intel_batchbuffer *batch;
+ struct igt_buf src[3], dst[3];
+ drm_intel_bo *bo;
+ uint32_t *report0_32, *report1_32;
+ uint32_t *prev, *lprev = NULL;
+ uint64_t timestamp0_64, timestamp1_64;
+ uint32_t delta_ts64, delta_oa32;
+ uint64_t delta_ts64_ns, delta_oa32_ns;
+ uint32_t delta_delta;
+ int width = 800;
+ int height = 600;
+ uint32_t ctx_id = 0xffffffff; /* invalid handle */
+ uint32_t ctx1_id = 0xffffffff; /* invalid handle */
+ uint32_t current_ctx_id = 0xffffffff;
+ uint32_t n_invalid_ctx = 0;
+ int ret;
+ struct accumulator accumulator = {
+ .format = test_oa_format
+ };
+
+ bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
+ drm_intel_bufmgr_gem_enable_reuse(bufmgr);
+
+ for (int i = 0; i < ARRAY_SIZE(src); i++) {
+ scratch_buf_init(bufmgr, &src[i], width, height, 0xff0000ff);
+ scratch_buf_init(bufmgr, &dst[i], width, height, 0x00ff00ff);
+ }
+
+ batch = intel_batchbuffer_alloc(bufmgr, devid);
+
+ context0 = drm_intel_gem_context_create(bufmgr);
+ igt_assert(context0);
+
+ context1 = drm_intel_gem_context_create(bufmgr);
+ igt_assert(context1);
+
+ igt_debug("submitting warm up render_copy\n");
+
+ /* Submit some early, unmeasured, work to the context we want
+ * to measure to try and catch issues with i915-perf
+ * initializing the HW context ID for filtering.
+ *
+ * We do this because i915-perf single context filtering had
+ * previously only relied on a hook into context pinning to
+ * initialize the HW context ID, instead of also trying to
+ * determine the HW ID while opening the stream, in case it
+ * has already been pinned.
+ *
+ * This wasn't noticed by the previous unit test because we
+ * were opening the stream while the context hadn't been
+ * touched or pinned yet and so it worked out correctly to wait
+ * for the pinning hook.
+ *
+ * Now a buggy version of i915-perf will fail to measure
+ * anything for context0 once this initial render_copy() ends
+ * up pinning the context since there won't ever be a pinning
+ * hook callback.
+ */
+ render_copy(batch,
+ context0,
+ &src[0], 0, 0, width, height,
+ &dst[0], 0, 0);
+
+ ret = drm_intel_gem_context_get_id(context0, &ctx_id);
+ igt_assert_eq(ret, 0);
+ igt_assert_neq(ctx_id, 0xffffffff);
+ properties[1] = ctx_id;
+
+ scratch_buf_memset(src[0].bo, width, height, 0xff0000ff);
+ scratch_buf_memset(dst[0].bo, width, height, 0x00ff00ff);
+
+ igt_debug("opening i915-perf stream\n");
+ stream_fd = __perf_open(drm_fd, &param);
+
+ bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);