[PATCH igt] igt/perf: Busywait for MI_REPORT_PERF_COUNT results

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> · Fri, 8 Dec 2017 14:31:51 +0000

On Haswell, at least, MI_REPORT_PERF_COUNT is not flushed by the
PIPECONTROL surrounding the batch. (In theory, before the breadcrumb is
updated the CPU's view of memory is coherent with the GPU, i.e. all
writes have landed and are visible to userspace. This does not appear to
be the case for MI_REPORT_PERF_COUNT.)

As MI_RPC does not appear to be synchronized with the batch, busyspin for
its completion.

(This has far deeper implications, since it means the GPU can still be
writing to memory after release.)

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx>
Cc: Matthew Auld <matthew.auld@xxxxxxxxx>
---
 tests/perf.c | 93 +++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 41 deletions(-)

diff --git a/tests/perf.c b/tests/perf.c
index a161c45d7..8c20fbe09 100644
--- a/tests/perf.c
+++ b/tests/perf.c
@@ -706,47 +706,59 @@ emit_report_perf_count(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-i915_get_one_gpu_timestamp(uint32_t *context_id)
+i915_get_one_gpu_timestamp(void)
 {
-	drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
-	drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
-	drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
-	struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
-	int ret;
-	uint32_t timestamp;
-
-	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
-
-	if (context_id) {
-		ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
-		igt_assert_eq(ret, 0);
-	}
-
-	igt_assert(mi_rpc_ctx);
-	igt_assert(mi_rpc_bo);
-	igt_assert(mi_rpc_batch);
-
-	ret = drm_intel_bo_map(mi_rpc_bo, true);
-	igt_assert_eq(ret, 0);
-	memset(mi_rpc_bo->virtual, 0x80, 4096);
-	drm_intel_bo_unmap(mi_rpc_bo);
-
-	emit_report_perf_count(mi_rpc_batch,
-			       mi_rpc_bo, /* dst */
-			       0, /* dst offset in bytes */
-			       0xdeadbeef); /* report ID */
+	struct drm_i915_gem_execbuffer2 execbuf;
+	struct drm_i915_gem_exec_object2 obj[2];
+	struct drm_i915_gem_relocation_entry reloc;
+	uint32_t *ptr, timestamp;
+	struct timespec tv = {};
+	int i;
 
-	intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
+	memset(obj, 0, sizeof(obj));
+	obj[0].handle = gem_create(drm_fd, 4096);
+	ptr = gem_mmap__cpu(drm_fd, obj[0].handle, 0, 4096, PROT_WRITE);
+	memset(ptr, 0x80, 4096);
+	munmap(ptr, 4096);
+
+	obj[1].handle = gem_create(drm_fd, 4096);
+	obj[1].relocs_ptr = to_user_pointer(&reloc);
+	obj[1].relocation_count = 1;
+	ptr = gem_mmap__cpu(drm_fd, obj[1].handle, 0, 4096, PROT_WRITE);
+
+	memset(&reloc, 0, sizeof(reloc));
+	reloc.target_handle = obj[0].handle;
+	reloc.offset = sizeof(uint32_t);
+	reloc.read_domains = I915_GEM_DOMAIN_RENDER;
+	reloc.write_domain = I915_GEM_DOMAIN_RENDER;
+
+	i = 2;
+	ptr[0] = GEN6_MI_REPORT_PERF_COUNT;
+	if (intel_gen(devid) >= 8)
+		ptr[0]++, i++; /* 64b reloc */
+	ptr[i++] = 0xdeadbeef;
+	ptr[i] = MI_BATCH_BUFFER_END;
+	munmap(ptr, 4096);
+
+	memset(&execbuf, 0, sizeof(execbuf));
+	execbuf.buffers_ptr = to_user_pointer(obj);
+	execbuf.buffer_count = 2;
+	execbuf.batch_len = 4096;
+	gem_execbuf(drm_fd, &execbuf);
+	gem_close(drm_fd, obj[1].handle);
 
-	ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
-	igt_assert_eq(ret, 0);
-	timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
-	drm_intel_bo_unmap(mi_rpc_bo);
+	/*
+	 * MI_REPORT_PERF_COUNT is unserialised, i.e. not flushed by
+	 * the PIPECONTROLs surrounding batch execution. Ergo, we must
+	 * manually wait.
+	 */
+	do {
+		gem_read(drm_fd, obj[0].handle, sizeof(uint32_t),
+			 &timestamp, sizeof(timestamp));
+	} while (timestamp == 0x80808080 && !igt_seconds_elapsed(&tv));
+	gem_close(drm_fd, obj[0].handle);
 
-	drm_intel_bo_unreference(mi_rpc_bo);
-	intel_batchbuffer_free(mi_rpc_batch);
-	drm_intel_gem_context_destroy(mi_rpc_ctx);
-	drm_intel_bufmgr_destroy(bufmgr);
+	igt_assert_neq(timestamp, 0x80808080);
 
 	return timestamp;
 }
@@ -1915,7 +1927,6 @@ test_oa_exponents(void)
 			uint32_t n_reports = 0;
 			uint32_t n_idle_reports = 0;
 			uint32_t n_reads = 0;
-			uint32_t context_id;
 			uint64_t first_timestamp = 0;
 			bool check_first_timestamp = true;
 			struct drm_i915_perf_record_header *header;
@@ -1944,7 +1955,7 @@ test_oa_exponents(void)
 			 * first timestamp as way to filter previously
 			 * scheduled work that would have configured
 			 * the OA unit at a different period. */
-			first_timestamp = i915_get_one_gpu_timestamp(&context_id);
+			first_timestamp = i915_get_one_gpu_timestamp();
 
 			while (n_reads < ARRAY_SIZE(reads) &&
 			       n_reports < ARRAY_SIZE(reports)) {
@@ -2070,8 +2081,8 @@ test_oa_exponents(void)
 				uint32_t *rpt = NULL, *last = NULL, *last_periodic = NULL;
 
 				igt_debug(" > More than 5%% error: avg_ts_delta = %"PRIu64", delta_delta = %"PRIu64", "
-					  "expected_delta = %"PRIu64", first_timestamp = %"PRIu64" ctx_id=%"PRIu32"\n",
-					  average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp, context_id);
+					  "expected_delta = %"PRIu64", first_timestamp = %"PRIu64"\n",
+					  average_timestamp_delta, delta_delta, expected_timestamp_delta, first_timestamp);
 				for (int i = 0; i < (n_reports - 1); i++) {
 					/* XXX: calculating with u32 arithmetic to account for overflow */
 					uint32_t u32_delta =
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx