New issues that were discovered while making the tests work on Gen8+:
- we need to measure timings between periodic reports and discard all
other kinds of reports
- it seems the periodicity of the reports can be affected outside of RC6
(frequency change); we can detect this by looking at the number of
clock cycles per timestamp delta
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx>
---
tests/perf.c | 734 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 600 insertions(+), 134 deletions(-)
diff --git a/tests/perf.c b/tests/perf.c
index ca69440d..e54a14ed 100644
--- a/tests/perf.c
+++ b/tests/perf.c
@@ -28,6 +28,7 @@
#include <fcntl.h>
#include <inttypes.h>
#include <errno.h>
+#include <signal.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/times.h>
@@ -306,6 +307,25 @@ static uint32_t (*read_report_ticks)(uint32_t *report,
static void (*sanity_check_reports)(uint32_t *oa_report0, uint32_t *oa_report1,
enum drm_i915_oa_format format);
+static bool
+timestamp_delta_within(uint32_t delta,
+ uint32_t expected_delta,
+ uint32_t margin)
+{
+ return delta >= (expected_delta - margin) &&
+ delta <= (expected_delta + margin);
+}
+
+static bool
+double_value_within(double value,
+ double expected,
+ double percent_margin)
+{
+ return value >= (expected - expected * percent_margin / 100.0) &&
+ value <= (expected + expected * percent_margin / 100.0);
+
+}
+
static void
__perf_close(int fd)
{
@@ -472,6 +492,20 @@ gen8_read_report_ticks(uint32_t *report, enum drm_i915_oa_format format)
return report[3];
}
+static void
+gen8_read_report_clock_ratios(uint32_t *report,
+ uint32_t *slice_freq_mhz,
+ uint32_t *unslice_freq_mhz)
+{
+ uint32_t unslice_freq = report[0] & 0x1ff;
+ uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
+ uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
+ uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
+
+ *slice_freq_mhz = (slice_freq * 16666) / 1000;
+ *unslice_freq_mhz = (unslice_freq * 16666) / 1000;
+}
+
static const char *
gen8_read_report_reason(const uint32_t *report)
{
@@ -494,29 +528,6 @@ gen8_read_report_reason(const uint32_t *report)
return "unknown";
}
-static bool
-oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
-{
- if (IS_HASWELL(devid)) {
- /* For Haswell we don't have a documented report reason field
- * (though empirically report[0] bit 10 does seem to correlate
- * with a timer trigger reason) so we instead infer which
- * reports are timer triggered by checking if the least
- * significant bits are zero and the exponent bit is set.
- */
- uint32_t oa_exponent_mask = (1 << (oa_exponent + 1)) - 1;
-
- if ((report[1] & oa_exponent_mask) != (1 << oa_exponent))
- return true;
- } else {
- if ((report[0] >> OAREPORT_REASON_SHIFT) &
- OAREPORT_REASON_TIMER)
- return true;
- }
-
- return false;
-}
-
static uint64_t
timebase_scale(uint32_t u32_delta)
{
@@ -563,6 +574,29 @@ oa_exponent_to_ns(int exponent)
return 1000000000ULL * (2ULL << exponent) / timestamp_frequency;
}
+static bool
+oa_report_is_periodic(uint32_t oa_exponent, const uint32_t *report)
+{
+ if (IS_HASWELL(devid)) {
+ /* For Haswell we don't have a documented report reason field
+ * (though empirically report[0] bit 10 does seem to correlate
+ * with a timer trigger reason) so we instead infer which
+ * reports are timer triggered by checking if the least
+ * significant bits are zero and the exponent bit is set.
+ */
+ uint32_t oa_exponent_mask = (1 << (oa_exponent + 1)) - 1;
+
+ if ((report[1] & oa_exponent_mask) == (1 << oa_exponent))
+ return true;
+ } else {
+ if ((report[0] >> OAREPORT_REASON_SHIFT) &
+ OAREPORT_REASON_TIMER)
+ return true;
+ }
+
+ return false;
+}
+
static bool
oa_report_ctx_is_valid(uint32_t *report)
{
@@ -578,6 +612,128 @@ oa_report_ctx_is_valid(uint32_t *report)
igt_assert(!"reached");
}
+static uint32_t
+oa_report_get_ctx_id(uint32_t *report)
+{
+ if (!oa_report_ctx_is_valid(report))
+ return 0xffffffff;
+ return report[2];
+}
+
+static double
+oa_reports_tick_per_period(uint32_t *report0, uint32_t *report1)
+{
+ if (intel_gen(devid) < 8)
+ return 0.0;
+
+ /* Measure the ratio of the GPU tick delta to the timestamp delta. */
+ return (double) (report1[3] - report0[3]) /
+ (double) (report1[1] - report0[1]);
+}
+
+static void
+scratch_buf_memset(drm_intel_bo *bo, int width, int height, uint32_t color)
+{
+ int ret;
+
+ ret = drm_intel_bo_map(bo, true /* writable */);
+ igt_assert_eq(ret, 0);
+
+ for (int i = 0; i < width * height; i++)
+ ((uint32_t *)bo->virtual)[i] = color;
+
+ drm_intel_bo_unmap(bo);
+}
+
+static void
+scratch_buf_init(drm_intel_bufmgr *bufmgr,
+ struct igt_buf *buf,
+ int width, int height,
+ uint32_t color)
+{
+ size_t stride = width * 4;
+ size_t size = stride * height;
+ drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "", size, 4096);
+
+ scratch_buf_memset(bo, width, height, color);
+
+ buf->bo = bo;
+ buf->stride = stride;
+ buf->tiling = I915_TILING_NONE;
+ buf->size = size;
+}
+
+static void
+emit_report_perf_count(struct intel_batchbuffer *batch,
+ drm_intel_bo *dst_bo,
+ int dst_offset,
+ uint32_t report_id)
+{
+ if (IS_HASWELL(devid)) {
+ BEGIN_BATCH(3, 1);
+ OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT);
+ OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ dst_offset);
+ OUT_BATCH(report_id);
+ ADVANCE_BATCH();
+ } else {
+ /* XXX: NB: n dwords arg is actually magic since it internally
+ * automatically accounts for larger addresses on gen >= 8...
+ */
+ BEGIN_BATCH(3, 1);
+ OUT_BATCH(GEN8_MI_REPORT_PERF_COUNT);
+ OUT_RELOC(dst_bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ dst_offset);
+ OUT_BATCH(report_id);
+ ADVANCE_BATCH();
+ }
+}
+
+static uint32_t
+i915_get_one_gpu_timestamp(uint32_t *context_id)
+{
+ drm_intel_bufmgr *bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
+ drm_intel_context *mi_rpc_ctx = drm_intel_gem_context_create(bufmgr);
+ drm_intel_bo *mi_rpc_bo = drm_intel_bo_alloc(bufmgr, "mi_rpc dest bo", 4096, 64);
+ struct intel_batchbuffer *mi_rpc_batch = intel_batchbuffer_alloc(bufmgr, devid);
+ int ret;
+ uint32_t timestamp;
+
+ drm_intel_bufmgr_gem_enable_reuse(bufmgr);
+
+ if (context_id) {
+ ret = drm_intel_gem_context_get_id(mi_rpc_ctx, context_id);
+ igt_assert_eq(ret, 0);
+ }
+
+ igt_assert(mi_rpc_ctx);
+ igt_assert(mi_rpc_bo);
+ igt_assert(mi_rpc_batch);
+
+ ret = drm_intel_bo_map(mi_rpc_bo, true);
+ igt_assert_eq(ret, 0);
+ memset(mi_rpc_bo->virtual, 0x80, 4096);
+ drm_intel_bo_unmap(mi_rpc_bo);
+
+ emit_report_perf_count(mi_rpc_batch,
+ mi_rpc_bo, /* dst */
+ 0, /* dst offset in bytes */
+ 0xdeadbeef); /* report ID */
+
+ intel_batchbuffer_flush_with_context(mi_rpc_batch, mi_rpc_ctx);
+
+ ret = drm_intel_bo_map(mi_rpc_bo, false /* write enable */);
+ igt_assert_eq(ret, 0);
+ timestamp = ((uint32_t *)mi_rpc_bo->virtual)[1];
+ drm_intel_bo_unmap(mi_rpc_bo);
+
+ drm_intel_bo_unreference(mi_rpc_bo);
+ intel_batchbuffer_free(mi_rpc_batch);
+ drm_intel_gem_context_destroy(mi_rpc_ctx);
+ drm_intel_bufmgr_destroy(bufmgr);
+
+ return timestamp;
+}
static void
hsw_sanity_check_render_basic_reports(uint32_t *oa_report0, uint32_t *oa_report1,
@@ -1032,7 +1188,6 @@ i915_read_reports_until_timestamp(enum drm_i915_oa_format oa_format,
return total_len;
}
-
/* CAP_SYS_ADMIN is required to open system wide metrics, unless the system
* control parameter dev.i915.perf_stream_paranoid == 0 */
static void
@@ -1347,20 +1502,6 @@ open_and_read_2_oa_reports(int format_id,
__perf_close(stream_fd);
}
-static void
-gen8_read_report_clock_ratios(uint32_t *report,
- uint32_t *slice_freq_mhz,
- uint32_t *unslice_freq_mhz)
-{
- uint32_t unslice_freq = report[0] & 0x1ff;
- uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
- uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
- uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
-
- *slice_freq_mhz = (slice_freq * 16666) / 1000;
- *unslice_freq_mhz = (unslice_freq * 16666) / 1000;
-}
-
static void
print_reports(uint32_t *oa_report0, uint32_t *oa_report1, int fmt)
{
@@ -1552,74 +1693,457 @@ test_oa_formats(void)
}
}
+
+enum load {
+ LOW,
+ HIGH
+};
+
+#define LOAD_HELPER_PAUSE_USEC 500
+
+static struct load_helper {
+ int devid;
+ int has_ppgtt;
+ drm_intel_bufmgr *bufmgr;
+ drm_intel_context *context;
+ uint32_t context_id;
+ struct intel_batchbuffer *batch;
+ drm_intel_bo *target_buffer;
+
+static void load_helper_signal_handler(int sig)
+{
+ if (sig == SIGUSR2)
+ lh.load = lh.load == LOW ? HIGH : LOW;
+ else
+ lh.exit = true;
+}
+
+static void load_helper_set_load(enum load load)
+{
+ igt_assert(lh.igt_proc.running);
+
+ if (lh.load == load)
+ return;
+
+ lh.load = load;
+ kill(lh.igt_proc.pid, SIGUSR2);
+}
+
+static void load_helper_run(enum load load)
+{
+ /*
+ * FIXME fork helpers won't get cleaned up when started from within a
+ * subtest, so handle the case where it sticks around a bit too long.
+ */
+ if (lh.igt_proc.running) {
+ load_helper_set_load(load);
+ return;
+ }
+
+ lh.load = load;
+
+ igt_fork_helper(&lh.igt_proc) {
+ signal(SIGUSR1, load_helper_signal_handler);
+ signal(SIGUSR2, load_helper_signal_handler);
+
+ while (!lh.exit) {
+ int ret;
+
+ render_copy(lh.batch,
+ lh.context,
+ &lh.src, 0, 0, 1920, 1080,
+ &lh.dst, 0, 0);
+
+ intel_batchbuffer_flush_with_context(lh.batch,
+ lh.context);
+
+ ret = drm_intel_gem_context_get_id(lh.context,
+ &lh.context_id);
+ igt_assert_eq(ret, 0);
+
+ drm_intel_bo_wait_rendering(lh.dst.bo);
+
+ /* Lower the load by pausing after every submitted
+ * write. */
+ if (lh.load == LOW)
+ usleep(LOAD_HELPER_PAUSE_USEC);
+ }
+ }
+}
+
+static void load_helper_stop(void)
+{
+ kill(lh.igt_proc.pid, SIGUSR1);
+ igt_assert(igt_wait_helper(&lh.igt_proc) == 0);
+}
+
+static void load_helper_init(void)
+{
+ int ret;
+
+ lh.devid = intel_get_drm_devid(drm_fd);
+ lh.has_ppgtt = gem_uses_ppgtt(drm_fd);
+
+ /* MI_STORE_DATA can only use GTT address on gen4+/g33 and needs
+ * snoopable mem on pre-gen6. Hence load-helper only works on gen6+, but
+ * that's also all we care about for the rps testcase. */
+ igt_assert(intel_gen(lh.devid) >= 6);
+ lh.bufmgr = drm_intel_bufmgr_gem_init(drm_fd, 4096);
+ igt_assert(lh.bufmgr);
+
+ drm_intel_bufmgr_gem_enable_reuse(lh.bufmgr);
+
+ lh.context = drm_intel_gem_context_create(lh.bufmgr);
+ igt_assert(lh.context);
+
+ lh.context_id = 0xffffffff;
+ ret = drm_intel_gem_context_get_id(lh.context, &lh.context_id);
+ igt_assert_eq(ret, 0);
+ igt_assert_neq(lh.context_id, 0xffffffff);
+
+ lh.batch = intel_batchbuffer_alloc(lh.bufmgr, lh.devid);
+ igt_assert(lh.batch);
+
+ scratch_buf_init(lh.bufmgr, &lh.dst, 1920, 1080, 0);
+ scratch_buf_init(lh.bufmgr, &lh.src, 1920, 1080, 0);
+}
+
+static void load_helper_fini(void)
+{
+ if (lh.igt_proc.running)
+ load_helper_stop();
+
+ if (lh.src.bo)
+ drm_intel_bo_unreference(lh.src.bo);
+ if (lh.dst.bo)
+ drm_intel_bo_unreference(lh.dst.bo);
+
+ if (lh.batch)
+ intel_batchbuffer_free(lh.batch);
+
+ if (lh.context)
+ drm_intel_gem_context_destroy(lh.context);
+
+ if (lh.bufmgr)
+ drm_intel_bufmgr_destroy(lh.bufmgr);
+}
+
static void
test_oa_exponents(void)
{
- igt_debug("Testing OA timer exponents\n");
+ load_helper_init();
+ load_helper_run(HIGH);
/* It's asking a lot to sample with a 160 nanosecond period and the
* test can fail due to buffer overflows if it wasn't possible to
* keep up, so we don't start from an exponent of zero...
*/
- for (int i = 5; i < 20; i++) {
- uint32_t expected_timestamp_delta;
- uint32_t timestamp_delta;
- uint32_t oa_report0[64];
- uint32_t oa_report1[64];
+ for (int exponent = 5; exponent < 18; exponent++) {
+ uint64_t expected_timestamp_delta;
uint32_t time_delta;
- uint32_t clock_delta;
- uint32_t freq;
int n_tested = 0;
+ int n_time_delta_matches = 0;
/* The exponent is effectively selecting a bit in the timestamp
* to trigger reports on and so in practice we expect the raw
* timestamp deltas for periodic reports to exactly match the
* value of next bit.
*/
- expected_timestamp_delta = 2 << i;
+ expected_timestamp_delta = 2UL << exponent;
for (int j = 0; n_tested < 10 && j < 100; j++) {
- uint32_t ticks0, ticks1;
-
- igt_debug("ITER %d: testing OA exponent %d (period = %"PRIu64"ns)\n",
- j, i, oa_exponent_to_ns(i));
-
- open_and_read_2_oa_reports(test_oa_format,
- i, /* exponent */
- oa_report0,
- oa_report1,
- true); /* timer triggered
- reports only */
-
- timestamp_delta = oa_report1[1] - oa_report0[1];
- igt_assert_neq(timestamp_delta, 0);
-
- if (timestamp_delta != expected_timestamp_delta) {
- igt_debug("timestamp0 = %u/0x%x\n",
- oa_report0[1], oa_report0[1]);
- igt_debug("timestamp1 = %u/0x%x\n",
- oa_report1[1], oa_report1[1]);
+ uint64_t properties[] = {
+ /* Include OA reports in samples */
+ DRM_I915_PERF_PROP_SAMPLE_OA, true,
+
+ /* OA unit configuration */
+ DRM_I915_PERF_PROP_OA_METRICS_SET, test_metric_set_id,
+ DRM_I915_PERF_PROP_OA_FORMAT, test_oa_format,
+ DRM_I915_PERF_PROP_OA_EXPONENT, exponent,
+ };
+ struct drm_i915_perf_open_param param = {
+ .flags = I915_PERF_FLAG_FD_CLOEXEC,
+ .num_properties = ARRAY_SIZE(properties) / 2,
+ .properties_ptr = to_user_pointer(properties),
+ };
+ int ret;
+ uint64_t average_timestamp_delta;
+ uint32_t n_reports = 0;
+ uint32_t n_report_lost = 0;
+ uint32_t n_idle_reports = 0;
+ uint32_t n_reads = 0;
+ uint32_t context_id;
+ uint64_t first_timestamp = 0;
+ bool check_first_timestamp = true;
+ struct drm_i915_perf_record_header *header;
+ uint64_t delta_delta;
+ struct {
+ uint32_t report[64];
+ } reports[30];
+ struct {
+ uint8_t *buf;
+ size_t len;
+ } reads[1000];
+ double error;
+ double tick_per_period;
+
+ igt_debug("ITER %d: testing OA exponent %d,"
+ " expected ts delta = %"PRIu64" (%"PRIu64"ns/%.2fus/%.2fms)\n",
+ j, exponent,
+ expected_timestamp_delta,
+ oa_exponent_to_ns(exponent),
+ oa_exponent_to_ns(exponent) / 1000.0,
+ oa_exponent_to_ns(exponent) / (1000.0 * 1000.0));
+
+ stream_fd = __perf_open(drm_fd, ¶m);
+
+ /* Right after opening the OA stream, read a
+ * first timestamp as way to filter previously
+ * scheduled work that would have configured
+ * the OA unit at a different period. */
+ first_timestamp = i915_get_one_gpu_timestamp(&context_id);
+
+ while (n_reads < ARRAY_SIZE(reads) &&
+ n_reports < ARRAY_SIZE(reports)) {
+ const size_t buf_size = 1024 * 1024;
+ uint8_t *buf = reads[n_reads++].buf = calloc(1, buf_size);
+
+ while ((ret = read(stream_fd, buf, buf_size)) < 0 &&
+ errno == EINTR)
+ ;
+
+ /* We should never have no data. */
+ igt_assert(ret > 0);
+ reads[n_reads - 1].len = ret;
+
+ igt_debug(" > read %i bytes\n", ret);
+
+ for (int offset = 0;
+ offset < ret && n_reports < ARRAY_SIZE(reports);
+ offset += header->size) {
+ uint32_t *report;
+ double previous_tick_per_period;
+
+ header = (void *)(buf + offset);
+
+ if (header->type == DRM_I915_PERF_RECORD_OA_BUFFER_LOST) {
+ igt_assert(!"reached");
+ break;
+ }
+
+ if (header->type == DRM_I915_PERF_RECORD_OA_REPORT_LOST) {
+ n_report_lost++;
+ n_reports = 0;
+ n_report_lost = 0;