Re: [PATCH i-g-t 5/5] tests/perf_pmu: Tests for i915 PMU API

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Tue, 19 Sep 2017 09:37:35 +0100

On 18/09/2017 14:17, Chris Wilson wrote:
Quoting Tvrtko Ursulin (2017-09-18 12:38:40)
From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

A bunch of tests for the new i915 PMU feature.

Parts of the code were initially sketched by Dmitry Rogozhkin.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Dmitry Rogozhkin <dmitry.v.rogozhkin@xxxxxxxxx>
---
  lib/igt_gt.c           |  23 +-
  lib/igt_gt.h           |   8 +
  tests/Makefile.sources |   1 +
  tests/perf_pmu.c       | 713 +++++++++++++++++++++++++++++++++++++++++++++++++
  4 files changed, 738 insertions(+), 7 deletions(-)
  create mode 100644 tests/perf_pmu.c

diff --git a/lib/igt_gt.c b/lib/igt_gt.c
index b3f3b3809eee..102cc2841feb 100644
--- a/lib/igt_gt.c
+++ b/lib/igt_gt.c
@@ -537,14 +537,23 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd)
         return missed;
  }
  
+enum drm_i915_gem_engine_class {
+       I915_ENGINE_CLASS_OTHER = 0,
+       I915_ENGINE_CLASS_RENDER = 1,
+       I915_ENGINE_CLASS_COPY = 2,
+       I915_ENGINE_CLASS_VIDEO = 3,
+       I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+       I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
  const struct intel_execution_engine intel_execution_engines[] = {
-       { "default", NULL, 0, 0 },
-       { "render", "rcs0", I915_EXEC_RENDER, 0 },
-       { "bsd", "vcs0", I915_EXEC_BSD, 0 },
-       { "bsd1", "vcs0", I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
-       { "bsd2", "vcs1", I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
-       { "blt", "bcs0", I915_EXEC_BLT, 0 },
-       { "vebox", "vecs0", I915_EXEC_VEBOX, 0 },
+       { "default", NULL, -1, -1, 0, 0 },
+       { "render", "rcs0", I915_ENGINE_CLASS_RENDER, 0, I915_EXEC_RENDER, 0 },
+       { "bsd", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 0 },
+       { "bsd1", "vcs0", I915_ENGINE_CLASS_VIDEO, 0, I915_EXEC_BSD, 1<<13 /*I915_EXEC_BSD_RING1*/ },
+       { "bsd2", "vcs1", I915_ENGINE_CLASS_VIDEO, 1, I915_EXEC_BSD, 2<<13 /*I915_EXEC_BSD_RING2*/ },
+       { "blt", "bcs0", I915_ENGINE_CLASS_COPY, 0, I915_EXEC_BLT, 0 },
+       { "vebox", "vecs0", I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, I915_EXEC_VEBOX, 0 },
         { NULL, 0, 0 }

I was anticipating a new struct for the explicit interface so that we
can easily phase out the old one with its aliasing.

It's definitely buggy as it is, as Dmitry has discovered. I'll have a 
think on how to do it elegantly. Too bad we can't piggyback the 
class-instance execbuf onto this..

  };
  
diff --git a/lib/igt_gt.h b/lib/igt_gt.h
index 2579cbd37be7..436041ce9cc0 100644
--- a/lib/igt_gt.h
+++ b/lib/igt_gt.h
@@ -66,6 +66,8 @@ unsigned intel_detect_and_clear_missed_interrupts(int fd);
  extern const struct intel_execution_engine {
         const char *name;
         const char *full_name;
+       int class;
+       int instance;
         unsigned exec_id;
         unsigned flags;
  } intel_execution_engines[];
@@ -78,6 +80,12 @@ extern const struct intel_execution_engine {
              e__++) \
                 for_if (gem_has_ring(fd__, flags__ = e__->exec_id | e__->flags))
  
+#define for_each_engine_class_instance(fd__, e__) \
+       for ((e__) = intel_execution_engines;\
+            (e__)->name; \
+            (e__)++) \
+               for_if ((e__)->class > 0)
+
  bool gem_can_store_dword(int fd, unsigned int engine);
  
  #endif /* IGT_GT_H */
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index cf542df181a8..4bab6247151c 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -217,6 +217,7 @@ TESTS_progs = \
         kms_vblank \
         meta_test \
         perf \
+       perf_pmu \
         pm_backlight \
         pm_lpsp \
         pm_rc6_residency \
diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
new file mode 100644
index 000000000000..2dbee586dacc
--- /dev/null
+++ b/tests/perf_pmu.c
@@ -0,0 +1,713 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <time.h>
+#include <poll.h>
+
+#include "igt.h"
+#include "igt_perf.h"
+
+IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
+
+const double tolerance = 0.02f;
+const unsigned long batch_duration_ns = 1000 * 1000 * 1000 / 2;
+
+static void
+init(int gem_fd, const struct intel_execution_engine *e, uint8_t sample)
+{
+       uint64_t config = __I915_PMU_ENGINE(e->class, e->instance, sample);
+       int fd;
+
+       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));

gem_require_ring()

Missed the existence of it.



+
+       fd = perf_i915_open(config);

Although the kernel interface is the authority.

So this should be igt_require, and igt_assert(has_ring);

Don't get what you are saying, igt_require(has_ring) followed by 
igt_assert(has_ring)??


+       igt_assert(fd >= 0);
+
+       close(fd);
+}
+
+static uint64_t pmu_read_single(int fd)
+{
+       uint64_t data[2];
+       ssize_t len;
+
+       len = read(fd, data, sizeof(data));

Perf is a datagram api, right? A short read gives what you asked for and
discards the rest of the packet, iirc.

Nope, I've noticed overlay was failing due to that assumption and even 
traced the code in core perf which fails on short reads. Hence the patch in 
this series to fix overlay in that respect.


+       igt_assert_eq(len, sizeof(data));
+
+       return data[0];
+}
+
+static void pmu_read_multi(int fd, unsigned int num, uint64_t *val)
+{
+       uint64_t buf[2 + num];
+       unsigned int i;
+       ssize_t len;
+
+       len = read(fd, buf, sizeof(buf));
+       igt_assert_eq(len, sizeof(buf));
+       for (i = 0; i < num; i++)
+               val[i] = buf[2 + i];
+}
+
+#define assert_within_epsilon(x, ref, tolerance) \
+       igt_assert_f((double)(x) <= (1.0 + tolerance) * (double)ref && \
+                    (double)(x) >= (1.0 - tolerance) * (double)ref, \
+                    "'%s' != '%s' (%f not within %f%% tolerance of %f)\n",\
+                    #x, #ref, (double)x, tolerance * 100.0, (double)ref)
+
+static void
+single(int gem_fd, const struct intel_execution_engine *e, bool busy)
+{
+       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
+       double ref = busy ? batch_duration_ns : 0.0f;
+       igt_spin_t *spin;
+       uint64_t val;
+       int fd;
+
+       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+       if (busy) {
+               spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+               igt_spin_batch_set_timeout(spin, batch_duration_ns);
+       } else {
+               usleep(batch_duration_ns / 1000);
+       }
+
+       fd = perf_i915_open(config);
+       igt_assert(fd >= 0);
+
+       if (busy)
+               gem_sync(gem_fd, spin->handle);
+
+       val = pmu_read_single(fd);
+
+       assert_within_epsilon(val, ref, tolerance);
+
+       if (busy)
+               igt_spin_batch_free(gem_fd, spin);
+       close(fd);
+}
+
+static void
+busy_check_all(int gem_fd, const struct intel_execution_engine *e,

busy_check_others

busy_check_all I would expect to be checking that all engines are
correctly recorded as being busy at the same time. And there should also
be permutations of (busy, idle, wait) across the engines.

I can do that, sure. But it is checking all engines, just some for 100% 
busy, and some for 100% idle. :) Naming it "others" would then not be 
correct either.


+              const unsigned int num_engines)
+{
+       const struct intel_execution_engine *e_;
+       uint64_t val[num_engines];
+       int fd[2];
+       igt_spin_t *spin;
+       unsigned int busy_idx, i;
+
+       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+       igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+       i = 0;
+       fd[0] = -1;
+       for_each_engine_class_instance(fd, e_) {
+               if (!gem_has_ring(gem_fd, e_->exec_id | e_->flags))
+                       continue;
+
+               if (e == e_)
+                       busy_idx = i;
+
+               fd[i == 0 ? 0 : 1] =
+                       perf_i915_open_group(I915_PMU_ENGINE_BUSY(e_->class,
+                                                                 e_->instance),
+                                            fd[0]);
+               igt_assert(fd[0] > 0);
+               igt_assert(i == 0 || fd[1] > 0);
+               i++;
+       }
+
+       gem_sync(gem_fd, spin->handle);
+
+       pmu_read_multi(fd[0], num_engines, val);
+
+       assert_within_epsilon(val[busy_idx], batch_duration_ns, tolerance);
+       for (i = 0; i < num_engines; i++) {
+               if (i == busy_idx)
+                       continue;
+               assert_within_epsilon(val[i], 0.0f, tolerance);
+       }
+
+       igt_spin_batch_free(gem_fd, spin);
+       close(fd[0]);
+}

+static void
+no_sema(int gem_fd, const struct intel_execution_engine *e, bool busy)

This is just the sanity check half of the sema test.

No wait, no queued?

Forgot about queued completely!

And semaphores I left for later. I don't have any <gen9 machines to play 
with them locally.


+static void
+multi_client(int gem_fd, const struct intel_execution_engine *e)
+{
+       uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance);
+       igt_spin_t *spin;
+       uint64_t val[2];
+       int fd[2];
+
+       igt_require(gem_has_ring(gem_fd, e->exec_id | e->flags));
+
+       spin = igt_spin_batch_new(gem_fd, 0, e->exec_id | e->flags, 0);
+       igt_spin_batch_set_timeout(spin, batch_duration_ns);
+
+       fd[0] = perf_i915_open(config);
+       igt_assert(fd[0] >= 0);
+
+       usleep(batch_duration_ns / 4000);
+
+       fd[1] = perf_i915_open(config);
+       igt_assert(fd[1] >= 0);
+
+       usleep(batch_duration_ns / 3000);
+
+       val[1] = pmu_read_single(fd[1]);
+       close(fd[1]);
+
+       gem_sync(gem_fd, spin->handle);
+
+       val[0] = pmu_read_single(fd[0]);
+
+       assert_within_epsilon(val[0], batch_duration_ns, tolerance);
+       assert_within_epsilon(val[1], batch_duration_ns / 3, tolerance);
+
+       igt_spin_batch_free(gem_fd, spin);
+       close(fd[0]);
+}


Forgot to comment or changed your mind?

+static void cpu_hotplug(int gem_fd)
+{
+       struct timespec start, now;
+       igt_spin_t *spin;
+       uint64_t val, ref;
+       int fd;
+
+       igt_require(cpu0_hotplug_support());
+
+       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+       fd = perf_i915_open(I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0));
+       igt_assert(fd >= 0);
+
+       clock_gettime(CLOCK_MONOTONIC, &start);
+
+       igt_fork(child, 1) {
+               int cpu = 0;
+
+               for (;;) {
+                       char name[128];
+                       int cpufd;
+
+                       sprintf(name, "/sys/devices/system/cpu/cpu%d/online",
+                               cpu);
+                       cpufd = open(name, O_WRONLY);
+                       if (cpufd == -1) {
+                               igt_assert(cpu > 0);
+                               break;
+                       }
+                       igt_assert_eq(write(cpufd, "0", 2), 2);
+
+                       usleep(1000 * 1000);
+
+                       igt_assert_eq(write(cpufd, "1", 2), 2);
+
+                       close(cpufd);
+                       cpu++;
+               }
+       }
+
+       igt_waitchildren();
+
+       igt_spin_batch_end(spin);
+       gem_sync(gem_fd, spin->handle);
+
+       clock_gettime(CLOCK_MONOTONIC, &now);

Did we ever export the igt routines for probing supported clocks?
In this case, this fits into igt_nsec_elapsed.

Did not spot this one either.


+       val = pmu_read_single(fd);
+
+       ref = elapsed_ns(&start, &now);
+
+       assert_within_epsilon(val, ref, tolerance);
+
+       igt_spin_batch_free(gem_fd, spin);
+       close(fd);
+}
+
+static void
+test_interrupts(int gem_fd)
+{
+       igt_spin_t *spin;
+       uint64_t idle, busy, prev;
+       int fd;
+
+       fd = perf_i915_open(I915_PMU_INTERRUPTS);
+       igt_assert(fd >= 0);
+
+       gem_quiescent_gpu(gem_fd);
+       sleep(2);
+       prev = pmu_read_single(fd);
+       usleep(batch_duration_ns / 1000);
+       idle = pmu_read_single(fd);
+
+       igt_assert_eq(idle - prev, 0);
+
+       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+       igt_spin_batch_set_timeout(spin, batch_duration_ns);
+       gem_sync(gem_fd, spin->handle);

There's no guaranteed interrupt here.

Hm yes.. bugger. Use fences instead of spin batch to ensure some? Or 
extend spin batch API to support fences?


+
+       busy = pmu_read_single(fd);
+       igt_assert(busy > idle);
+
+       igt_spin_batch_free(gem_fd, spin);
+       close(fd);
+}
+
+static void
+test_frequency(int gem_fd)
+{
+       igt_spin_t *spin;
+       uint64_t idle[2], busy[2];
+       int fd;
+
+       fd = perf_i915_open_group(I915_PMU_REQUESTED_FREQUENCY, -1);
+       igt_assert(fd >= 0);

Ask the kernel if it is supported.

Yep.


+       igt_assert(perf_i915_open_group(I915_PMU_ACTUAL_FREQUENCY, fd) >= 0);
+
+       gem_quiescent_gpu(gem_fd);
+       usleep(batch_duration_ns / 1000);
+
+       pmu_read_multi(fd, 2, idle);
+
+       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+       igt_spin_batch_set_timeout(spin, batch_duration_ns);
+       gem_sync(gem_fd, spin->handle);
+
+       pmu_read_multi(fd, 2, busy);
+
+       igt_assert(busy[0] > idle[0]);
+       igt_assert(busy[1] > idle[1]);

Nothing guarantees busy[1] changes, it is hw/fw dependent.
busy[0] depends on user config.

Do we reasonably expect IGT to be run in such environments? Or change 
this to not expect a change but just compare against debugfs?


+
+       igt_spin_batch_free(gem_fd, spin);
+       close(fd);
+}
+

+static void
+test_rc6p(int gem_fd)
+{
+       const unsigned int devid = intel_get_drm_devid(gem_fd);
+       int64_t duration_ns = 2 * 1000 * 1000 * 1000;
+       unsigned int num_pmu = 1;
+       igt_spin_t *spin;
+       uint64_t idle[3], busy[3], prev[3];
+       unsigned int i;
+       int fd, ret;
+
+       igt_require(intel_gen(devid) < 8 && !IS_HASWELL(devid));

Ask the kernel. (Applies equally to rc6, rc6p).

What is the way to do this? Don't see these in get_param.

No rc6pp testing?

Copy and paste error.

+
+       fd = perf_i915_open_group(I915_PMU_RC6_RESIDENCY, -1);
+       igt_assert(fd >= 0);
+
+       ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
+       if (ret > 0) {
+               num_pmu++;
+               ret = perf_i915_open_group(I915_PMU_RC6p_RESIDENCY, fd);
+               if (ret > 0)
+                       num_pmu++;
+       }
+
+       gem_quiescent_gpu(gem_fd);
+       sleep(2);
+
+       pmu_read_multi(fd, num_pmu, prev);
+       usleep(duration_ns / 1000);
+       pmu_read_multi(fd, num_pmu, idle);
+
+       for (i = 0; i < num_pmu; i++)
+               assert_within_epsilon(idle[i] - prev[i], duration_ns,
+                                     tolerance);
+
+       spin = igt_spin_batch_new(gem_fd, 0, I915_EXEC_RENDER, 0);
+       igt_spin_batch_set_timeout(spin, duration_ns);

Are we sure the GPU isn't allowed to sleep? i915_user_forcewake we
expect to keep the GPU out of rc6.

I was sure, but was I wrong? :)

Regardless, replacing spin batch with a forcewake sounds simpler so I 
can do that.


+igt_main
+{
+       const unsigned int num_other_metrics =
+                               I915_PMU_LAST - __I915_PMU_OTHER(0) + 1;
+       unsigned int num_engines = 0;
+       int fd = -1;
+       const struct intel_execution_engine *e;
+       unsigned int i;
+
+       igt_fixture {
+               fd = drm_open_driver_master(DRIVER_INTEL);
+
+               igt_require_gem(fd);
+               igt_require(i915_type_id() > 0);
+
+               for_each_engine_class_instance(fd, e) {
+                       if (gem_has_ring(fd, e->exec_id | e->flags))
+                               num_engines++;
+               }
+       }
+
+       /**
+        * Test invalid access via perf API is rejected.
+        */

ARGH. No comments on the intentions of the code?

Will add.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx