Quoting Tvrtko Ursulin (2018-02-15 11:53:44) > From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > > A subtest to verify that the engine busyness is reported with expected > accuracy on platforms where the feature is available. > > We test three patterns: 2%, 50% and 98% load per engine. > > v2: > * Use spin batch instead of nop calibration. > * Various tweaks. > > v3: > * Change loops to be time based. > * Use __igt_spin_batch_new inside timing sensitive loops. > * Fixed PWM sleep handling. > > v4: > * Use restarting spin batch. > * Calibrate more carefully by looking at the real PWM loop. > > v5: > * Made standalone. > * Better info messages. > * Tweak sleep compensation. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > tests/perf_pmu.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 174 insertions(+), 18 deletions(-) > > diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c > index a7501ca5f7a4..fa9b54793a52 100644 > --- a/tests/perf_pmu.c > +++ b/tests/perf_pmu.c > @@ -35,6 +35,7 @@ > #include <dirent.h> > #include <time.h> > #include <poll.h> > +#include <sched.h> > > #include "igt.h" > #include "igt_core.h" > @@ -385,6 +386,22 @@ busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > gem_quiescent_gpu(gem_fd); > } > > +static void > +__submit_spin_batch(int gem_fd, igt_spin_t *spin, > + const struct intel_execution_engine2 *e) > +{ > + struct drm_i915_gem_exec_object2 obj = { > + .handle = spin->handle > + }; > + struct drm_i915_gem_execbuffer2 eb = { > + .buffer_count = 1, > + .buffers_ptr = to_user_pointer(&obj), > + .flags = e2ring(gem_fd, e), > + }; > + > + gem_execbuf(gem_fd, &eb); > +} > + > static void > most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > const unsigned int num_engines, unsigned int flags) > @@ -405,15 +422,7 @@ most_busy_check_all(int gem_fd, const struct intel_execution_engine2 *e, > if (e == e_) { > idle_idx = i; > } else if (spin) { > - struct drm_i915_gem_exec_object2 obj = { > - .handle = spin->handle > - }; > - struct drm_i915_gem_execbuffer2 eb = { > - .buffer_count = 1, > - .buffers_ptr = to_user_pointer(&obj), > - .flags = e2ring(gem_fd, e_), > - }; > - gem_execbuf(gem_fd, &eb); > + __submit_spin_batch(gem_fd, spin, e_); > } else { > spin = igt_spin_batch_new(gem_fd, 0, > e2ring(gem_fd, e_), 0); > @@ -469,15 +478,7 @@ all_busy_check_all(int gem_fd, const unsigned int num_engines, > continue; > > if (spin) { > - struct drm_i915_gem_exec_object2 obj = { > - .handle = spin->handle > - }; > - struct drm_i915_gem_execbuffer2 eb = { > - .buffer_count = 1, > - .buffers_ptr = to_user_pointer(&obj), > - .flags = e2ring(gem_fd, e), > - }; > - gem_execbuf(gem_fd, &eb); > + __submit_spin_batch(gem_fd, spin, e); > } else { > spin = igt_spin_batch_new(gem_fd, 0, > e2ring(gem_fd, e), 0); > @@ -1390,6 +1391,150 @@ test_enable_race(int gem_fd, const struct intel_execution_engine2 *e) > gem_quiescent_gpu(gem_fd); > } > > +static double __error(double val, double ref) > +{ > + igt_assert(ref != 0.0); igt_assert(ref > 1e-5 /* smallval */) ? Pretty sure a negative ref is also cause for confusion :) > + return (100.0 * val / ref) - 100.0; > +} > + > +static void __rearm_spin_batch(igt_spin_t *spin) > +{ > + const uint32_t mi_arb_chk = 0x5 << 23; > + > + *spin->batch = mi_arb_chk; > + __sync_synchronize(); > +} > + > +#define div_round_up(a, b) (((a) + (b) - 1) / (b)) > + > +static void > +accuracy(int gem_fd, const struct intel_execution_engine2 *e, > + unsigned long target_busy_pct) > +{ > + const unsigned int min_test_loops = 7; > + const unsigned long min_test_us = 1e6; > + unsigned long busy_us = 2500; > + unsigned long idle_us = 100 * (busy_us - target_busy_pct * > + busy_us / 100) / target_busy_pct; > + unsigned long pwm_calibration_us; > + unsigned long test_us; > + double busy_r; > + uint64_t val[2]; > + uint64_t ts[2]; > + int fd; > + > + /* Sampling platforms cannot reach the high accuracy criteria. */ > + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); igt_require(gem_has_execlists(gem_fd)); > + > + while (idle_us < 2500) { > + busy_us *= 2; > + idle_us *= 2; > + } > + > + pwm_calibration_us = min_test_loops * (busy_us + idle_us); > + while (pwm_calibration_us < min_test_us) > + pwm_calibration_us += busy_us + idle_us; > + test_us = min_test_loops * (idle_us + busy_us); > + while (test_us < min_test_us) > + test_us += busy_us + idle_us; > + > + igt_info("calibration=%luus, test=%luus; busy=%luus, idle=%luus\n", > + pwm_calibration_us, test_us, busy_us, idle_us); Would also be nice to show the adjusted %%. > + assert_within_epsilon((double)busy_us / (busy_us + idle_us), > + (double)target_busy_pct / 100.0, tolerance); > + > + /* Emit PWM pattern on the engine from a child. */ > + igt_fork(child, 1) { > + struct sched_param rt = { .sched_priority = 99 }; > + const unsigned long timeout[] = { pwm_calibration_us * 1000, > + test_us * 2 * 1000 }; > + unsigned long sleep_busy = busy_us; > + unsigned long sleep_idle = idle_us; > + igt_spin_t *spin; > + > + /* We need the best sleep accuracy we can get. */ > + igt_require(sched_setscheduler(0, > + SCHED_FIFO | SCHED_RESET_ON_FORK, > + &rt) == 0); Can't use igt_require() or igt_assert() from children. So just igt_warn if not applied. Just SCHED_FIFO is enough as the child doesn't/shouldn't fork. > + > + /* Allocate our spin batch and idle it. */ > + spin = igt_spin_batch_new(gem_fd, 0, e2ring(gem_fd, e), 0); > + igt_spin_batch_end(spin); > + gem_sync(gem_fd, spin->handle); > + > + /* 1st pass is calibration, second pass is the test. */ > + for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) { > + unsigned long busy_ns = 0, idle_ns = 0; > + struct timespec test_start = { }; > + unsigned long loops = 0; > + double err_busy, err_idle; > + > + igt_nsec_elapsed(&test_start); > + do { > + struct timespec t_busy = { }; > + > + igt_nsec_elapsed(&t_busy); > + > + /* Restart the spinbatch. */ > + __rearm_spin_batch(spin); > + __submit_spin_batch(gem_fd, spin, e); > + measured_usleep(sleep_busy); > + igt_spin_batch_end(spin); > + gem_sync(gem_fd, spin->handle); > + > + busy_ns += igt_nsec_elapsed(&t_busy); > + > + idle_ns += measured_usleep(sleep_idle); > + > + loops++; > + } while (igt_nsec_elapsed(&test_start) < timeout[pass]); > + > + busy_ns = div_round_up(busy_ns, loops); > + idle_ns = div_round_up(idle_ns, loops); > + > + err_busy = __error(busy_ns / 1000, busy_us); > + err_idle = __error(idle_ns / 1000, idle_us); > + > + igt_info("%u: busy %lu/%lu %.2f%%, idle %lu/%lu %.2f%%\n", > + pass, > + busy_ns / 1000, busy_us, err_busy, > + idle_ns / 1000, idle_us, err_idle); Ok, makes sense. > + > + if (pass == 0) { > + sleep_busy = (double)busy_us - > + (double)busy_us * err_busy / 100.0; > + sleep_idle = (double)idle_us - > + (double)idle_us * err_idle / 100.0; > + igt_info("calibrated sleeps: busy=%lu, idle=%lu\n", > + sleep_busy, sleep_idle); > + } > + } > + > + igt_spin_batch_free(gem_fd, spin); > + } > + > + /* Let the child run. */ > + usleep(pwm_calibration_us * 2); > + > + /* Collect engine busyness for an interesting part of child runtime. */ > + fd = open_pmu(I915_PMU_ENGINE_BUSY(e->class, e->instance)); > + val[0] = __pmu_read_single(fd, &ts[0]); > + usleep(test_us / 2); > + val[1] = __pmu_read_single(fd, &ts[1]); > + close(fd); > + > + igt_waitchildren(); > + > + busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]); > + > + igt_info("error=%.2f%% (%.2f%% vs %lu%%)\n", > + __error(busy_r, target_busy_pct / 100.0), > + busy_r * 100.0, target_busy_pct); > + > + assert_within_epsilon(busy_r, (double)target_busy_pct / 100.0, 0.15); > +} A fine compromise! :) Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx