Quoting Tvrtko Ursulin (2017-09-25 16:15:01) > From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > > Add busy and busy-avg balancers which make balancing > decisions by looking at engine busyness via the i915 PMU. "And thus are able to make decisions on the actual instantaneous load of the system, and not use metrics that lag behind by a batch or two. In doing so, each client should be able to greedily maximise their own usage of the system, leading to improved load balancing even in the face of other uncooperative clients. On the other hand, we are only using the instantaneous load without coupling in the predictive factor for dispatch and execution length." Hmm, do you not want to sum busy + queued? Or at least compare! :) > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > benchmarks/gem_wsim.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 140 insertions(+) > > diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c > index 82fe6ba9ec5f..9ee91fabb220 100644 > --- a/benchmarks/gem_wsim.c > +++ b/benchmarks/gem_wsim.c > @@ -50,6 +50,7 @@ > #include "intel_io.h" > #include "igt_aux.h" > #include "igt_rand.h" > +#include "igt_perf.h" > #include "sw_sync.h" > > #include "ewma.h" > @@ -188,6 +189,16 @@ struct workload > uint32_t last[NUM_ENGINES]; > } rt; > }; > + > + struct busy_balancer { > + int fd; > + bool first; > + unsigned int num_engines; > + unsigned int engine_map[5]; > + uint64_t t_prev; > + uint64_t prev[5]; > + double busy[5]; > + } busy_balancer; > }; > > static const unsigned int nop_calibration_us = 1000; > @@ -993,6 +1004,8 @@ struct workload_balancer { > unsigned int flags; > unsigned int min_gen; > > + int (*init)(const struct workload_balancer *balancer, > + struct workload *wrk); > unsigned int (*get_qd)(const struct workload_balancer *balancer, > struct workload *wrk, > enum intel_engine_id engine); > @@ -1242,6 +1255,106 @@ context_balance(const struct workload_balancer *balancer, > return get_vcs_engine(wrk->ctx_list[w->context].static_vcs); > } > > +static unsigned int > +get_engine_busy(const struct workload_balancer *balancer, > + struct workload *wrk, enum intel_engine_id engine) > +{ > + struct busy_balancer *bb = &wrk->busy_balancer; > + > + if (engine == VCS2 && (wrk->flags & VCS2REMAP)) > + engine = BCS; > + > + return bb->busy[bb->engine_map[engine]]; > +} > + > +static void get_stats(const struct workload_balancer *b, struct workload *wrk) s/get_stats/get_pmu_stats/ ? > +{ > + struct busy_balancer *bb = &wrk->busy_balancer; > + uint64_t val[7]; > + unsigned int i; > + > + igt_assert_eq(read(bb->fd, val, sizeof(val)), sizeof(val)); > + > + if (!bb->first) { > + for (i = 0; i < bb->num_engines; i++) { > + double d; > + > + d = (val[2 + i] - bb->prev[i]) * 100; > + d /= val[1] - bb->t_prev; > + bb->busy[i] = d; > + } > + } > + > + for (i = 0; i < bb->num_engines; i++) > + bb->prev[i] = val[2 + i]; > + > + bb->t_prev = val[1]; > + bb->first = false; Ok. > +} > + > +static enum intel_engine_id > +busy_avg_balance(const struct workload_balancer *balancer, > + struct workload *wrk, struct w_step *w) > +{ > + get_stats(balancer, wrk); > + > + return qdavg_balance(balancer, wrk, w); > +} > + > +static enum intel_engine_id > +busy_balance(const struct workload_balancer *balancer, > + struct workload *wrk, struct w_step *w) > +{ > + get_stats(balancer, wrk); > + > + return qd_balance(balancer, wrk, w); > +} > + > +static int > +busy_init(const struct workload_balancer *balancer, struct workload *wrk) > +{ > + struct busy_balancer *bb = &wrk->busy_balancer; > + struct engine_desc { > + unsigned class, inst; > + enum intel_engine_id id; > + } *d, engines[] = { > + { I915_ENGINE_CLASS_RENDER, 0, RCS }, > + { I915_ENGINE_CLASS_COPY, 0, BCS }, > + { I915_ENGINE_CLASS_VIDEO, 0, VCS1 }, > + { I915_ENGINE_CLASS_VIDEO, 1, VCS2 }, > + { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS }, > + { 0, 0, VCS } > + }; > + > + bb->num_engines = 0; > + bb->first = true; > + bb->fd = -1; > + > + for (d = &engines[0]; d->id != VCS; d++) { > + int pfd; > + > + pfd = perf_i915_open_group(I915_PMU_ENGINE_BUSY(d->class, > + d->inst), > + bb->fd); > + if (pfd < 0) { > + if (d->id != VCS2) > + return -(10 + bb->num_engines); > + else > + continue; > + } > + > + if (bb->num_engines == 0) > + bb->fd = pfd; > + > + bb->engine_map[d->id] = bb->num_engines++; > + } > + > + if (bb->num_engines < 5 && !(wrk->flags & VCS2REMAP)) > + return -1; Hmm, feels a little sloppy. Would be more concrete if we have a list of engines were going to use, and then we either created a perf event for each or died. Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx