Quoting Tvrtko Ursulin (2019-05-15 11:49:45) > > On 08/05/2019 11:09, Chris Wilson wrote: > > Exercise the in-kernel load balancer checking that we can distribute > > batches across the set of ctx->engines to avoid load. > > > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > > --- > > tests/Makefile.am | 1 + > > tests/Makefile.sources | 1 + > > tests/i915/gem_exec_balancer.c | 1050 ++++++++++++++++++++++++++++++++ > > tests/meson.build | 7 + > > 4 files changed, 1059 insertions(+) > > create mode 100644 tests/i915/gem_exec_balancer.c > > > > diff --git a/tests/Makefile.am b/tests/Makefile.am > > index 5097debf6..c6af0aeaf 100644 > > --- a/tests/Makefile.am > > +++ b/tests/Makefile.am > > @@ -96,6 +96,7 @@ gem_close_race_LDADD = $(LDADD) -lpthread > > gem_ctx_thrash_CFLAGS = $(AM_CFLAGS) $(THREAD_CFLAGS) > > gem_ctx_thrash_LDADD = $(LDADD) -lpthread > > gem_ctx_sseu_LDADD = $(LDADD) $(top_builddir)/lib/libigt_perf.la > > +i915_gem_exec_balancer_LDADD = $(LDADD) $(top_builddir)/lib/libigt_perf.la > > gem_exec_capture_LDADD = $(LDADD) -lz > > gem_exec_parallel_CFLAGS = $(AM_CFLAGS) $(THREAD_CFLAGS) > > gem_exec_parallel_LDADD = $(LDADD) -lpthread > > diff --git a/tests/Makefile.sources b/tests/Makefile.sources > > index e7ee27e81..323b625aa 100644 > > --- a/tests/Makefile.sources > > +++ b/tests/Makefile.sources > > @@ -24,6 +24,7 @@ TESTS_progs = \ > > i915/gem_ctx_clone \ > > i915/gem_ctx_engines \ > > i915/gem_ctx_shared \ > > + i915/gem_exec_balancer \ > > i915/gem_vm_create \ > > kms_3d \ > > kms_addfb_basic \ > > diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c > > new file mode 100644 > > index 000000000..25195d478 > > --- /dev/null > > +++ b/tests/i915/gem_exec_balancer.c > > @@ -0,0 +1,1050 @@ > > +/* > > + * Copyright © 2018 Intel Corporation > > 2019 I guess, even though work was started in 2018? > > > + * > > + * Permission is hereby granted, free of charge, to any person obtaining a > > + * copy of this software and associated documentation files (the "Software"), > > + * to deal in the Software without restriction, including without limitation > > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > > + * and/or sell copies of the Software, and to permit persons to whom the > > + * Software is furnished to do so, subject to the following conditions: > > + * > > + * The above copyright notice and this permission notice (including the next > > + * paragraph) shall be included in all copies or substantial portions of the > > + * Software. > > + * > > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS > > + * IN THE SOFTWARE. > > + */ > > + > > +#include <sched.h> > > + > > +#include "igt.h" > > +#include "igt_perf.h" > > +#include "i915/gem_ring.h" > > +#include "sw_sync.h" > > + > > +IGT_TEST_DESCRIPTION("Exercise in-kernel load-balancing"); > > + > > +#define INSTANCE_COUNT (1 << I915_PMU_SAMPLE_INSTANCE_BITS) > > Hmm.. this is a strange surrogate but I guess it works. 
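It works, but only as a stopgap: the probe below simply asks the PMU, for every possible class/instance pair, whether a busy-counter exists. Once a real engine discovery query lands, the enumeration could collapse into something like this (a sketch against the proposed DRM_I915_QUERY_ENGINE_INFO uapi; the struct and field names are from the in-flight query series, so treat them as illustrative):

/* Sketch only: assumes the proposed engine discovery query uapi. */
static struct i915_engine_class_instance *
query_engines(int i915, unsigned int *out)
{
	struct drm_i915_query_item item = {
		.query_id = DRM_I915_QUERY_ENGINE_INFO,
	};
	struct drm_i915_query q = {
		.num_items = 1,
		.items_ptr = to_user_pointer(&item),
	};
	struct drm_i915_query_engine_info *info;
	struct i915_engine_class_instance *ci;

	/* First pass with data_ptr unset: the kernel reports the size needed. */
	if (igt_ioctl(i915, DRM_IOCTL_I915_QUERY, &q) || item.length <= 0) {
		*out = 0;
		return NULL;
	}

	info = calloc(1, item.length);
	igt_assert(info);

	/* Second pass copies out one record per engine. */
	item.data_ptr = to_user_pointer(info);
	do_ioctl(i915, DRM_IOCTL_I915_QUERY, &q);

	ci = malloc(info->num_engines * sizeof(*ci));
	igt_assert(ci);
	for (unsigned int n = 0; n < info->num_engines; n++)
		ci[n] = info->engines[n].engine;

	*out = info->num_engines;
	free(info);
	return ci;
}

Until then, the PMU probe is the only enumerator that works uniformly across kernels.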
> > > + > > +static bool has_class_instance(int i915, uint16_t class, uint16_t instance) > > +{ > > + int fd; > > + > > + fd = perf_i915_open(I915_PMU_ENGINE_BUSY(class, instance)); > > More work for Andi to replace with real engine discovery. :) > > > + if (fd != -1) { > > + close(fd); > > + return true; > > + } > > + > > + return false; > > +} > > + > > +static struct i915_engine_class_instance * > > +list_engines(int i915, uint32_t class_mask, unsigned int *out) > > +{ > > + unsigned int count = 0, size = 64; > > + struct i915_engine_class_instance *engines; > > + > > + engines = malloc(size * sizeof(*engines)); > > + if (!engines) { > > + *out = 0; > > + return NULL; > > + } > > + > > + for (enum drm_i915_gem_engine_class class = I915_ENGINE_CLASS_RENDER; > > + class_mask; > > + class++, class_mask >>= 1) { > > + if (!(class_mask & 1)) > > + continue; > > + > > + for (unsigned int instance = 0; > > + instance < INSTANCE_COUNT; > > + instance++) { > > + if (!has_class_instance(i915, class, instance)) > > + continue; > > + > > + if (count == size) { > > + struct i915_engine_class_instance *e; > > + > > + size *= 2; > > + e = realloc(engines, size*sizeof(*engines)); > > + if (!e) { > > I'd just assert. On malloc as well. > > > + *out = count; > > + return engines; > > + } > > + > > + engines = e; > > + } > > + > > + engines[count++] = (struct i915_engine_class_instance){ > > + .engine_class = class, > > + .engine_instance = instance, > > + }; > > + } > > + } > > + > > + if (!count) { > > + free(engines); > > + engines = NULL; > > + } > > + > > + *out = count; > > + return engines; > > +} > > + > > +static int __set_load_balancer(int i915, uint32_t ctx, > > + const struct i915_engine_class_instance *ci, > > + unsigned int count) > > +{ > > + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balancer, count); > > + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1 + count); > > + struct drm_i915_gem_context_param p = { > > + .ctx_id = ctx, > > + .param = I915_CONTEXT_PARAM_ENGINES, > > + .size = sizeof(engines), > > + .value = to_user_pointer(&engines) > > + }; > > + > > + memset(&balancer, 0, sizeof(balancer)); > > + balancer.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE; > > + > > + igt_assert(count); > > + balancer.num_siblings = count; > > + memcpy(balancer.engines, ci, count * sizeof(*ci)); > > + > > + memset(&engines, 0, sizeof(engines)); > > + engines.extensions = to_user_pointer(&balancer); > > + engines.engines[0].engine_class = > > + I915_ENGINE_CLASS_INVALID; > > + engines.engines[0].engine_instance = > > + I915_ENGINE_CLASS_INVALID_NONE; > > + memcpy(engines.engines + 1, ci, count * sizeof(*ci)); > > + > > + return __gem_context_set_param(i915, &p); > > +} > > + > > +static void set_load_balancer(int i915, uint32_t ctx, > > + const struct i915_engine_class_instance *ci, > > + unsigned int count) > > +{ > > + igt_assert_eq(__set_load_balancer(i915, ctx, ci, count), 0); > > +} > > + > > +static uint32_t load_balancer_create(int i915, > > + const struct i915_engine_class_instance *ci, > > + unsigned int count) > > +{ > > + uint32_t ctx; > > + > > + ctx = gem_context_create(i915); > > + set_load_balancer(i915, ctx, ci, count); > > + > > + return ctx; > > +} > > + > > +static uint32_t __batch_create(int i915, uint32_t offset) > > +{ > > + const uint32_t bbe = MI_BATCH_BUFFER_END; > > + uint32_t handle; > > + > > + handle = gem_create(i915, ALIGN(offset + 4, 4096)); > > + gem_write(i915, handle, offset, &bbe, sizeof(bbe)); > > + > > + return handle; > > +} > > + > > +static uint32_t 
batch_create(int i915) > > +{ > > + return __batch_create(i915, 0); > > +} > > + > > +static void invalid_balancer(int i915) > > +{ > > + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balancer, 64); > > + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 64); > > + struct drm_i915_gem_context_param p = { > > + .param = I915_CONTEXT_PARAM_ENGINES, > > + .value = to_user_pointer(&engines) > > + }; > > + uint32_t handle; > > + void *ptr; > > + > > + /* > > + * Assume that I915_CONTEXT_PARAM_ENGINE validates the array > > + * of engines[], our job is to determine if the load_balancer > > + * extension explodes. > > + */ > > + > > + for (int class = 0; class < 32; class++) { > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + > > + ci = list_engines(i915, 1 << class, &count); > > + if (!ci) > > + continue; > > + > > + igt_debug("Found %d engines\n", count); > > + igt_assert_lte(count, 64); > > Hey.. you always say trust the kernel! ;) This code was placeholder that you said you would replace by a proper query api... > > > + > > + p.ctx_id = gem_context_create(i915); > > + p.size = (sizeof(struct i915_context_param_engines) + > > + (count + 1) * sizeof(*engines.engines)); > > Alignment looks off. > > > + > > + memset(&engines, 0, sizeof(engines)); > > + engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID; > > + engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE; > > + memcpy(engines.engines + 1, ci, count * sizeof(*ci)); > > + gem_context_set_param(i915, &p); > > + > > + engines.extensions = -1ull; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + > > + engines.extensions = 1ull; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + > > + memset(&balancer, 0, sizeof(balancer)); > > + balancer.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE; > > + balancer.num_siblings = count; > > + memcpy(balancer.engines, ci, count * sizeof(*ci)); > > + > > + engines.extensions = to_user_pointer(&balancer); > > + gem_context_set_param(i915, &p); > > + > > + balancer.engine_index = 1; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST); > > + > > + balancer.engine_index = count; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST); > > + > > + balancer.engine_index = count + 1; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EINVAL); > > + > > + balancer.engine_index = 0; > > + gem_context_set_param(i915, &p); > > + > > + balancer.base.next_extension = to_user_pointer(&balancer); > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST); > > + > > + balancer.base.next_extension = -1ull; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + > > + handle = gem_create(i915, 4096 * 3); > > + ptr = gem_mmap__gtt(i915, handle, 4096 * 3, PROT_WRITE); > > + gem_close(i915, handle); > > + > > + memset(&engines, 0, sizeof(engines)); > > + engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID; > > + engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE; > > + engines.engines[1].engine_class = I915_ENGINE_CLASS_INVALID; > > + engines.engines[1].engine_instance = I915_ENGINE_CLASS_INVALID_NONE; > > + memcpy(engines.engines + 2, ci, count * sizeof(ci)); > > + p.size = (sizeof(struct i915_context_param_engines) + > > + (count + 2) * sizeof(*engines.engines)); > > Alignment again. 
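As an aside for anyone decoding the pointer-chasing asserts in this subtest: extensions is the head of a userspace-resident linked list which the kernel walks via base.next_extension until it reads zero. A well-formed two-deep chain looks roughly like this (hypothetical helper, sketch only; it mirrors what indicies() does further down):

/* Sketch: chain two load-balance extensions through base.next_extension. */
static int chain_two_balancers(int i915, uint32_t ctx,
			       const struct i915_engine_class_instance *ci)
{
	I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(first, 2);
	I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(second, 2);
	I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 2);
	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx,
		.param = I915_CONTEXT_PARAM_ENGINES,
		.size = sizeof(engines),
		.value = to_user_pointer(&engines),
	};

	memset(&first, 0, sizeof(first));
	first.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
	first.base.next_extension = to_user_pointer(&second);	/* link */
	first.engine_index = 0;		/* claims slot 0 */
	first.num_siblings = 2;
	memcpy(first.engines, ci, 2 * sizeof(*ci));

	memset(&second, 0, sizeof(second));
	second.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
	second.base.next_extension = 0;	/* zero terminates the walk */
	second.engine_index = 1;	/* claims slot 1 */
	second.num_siblings = 2;
	memcpy(second.engines, ci, 2 * sizeof(*ci));

	memset(&engines, 0, sizeof(engines));
	engines.extensions = to_user_pointer(&first);
	/* Both slots start as INVALID placeholders for the balancers. */
	engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID;
	engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;
	engines.engines[1].engine_class = I915_ENGINE_CLASS_INVALID;
	engines.engines[1].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;

	return __gem_context_set_param(i915, &p);
}

A next_extension that points back at an already-visited extension revisits a claimed engine_index and so fails with -EEXIST, while a wild pointer fails the user copy with -EFAULT; that is exactly what the asserts on either side of this hunk are probing.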
> > > + gem_context_set_param(i915, &p); > > + > > + balancer.base.next_extension = 0; > > + balancer.engine_index = 1; > > + engines.extensions = to_user_pointer(&balancer); > > + gem_context_set_param(i915, &p); > > + > > + memcpy(ptr + 4096 - 8, &balancer, sizeof(balancer)); > > + memcpy(ptr + 8192 - 8, &balancer, sizeof(balancer)); > > + balancer.engine_index = 0; > > + > > + engines.extensions = to_user_pointer(ptr) + 4096 - 8; > > + gem_context_set_param(i915, &p); > > + > > + balancer.base.next_extension = engines.extensions; > > + engines.extensions = to_user_pointer(&balancer); > > + gem_context_set_param(i915, &p); > > mmap_gtt and unmapped area testing in one? Neighbouring. > > + munmap(ptr, 4096); > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + engines.extensions = to_user_pointer(ptr) + 4096 - 8; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + > > + engines.extensions = to_user_pointer(ptr) + 8192 - 8; > > + gem_context_set_param(i915, &p); > > + > > + balancer.base.next_extension = engines.extensions; > > + engines.extensions = to_user_pointer(&balancer); > > + gem_context_set_param(i915, &p); > > + > > + munmap(ptr + 8192, 4096); > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + engines.extensions = to_user_pointer(ptr) + 8192 - 8; > > + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT); > > + > > + munmap(ptr + 4096, 4096); > > + > > + gem_context_destroy(i915, p.ctx_id); > > + free(ci); > > + } > > +} > > + > > +static void kick_kthreads(int period_us) > > +{ > > + sched_yield(); > > + usleep(period_us); > yield and sleep hm.. calling with zero period_us? Doesn't seem like it. > So what's it about? Historically yield may have been a no-op, but sleep(0) actually yielded. > > +} > > + > > +static double measure_load(int pmu, int period_us) > > +{ > > + uint64_t data[2]; > > + uint64_t d_t, d_v; > > + > > + kick_kthreads(period_us); > > + > > + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data)); > > + d_v = -data[0]; > > + d_t = -data[1]; > > + > > + usleep(period_us); > > + > > + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data)); > > + d_v += data[0]; > > + d_t += data[1]; > This -val + val trick with uint64_t works? Yes, unsigned overflow is defined.
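To spell the idiom out: igt's perf_i915_open() opens the counter with PERF_FORMAT_TOTAL_TIME_ENABLED, so each read() yields {value, time_enabled}, and with uint64_t the negate-then-accumulate computes after - before modulo 2^64. A standalone sketch:

/* Wraparound-safe delta: -before + after == after - before (mod 2^64). */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t before = UINT64_MAX - 5;	/* counter about to wrap */
	uint64_t after = 10;			/* sampled after wrapping */
	uint64_t delta;

	delta = -before;	/* 2^64 - before; well defined for unsigned */
	delta += after;		/* == after - before (mod 2^64) */

	printf("delta = %" PRIu64 "\n", delta);	/* prints 16 */
	return 0;
}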
> > > + > > + return d_v / (double)d_t; > > +} > > + > > +static double measure_min_load(int pmu, unsigned int num, int period_us) > > +{ > > + uint64_t data[2 + num]; > > + uint64_t d_t, d_v[num]; > > + uint64_t min = -1, max = 0; > > + > > + kick_kthreads(period_us); > > + > > + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data)); > > + for (unsigned int n = 0; n < num; n++) > > + d_v[n] = -data[2 + n]; > > + d_t = -data[1]; > > + > > + usleep(period_us); > > + > > + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data)); > > + > > + d_t += data[1]; > > + for (unsigned int n = 0; n < num; n++) { > > + d_v[n] += data[2 + n]; > > + igt_debug("engine[%d]: %.1f%%\n", > > + n, d_v[n] / (double)d_t * 100); > > + if (d_v[n] < min) > > + min = d_v[n]; > > + if (d_v[n] > max) > > + max = d_v[n]; > > + } > > + > > + igt_debug("elapsed: %"PRIu64"ns, load [%.1f, %.1f]%%\n", > > + d_t, min / (double)d_t * 100, max / (double)d_t * 100); > > + > > + return min / (double)d_t; > > +} > > + > > +static void check_individual_engine(int i915, > > + uint32_t ctx, > > + const struct i915_engine_class_instance *ci, > > + int idx) > > +{ > > + igt_spin_t *spin; > > + double load; > > + int pmu; > > + > > + pmu = perf_i915_open(I915_PMU_ENGINE_BUSY(ci[idx].engine_class, > > + ci[idx].engine_instance)); > > + > > + spin = igt_spin_new(i915, .ctx = ctx, .engine = idx + 1); > > + load = measure_load(pmu, 10000); > > Hm usleep before start of measuring and between two samples is the same. > The one before should be fixed I think, no? Could be, that would require thought as to what the appropriate period for kicking should be. Yay for ksoftirqd. > > + igt_spin_free(i915, spin); > > + > > + close(pmu); > > + > > + igt_assert_f(load > 0.90, > > + "engine %d (class:instance %d:%d) was found to be only %.1f%% busy\n", > > + idx, ci[idx].engine_class, ci[idx].engine_instance, load*100); > > +} > > + > > +static void individual(int i915) > > +{ > > + uint32_t ctx; > > + > > + /* > > + * I915_CONTEXT_PARAM_ENGINE allows us to index into the user > > + * supplied array from gem_execbuf(). Our check is to build the > > + * ctx->engine[] with various different engine classes, feed in > > + * a spinner and then ask pmu to confirm it the expected engine > > + * was busy. > > + */ > > + > > + ctx = gem_context_create(i915); > > + > > + for (int mask = 0; mask < 32; mask++) { > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + > > + ci = list_engines(i915, 1u << mask, &count); > > + if (!ci) > > + continue; > > + > > + igt_debug("Found %d engines of class %d\n", count, mask); > > + > > + for (int pass = 0; pass < count; pass++) { /* approx. count! */ > > + igt_permute_array(ci, count, igt_exchange_int64); > > struct i915_engine_class_instance is four bytes long, so swap func looks > wrong. Unless for some reason you want to swap in blocks of two. Don't > know. Last index would grab into random memory though. I must be missing > something or it wouldn't have worked.. Once upon a time class_instance was 2xu32. > > > + set_load_balancer(i915, ctx, ci, count); > > + for (unsigned int n = 0; n < count; n++) > > + check_individual_engine(i915, ctx, ci, n); > > + } > > + > > + free(ci); > > + } > > + > > + gem_context_destroy(i915, ctx); > > + gem_quiescent_gpu(i915); > > +} > > + > > +static void indicies(int i915) > > indices? 
> > > +{ > > + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, I915_EXEC_RING_MASK + 1); > > + struct drm_i915_gem_context_param p = { > > + .ctx_id = gem_context_create(i915), > > + .param = I915_CONTEXT_PARAM_ENGINES, > > + .value = to_user_pointer(&engines) > > + }; > > + > > + struct drm_i915_gem_exec_object2 batch = { > > + .handle = batch_create(i915), > > + }; > > + > > + unsigned int nengines = 0; > > + void *balancers = NULL; > > + > > + /* > > + * We can populate our engine map with multiple virtual engines. > > + * Do so. > > + */ > > + > > + for (int class = 0; class < 32; class++) { > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + > > + ci = list_engines(i915, 1u << class, &count); > > + if (!ci) > > + continue; > > + > > + igt_debug("Found %d engines of class %d\n", count, class); > > Maybe this debug message should go into list_engines, since it seems > repeated a few times already. Or remove the debug, I hear you. > > + > > + for (int n = 0; n < count; n++) { > > + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(*balancer, > > + count); > > + > > + engines.engines[nengines].engine_class = > > + I915_ENGINE_CLASS_INVALID; > > + engines.engines[nengines].engine_instance = > > + I915_ENGINE_CLASS_INVALID_NONE; > > + > > + balancer = calloc(sizeof(*balancer), 1); > > + igt_assert(balancer); > > + > > + balancer->base.name = > > + I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE; > > + balancer->base.next_extension = > > + to_user_pointer(balancers); > > + balancers = balancer; > > + > > + balancer->engine_index = nengines++; > > + balancer->num_siblings = count; > > + > > + memcpy(balancer->engines, > > + ci, count * sizeof(*ci)); > > + } > > + free(ci); > > + } > > + > > + igt_require(balancers); > > + engines.extensions = to_user_pointer(balancers); > > + p.size = (sizeof(struct i915_engine_class_instance) * nengines + > > + sizeof(struct i915_context_param_engines)); > > + gem_context_set_param(i915, &p); > > + > > + for (unsigned int n = 0; n < nengines; n++) { > > + struct drm_i915_gem_execbuffer2 eb = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .flags = n, > > + .rsvd1 = p.ctx_id, > > + }; > > + igt_debug("Executing on index=%d\n", n); > > + gem_execbuf(i915, &eb); > > + } > > + gem_context_destroy(i915, p.ctx_id); > > + > > + gem_sync(i915, batch.handle); > > + gem_close(i915, batch.handle); > > + > > + while (balancers) { > > + struct i915_context_engines_load_balance *b, *n; > > + > > + b = balancers; > > + n = from_user_pointer(b->base.next_extension); > > + free(b); > > + > > + balancers = n; > > + } > > + > > + gem_quiescent_gpu(i915); > > +} > > + > > +static void busy(int i915) > > +{ > > + uint32_t scratch = gem_create(i915, 4096); > > + > > + /* > > + * Check that virtual engines are reported via GEM_BUSY. > > + * > > + * When running, the batch will be on the real engine and report > > + * the actual class. > > + * > > + * Prior to running, if the load-balancer is across multiple > > + * classes we don't know which engine the batch will > > + * execute on, so we report them all! > > + * > > + * However, as we only support (and test) creating a load-balancer > > + * from engines of only one class, that can be propagated accurately > > + * through to GEM_BUSY. 
> > + */ > > + > > + for (int class = 0; class < 16; class++) { > > + struct drm_i915_gem_busy busy; > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + igt_spin_t *spin[2]; > > + uint32_t ctx; > > + > > + ci = list_engines(i915, 1u << class, &count); > > + if (!ci) > > + continue; > > + > > + igt_debug("Found %d engines of class %d\n", count, class); > > + ctx = load_balancer_create(i915, ci, count); > > + free(ci); > > + > > + spin[0] = __igt_spin_new(i915, > > + .ctx = ctx, > > + .flags = IGT_SPIN_POLL_RUN); > > + spin[1] = __igt_spin_new(i915, > > + .ctx = ctx, > > + .dependency = scratch); > > + > > + igt_spin_busywait_until_started(spin[0]); > > + > > + /* Running: actual class */ > > + busy.handle = spin[0]->handle; > > + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy); > > + igt_assert_eq_u32(busy.busy, 1u << (class + 16)); > > + > > + /* Queued(read): expected class */ > > + busy.handle = spin[1]->handle; > > + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy); > > + igt_assert_eq_u32(busy.busy, 1u << (class + 16)); > > + > > + /* Queued(write): expected class */ > > + busy.handle = scratch; > > + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy); > > + igt_assert_eq_u32(busy.busy, > > + (1u << (class + 16)) | (class + 1)); > > + > > + igt_spin_free(i915, spin[1]); > > + igt_spin_free(i915, spin[0]); > > + > > + gem_context_destroy(i915, ctx); > > + } > > + > > + gem_close(i915, scratch); > > + gem_quiescent_gpu(i915); > > +} > > + > > +static int add_pmu(int pmu, const struct i915_engine_class_instance *ci) > > +{ > > + return perf_i915_open_group(I915_PMU_ENGINE_BUSY(ci->engine_class, > > + ci->engine_instance), > > + pmu); > > +} > > + > > +static void full(int i915, unsigned int flags) > > +#define PULSE 0x1 > > +#define LATE 0x2 > > +{ > > + struct drm_i915_gem_exec_object2 batch = { > > + .handle = batch_create(i915), > > + }; > > + > > + if (flags & LATE) > > + igt_require_sw_sync(); > > + > > + /* > > + * I915_CONTEXT_PARAM_ENGINE changes the meaning of I915_EXEC_DEFAULT > > + * to provide an automatic selection from the ctx->engine[]. It > > + * employs load-balancing to evenly distribute the workload across the > > The leading section needs rewriting for truth. It is the load balance > extensions which _can_ redefine the meaning of I915_EXEC_DEFAULT etc.. > I'm sure I didn't need to explain, but have just to make it clear which > part I am complaining about. :) Hey, remember this is 2018! > > + * array. If we submit N spinners, we expect them to be simultaneously > > + * running across N engines and use PMU to confirm that the entire > > + * set of engines are busy. > > Clarify it is only if using N contexts. > > > + * > > + * We complicate matters by interspersing shortlived tasks to challenge > > + * the kernel to search for space in which to insert new batches.
> > + */ > > + > > + > > + for (int mask = 0; mask < 32; mask++) { > > + struct i915_engine_class_instance *ci; > > + igt_spin_t *spin = NULL; > > + IGT_CORK_FENCE(cork); > > + unsigned int count; > > + double load; > > + int fence = -1; > > + int *pmu; > > + > > + ci = list_engines(i915, 1u << mask, &count); > > + if (!ci) > > + continue; > > + > > + igt_debug("Found %d engines of class %d\n", count, mask); > > + > > + pmu = malloc(sizeof(*pmu) * count); > > + igt_assert(pmu); > > + > > + if (flags & LATE) > > + fence = igt_cork_plug(&cork, i915); > > + > > + pmu[0] = -1; > > + for (unsigned int n = 0; n < count; n++) { > > + uint32_t ctx; > > + > > + pmu[n] = add_pmu(pmu[0], &ci[n]); > > + > > + if (flags & PULSE) { > > + struct drm_i915_gem_execbuffer2 eb = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .rsvd2 = fence, > > + .flags = flags & LATE ? I915_EXEC_FENCE_IN : 0, > > + }; > > + gem_execbuf(i915, &eb); > > + } > > + > > + /* > > + * Each spinner needs to be on a new timeline, > > + * otherwise they will just sit in the single queue > > + * and not run concurrently. > > + */ > > + ctx = load_balancer_create(i915, ci, count); > > + > > + if (spin == NULL) { > > + spin = __igt_spin_new(i915, .ctx = ctx); > > + } else { > > + struct drm_i915_gem_execbuffer2 eb = { > > + .buffers_ptr = spin->execbuf.buffers_ptr, > > + .buffer_count = spin->execbuf.buffer_count, > > + .rsvd1 = ctx, > > + .rsvd2 = fence, > > + .flags = flags & LATE ? I915_EXEC_FENCE_IN : 0, > > + }; > > + gem_execbuf(i915, &eb); > > + } > > + > > + gem_context_destroy(i915, ctx); > > + } > > + > > + if (flags & LATE) { > > + igt_cork_unplug(&cork); > > + close(fence); > > + } > > + > > + load = measure_min_load(pmu[0], count, 10000); > > + igt_spin_free(i915, spin); > > + > > + close(pmu[0]); > > + free(pmu); > > + > > + free(ci); > > + > > + igt_assert_f(load > 0.90, > > + "minimum load for %d x class:%d was found to be only %.1f%% busy\n", > > + count, mask, load*100); > > + gem_quiescent_gpu(i915); > > + } > > + > > + gem_close(i915, batch.handle); > > + gem_quiescent_gpu(i915); > > +} > > + > > +static void nop(int i915) > > +{ > > + struct drm_i915_gem_exec_object2 batch = { > > + .handle = batch_create(i915), > > + }; > > + > > + for (int mask = 0; mask < 32; mask++) { > > s/mask/class/ > > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + uint32_t ctx; > > + > > + ci = list_engines(i915, 1u << mask, &count); > > + if (!ci) > > + continue; > > + > > + if (count < 2) { > > + free(ci); > > + continue; > > Benchmark only subtest for real veng? Sure, that's a bit of internal knowledge leaking. > > + } > > + > > + igt_debug("Found %d engines of class %d\n", count, mask); > > + ctx = load_balancer_create(i915, ci, count); > > + > > + for (int n = 0; n < count; n++) { > > + struct drm_i915_gem_execbuffer2 execbuf = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .flags = n + 1, > > + .rsvd1 = ctx, > > + }; > > + struct timespec tv = {}; > > + unsigned long nops; > > + double t; > > + > > + igt_nsec_elapsed(&tv); > > + nops = 0; > > + do { > > + for (int r = 0; r < 1024; r++) > > + gem_execbuf(i915, &execbuf); > > + nops += 1024; > > + } while (igt_seconds_elapsed(&tv) < 2); > > + gem_sync(i915, batch.handle); > > + > > + t = igt_nsec_elapsed(&tv) * 1e-3 / nops; > > + igt_info("%x:%d %.3fus\n", mask, n, t); > > Class in decimal is better I think. But it's mask :-p It's treated as just a number and not as a class identifier.
> And some descriptive labels to info messages would be good. Like > "individual engines", "virtual engine" etc. It does describe the individual engines and their composites. The output looks clear and concise. You may want mask translated to a string... but this code is oblivious as to what mask actually is. The way it is used definitely looks more like mask than class. > > + } > > + > > + { > > + struct drm_i915_gem_execbuffer2 execbuf = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .rsvd1 = ctx, > > + }; > > + struct timespec tv = {}; > > + unsigned long nops; > > + double t; > > + > > + igt_nsec_elapsed(&tv); > > + nops = 0; > > + do { > > + for (int r = 0; r < 1024; r++) > > + gem_execbuf(i915, &execbuf); > > + nops += 1024; > > + } while (igt_seconds_elapsed(&tv) < 2); > > + gem_sync(i915, batch.handle); > > + > > + t = igt_nsec_elapsed(&tv) * 1e-3 / nops; > > + igt_info("%x:* %.3fus\n", mask, t); > > + } > > + > > + > > + igt_fork(child, count) { > > + struct drm_i915_gem_execbuffer2 execbuf = { > > + .buffers_ptr = to_user_pointer(&batch), > > + .buffer_count = 1, > > + .flags = child + 1, > > + .rsvd1 = gem_context_clone(i915, ctx, > > + I915_CONTEXT_CLONE_ENGINES, 0), > > + }; > > + struct timespec tv = {}; > > + unsigned long nops; > > + double t; > > + > > + igt_nsec_elapsed(&tv); > > + nops = 0; > > + do { > > + for (int r = 0; r < 1024; r++) > > + gem_execbuf(i915, &execbuf); > > + nops += 1024; > > + } while (igt_seconds_elapsed(&tv) < 2); > > + gem_sync(i915, batch.handle); > > + > > + t = igt_nsec_elapsed(&tv) * 1e-3 / nops; > > + igt_info("[%d] %x:%d %.3fus\n", child, mask, child, t); > > + > > + memset(&tv, 0, sizeof(tv)); > > + execbuf.flags = 0; > > + > > + igt_nsec_elapsed(&tv); > > + nops = 0; > > + do { > > + for (int r = 0; r < 1024; r++) > > + gem_execbuf(i915, &execbuf); > > + nops += 1024; > > + } while (igt_seconds_elapsed(&tv) < 2); > > + gem_sync(i915, batch.handle); > > + > > + t = igt_nsec_elapsed(&tv) * 1e-3 / nops; > > + igt_info("[%d] %x:* %.3fus\n", child, mask, t); > > + > > + gem_context_destroy(i915, execbuf.rsvd1); > > + } > > + > > + igt_waitchildren(); > > + > > + gem_context_destroy(i915, ctx); > > + free(ci); > > + } > > + > > + gem_close(i915, batch.handle); > > + gem_quiescent_gpu(i915); > > +} > > + > > +static void ping(int i915, uint32_t ctx, unsigned int engine) > > +{ > > + struct drm_i915_gem_exec_object2 obj = { > > + .handle = batch_create(i915), > > + }; > > + struct drm_i915_gem_execbuffer2 execbuf = { > > + .buffers_ptr = to_user_pointer(&obj), > > + .buffer_count = 1, > > + .flags = engine, > > + .rsvd1 = ctx, > > + }; > > + gem_execbuf(i915, &execbuf); > > + gem_sync(i915, obj.handle); > > + gem_close(i915, obj.handle); > > +} > > + > > +static void semaphore(int i915) > > +{ > > + uint32_t block[2], scratch; > > + igt_spin_t *spin[3]; > > + > > + /* > > + * If we are using HW semaphores to launch serialised requests > > + * on different engine concurrently, we want to verify that real > > + * work is unimpeded. > > + */ > > + igt_require(gem_scheduler_has_preemption(i915)); > > + > > + block[0] = gem_context_create(i915); > > + block[1] = gem_context_create(i915); > > + > > + scratch = gem_create(i915, 4096); > > + spin[2] = igt_spin_new(i915, .dependency = scratch); > > + for (int mask = 1; mask < 32; mask++) { > > s/mask/class/ throughout. 
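Circling back to the igt_permute_array() exchange in individual(): since i915_engine_class_instance is now 2x u16, a width-correct swap works on 4-byte elements, e.g. (hypothetical helper name, sketch only):

/* Width-correct exchange for the 4-byte class:instance elements. */
static void exchange_class_instance(void *array, unsigned i, unsigned j)
{
	struct i915_engine_class_instance *ci = array, tmp;

	tmp = ci[i];
	ci[i] = ci[j];
	ci[j] = tmp;
}

Passed as igt_permute_array(ci, count, exchange_class_instance), this stays inside the allocation, whereas igt_exchange_int64 swaps 8-byte strides and reads past the final element, as noted above.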
> > > + struct i915_engine_class_instance *ci; > > + unsigned int count; > > + uint32_t vip; > > + > > + ci = list_engines(i915, 1u << mask, &count); > > + if (!ci) > > + continue; > > + > > + if (count < ARRAY_SIZE(block)) > > + continue; > > + > > + /* Ensure that we completely occupy all engines in this group */ > > + count = ARRAY_SIZE(block); > > + > > + for (int i = 0; i < count; i++) { > > + set_load_balancer(i915, block[i], ci, count); > > + spin[i] = __igt_spin_new(i915, > > + .ctx = block[i], > > + .dependency = scratch); > > Alignment. > > > + } > > + > > + /* > > + * Either we haven't blocked both engines with semaphores, > > + * or we let the vip through. If not, we hang. > > + */ > > + vip = gem_context_create(i915); > > + set_load_balancer(i915, vip, ci, count); > > + ping(i915, vip, 0); > > + gem_context_destroy(i915, vip); > > + > > + for (int i = 0; i < count; i++) > > + igt_spin_free(i915, spin[i]); > > + > > + free(ci); > > + } > > + igt_spin_free(i915, spin[2]); > > + gem_close(i915, scratch); > > + > > + gem_context_destroy(i915, block[1]); > > + gem_context_destroy(i915, block[0]); > > + > > + gem_quiescent_gpu(i915); > > +} > > + > > +static void smoketest(int i915, int timeout) > > +{ > > + struct drm_i915_gem_exec_object2 batch[2] = { > > + { .handle = __batch_create(i915, 16380) } > > + }; > > + unsigned int ncontext = 0; > > + uint32_t *contexts = NULL; > > + uint32_t *handles = NULL; > > + > > + igt_require_sw_sync(); > > + > > + for (int mask = 0; mask < 32; mask++) { > > + struct i915_engine_class_instance *ci; > > + unsigned int count = 0; > > + > > + ci = list_engines(i915, 1u << mask, &count); > > + if (!ci || count < 2) { > > + free(ci); > > + continue; > > + } > > + > > + igt_debug("Found %d engines of class %d\n", count, mask); > > + > > + ncontext += 128; > > + contexts = realloc(contexts, sizeof(*contexts) * ncontext); > > + igt_assert(contexts); > > + > > + for (unsigned int n = ncontext - 128; n < ncontext; n++) { > > + contexts[n] = load_balancer_create(i915, ci, count); > > + igt_assert(contexts[n]); > > + } > > + > > + free(ci); > > + } > > + igt_debug("Created %d virtual engines (one per context)\n", ncontext); > > + igt_require(ncontext); > > + > > + contexts = realloc(contexts, sizeof(*contexts) * ncontext * 4); > > + igt_assert(contexts); > > + memcpy(contexts + ncontext, contexts, ncontext * sizeof(*contexts)); > > + ncontext *= 2; > > + memcpy(contexts + ncontext, contexts, ncontext * sizeof(*contexts)); > > + ncontext *= 2; > > + > > + handles = malloc(sizeof(*handles) * ncontext); > > + igt_assert(handles); > > + for (unsigned int n = 0; n < ncontext; n++) > > + handles[n] = gem_create(i915, 4096); > > + > > + igt_until_timeout(timeout) { > > + unsigned int count = 1 + (rand() % (ncontext - 1)); > > + IGT_CORK_FENCE(cork); > > + int fence = igt_cork_plug(&cork, i915); > > + > > + for (unsigned int n = 0; n < count; n++) { > > + struct drm_i915_gem_execbuffer2 eb = { > > + .buffers_ptr = to_user_pointer(batch), > > + .buffer_count = ARRAY_SIZE(batch), > > + .rsvd1 = contexts[n], > > + .rsvd2 = fence, > > + .flags = I915_EXEC_BATCH_FIRST | I915_EXEC_FENCE_IN, > > + }; > > + batch[1].handle = handles[n]; > > + gem_execbuf(i915, &eb); > > + } > > + igt_permute_array(handles, count, igt_exchange_int); > > + > > + igt_cork_unplug(&cork); > > + for (unsigned int n = 0; n < count; n++) > > + gem_sync(i915, handles[n]); > > + > > + close(fence); > > + } > > + > > + for (unsigned int n = 0; n < ncontext; n++) { > > + gem_close(i915, handles[n]); > 
> + __gem_context_destroy(i915, contexts[n]); > > + } > > + free(handles); > > + free(contexts); > > + gem_close(i915, batch[0].handle); > > +} > > + > > +static bool has_context_engines(int i915) > > +{ > > + struct drm_i915_gem_context_param p = { > > + .param = I915_CONTEXT_PARAM_ENGINES, > > + }; > > + > > + return __gem_context_set_param(i915, &p) == 0; > > +} > > + > > +static bool has_load_balancer(int i915) > > +{ > > + struct i915_engine_class_instance ci = {}; > > + uint32_t ctx; > > + int err; > > + > > + ctx = gem_context_create(i915); > > + err = __set_load_balancer(i915, ctx, &ci, 1); > > + gem_context_destroy(i915, ctx); > > + > > + return err == 0; > > +} > > + > > +igt_main > > +{ > > + int i915 = -1; > > + > > + igt_skip_on_simulation(); > > + > > + igt_fixture { > > + i915 = drm_open_driver(DRIVER_INTEL); > > + igt_require_gem(i915); > > + > > + gem_require_contexts(i915); > > + igt_require(has_context_engines(i915)); > > + igt_require(has_load_balancer(i915)); > > + > > + igt_fork_hang_detector(i915); > > + } > > + > > + igt_subtest("invalid-balancer") > > + invalid_balancer(i915); > > + > > + igt_subtest("individual") > > + individual(i915); > > + > > + igt_subtest("indicies") > > + indicies(i915); > > + > > + igt_subtest("busy") > > + busy(i915); > > + > > + igt_subtest_group { > > + static const struct { > > + const char *name; > > + unsigned int flags; > > + } phases[] = { > > + { "", 0 }, > > + { "-pulse", PULSE }, > > + { "-late", LATE }, > > + { "-late-pulse", PULSE | LATE }, > > + { } > > + }; > > + for (typeof(*phases) *p = phases; p->name; p++) > > + igt_subtest_f("full%s", p->name) > > + full(i915, p->flags); > > + } > > + > > + igt_subtest("nop") > > + nop(i915); > > + > > + igt_subtest("semaphore") > > + semaphore(i915); > > + > > + igt_subtest("smoke") > > + smoketest(i915, 20); > > + > > + igt_fixture { > > + igt_stop_hang_detector(); > > + } > > +} > > diff --git a/tests/meson.build b/tests/meson.build > > index 7e0089e74..eeea3611d 100644 > > --- a/tests/meson.build > > +++ b/tests/meson.build > > @@ -288,6 +288,13 @@ test_executables += executable('gem_eio', > > install : true) > > test_list += 'gem_eio' > > > > +test_executables += executable('gem_exec_balancer', 'i915/gem_exec_balancer.c', > > + dependencies : test_deps + [ lib_igt_perf ], > > + install_dir : libexecdir, > > + install_rpath : libexecdir_rpathdir, > > + install : true) > > +test_progs += 'gem_exec_balancer' > > + > > test_executables += executable('gem_mocs_settings', > > join_paths('i915', 'gem_mocs_settings.c'), > > dependencies : test_deps + [ lib_igt_perf ], > > > > Regards, > > Tvrtko