Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Measure the baseline latency between contexts in order to directly > compare that with the additional cost of preemption. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > tests/i915/gem_exec_latency.c | 230 ++++++++++++++++++++++++++++++++++ > 1 file changed, 230 insertions(+) > > diff --git a/tests/i915/gem_exec_latency.c b/tests/i915/gem_exec_latency.c > index e56d62780..e88fbbc6a 100644 > --- a/tests/i915/gem_exec_latency.c > +++ b/tests/i915/gem_exec_latency.c > @@ -410,6 +410,86 @@ static void latency_from_ring(int fd, > } > } > > +static void execution_latency(int i915, unsigned int ring, const char *name) > +{ > + struct drm_i915_gem_exec_object2 obj = { > + .handle = gem_create(i915, 4095), > + }; > + struct drm_i915_gem_execbuffer2 execbuf = { > + .buffers_ptr = to_user_pointer(&obj), > + .buffer_count = 1, > + .flags = ring | LOCAL_I915_EXEC_NO_RELOC | LOCAL_I915_EXEC_HANDLE_LUT, > + }; > + const unsigned int mmio_base = 0x2000; > + const unsigned int cs_timestamp = mmio_base + 0x358; > + volatile uint32_t *timestamp; > + uint32_t *cs, *result; > + > + timestamp = > + (volatile uint32_t *)((volatile char *)igt_global_mmio + cs_timestamp); > + > + obj.handle = gem_create(i915, 4096); > + obj.flags = EXEC_OBJECT_PINNED; > + result = gem_mmap__wc(i915, obj.handle, 0, 4096, PROT_WRITE); > + > + for (int i = 0; i < 16; i++) { > + cs = result + 16 * i; > + *cs++ = 0x24 << 23 | 2; /* SRM */ > + *cs++ = cs_timestamp; > + *cs++ = 4096 - 16 * 4 + i * 4; > + *cs++ = 0; > + *cs++ = 0xa << 23; Why not MI_BATCH_BUFFER_END? To emphasize that we have multiple batches inside a bo? > + } > + > + cs = result + 1024 - 16; > + > + for (int length = 2; length <= 16; length <<= 1) { > + struct igt_mean submit, batch, total; > + int last = length - 1; > + > + igt_mean_init(&submit); > + igt_mean_init(&batch); > + igt_mean_init(&total); > + > + igt_until_timeout(2) { > + uint32_t now, end; > + > + cs[last] = 0; > + > + now = *timestamp; > + for (int i = 0; i < length; i++) { > + execbuf.batch_start_offset = 64 * i; > + gem_execbuf(i915, &execbuf); > + } > + while (!((volatile uint32_t *)cs)[last]) > + ; > + end = *timestamp; > + > + igt_mean_add(&submit, (cs[0] - now) * rcs_clock); > + igt_mean_add(&batch, (cs[last] - cs[0]) * rcs_clock / last); Just curious of what do you use of inter batch latency? Oh and do we need to to take the rcs_clock resolution into account on result calculation. Prolly not as it seems to be ticking fast enough for 0.1us accuracy. > + igt_mean_add(&total, (end - now) * rcs_clock); > + } > + > + igt_info("%sx%d Submission latency: %.2f±%.2fus\n", > + name, length, > + 1e-3 * igt_mean_get(&submit), > + 1e-3 * sqrt(igt_mean_get_variance(&submit))); > + > + igt_info("%sx%d Inter-batch latency: %.2f±%.2fus\n", > + name, length, > + 1e-3 * igt_mean_get(&batch), > + 1e-3 * sqrt(igt_mean_get_variance(&batch))); > + > + igt_info("%sx%d End-to-end latency: %.2f±%.2fus\n", > + name, length, > + 1e-3 * igt_mean_get(&total), > + 1e-3 * sqrt(igt_mean_get_variance(&total))); > + } > + > + munmap(result, 4096); > + gem_close(i915, obj.handle); > +} > + > static void > __submit_spin(int fd, igt_spin_t *spin, unsigned int flags) > { > @@ -616,6 +696,142 @@ rthog_latency_on_ring(int fd, unsigned int engine, const char *name, unsigned in > munmap(results, MMAP_SZ); > } > > +static void context_switch(int i915, > + unsigned int engine, const char *name, > + unsigned int flags) > +{ > + struct drm_i915_gem_exec_object2 obj[2]; > + struct drm_i915_gem_relocation_entry reloc[5]; > + struct drm_i915_gem_execbuffer2 eb; > + uint32_t *cs, *bbe, *results, v; > + unsigned int mmio_base; > + struct igt_mean mean; > + uint32_t ctx[2]; > + > + /* XXX i915_query()! */ > + switch (engine) { > + case I915_EXEC_DEFAULT: > + case I915_EXEC_RENDER: > + mmio_base = 0x2000; > + break; > +#if 0 > + case I915_EXEC_BSD: > + mmio_base = 0x12000; > + break; > +#endif It seems to be there according to bspec. Timestamps don't work with it? > + case I915_EXEC_BLT: > + mmio_base = 0x22000; > + break; > + > + case I915_EXEC_VEBOX: > + if (intel_gen(intel_get_drm_devid(i915)) >= 11) > + mmio_base = 0x1d8000; > + else > + mmio_base = 0x1a000; > + break; > + > + default: > + igt_skip("mmio base not known\n"); > + } > + > + for (int i = 0; i < ARRAY_SIZE(ctx); i++) > + ctx[i] = gem_context_create(i915); > + > + if (flags & PREEMPT) { > + gem_context_set_priority(i915, ctx[0], -1023); > + gem_context_set_priority(i915, ctx[1], +1023); > + } > + > + memset(obj, 0, sizeof(obj)); > + obj[0].handle = gem_create(i915, 4096); > + gem_set_caching(i915, obj[0].handle, 1); > + results = gem_mmap__cpu(i915, obj[0].handle, 0, 4096, PROT_READ); > + gem_set_domain(i915, obj[0].handle, I915_GEM_DOMAIN_CPU, 0); > + > + obj[1].handle = gem_create(i915, 4096); > + memset(reloc,0, sizeof(reloc)); > + obj[1].relocation_count = ARRAY_SIZE(reloc); > + obj[1].relocs_ptr = to_user_pointer(reloc); > + bbe = gem_mmap__wc(i915, obj[1].handle, 0, 4096, PROT_WRITE); > + gem_set_domain(i915, obj[1].handle, > + I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC); > + > + cs = bbe; > + *cs++ = 0x5 << 23; /* ARB_CHK */ or even better MI_ARB_CHK; > + *cs++ = 0x24 << 23 | 2; /* SRM */ > + *cs++ = mmio_base + 0x358; /* TIMESTAMP */ > + reloc[0].target_handle = obj[0].handle; > + reloc[0].offset = (cs - bbe) * sizeof(*cs); > + *cs++ = 0; > + *cs++ = 0; > + *cs++ = MI_BATCH_BUFFER_START | 1; > + reloc[1].target_handle = obj[1].handle; > + reloc[1].offset = (cs - bbe) * sizeof(*cs); > + *cs++ = 0; > + *cs++ = 0; > + > + cs = bbe + 64; > + *cs++ = 0x24 << 23 | 2; /* SRM */ > + *cs++ = mmio_base + 0x358; /* TIMESTAMP */ > + reloc[2].target_handle = obj[0].handle; > + reloc[2].offset = (cs - bbe) * sizeof(*cs); > + *cs++ = reloc[2].delta = 4; > + *cs++ = 0; > + *cs++ = 0x29 << 23 | 2; /* LRM */ > + *cs++ = mmio_base + 0x600; /* GPR0 */ > + reloc[3].target_handle = obj[0].handle; > + reloc[3].offset = (cs - bbe) * sizeof(*cs); > + *cs++ = 0; > + *cs++ = 0; > + *cs++ = 0x24 << 23 | 2; /* SRM */ > + *cs++ = mmio_base + 0x600; /* GPR0 */ > + reloc[4].target_handle = obj[0].handle; > + reloc[4].offset = (cs - bbe) * sizeof(*cs); > + *cs++ = reloc[4].delta = 8; > + *cs++ = 0; > + *cs++ = 0xa << 23; > + > + memset(&eb, 0, sizeof(eb)); > + eb.buffers_ptr = to_user_pointer(obj); > + eb.buffer_count = ARRAY_SIZE(obj); > + eb.flags = engine; > + eb.flags |= LOCAL_I915_EXEC_NO_RELOC; > + > + v = 0; > + igt_mean_init(&mean); > + igt_until_timeout(5) { > + eb.rsvd1 = ctx[0]; > + eb.batch_start_offset = 0; > + gem_execbuf(i915, &eb); > + > + while (results[0] == v) > + igt_assert(gem_bo_busy(i915, obj[1].handle)); Is there a race there? Object will be busy short time after the result appears. > + > + eb.rsvd1 = ctx[1]; > + eb.batch_start_offset = 64 * sizeof(*cs); > + gem_execbuf(i915, &eb); > + > + *bbe = 0xa << 23; __sync_syncronize or are you relying on the corresponding syscalls? -Mika > + gem_sync(i915, obj[1].handle); > + *bbe = 0x5 << 23; > + > + v = results[0]; > + igt_mean_add(&mean, (results[1] - results[2]) * rcs_clock); > + } > + igt_info("%s context switch latency%s: %.2f±%.2fus\n", > + name, flags & PREEMPT ? " (preempt)" : "", > + 1e-3 * igt_mean_get(&mean), > + 1e-3 * sqrt(igt_mean_get_variance(&mean))); > + munmap(results, 4096); > + munmap(bbe, 4096); > + > + for (int i = 0; i < ARRAY_SIZE(obj); i++) > + gem_close(i915, obj[i].handle); > + > + for (int i = 0; i < ARRAY_SIZE(ctx); i++) > + gem_context_destroy(i915, ctx[i]); > +} > + > static double clockrate(int i915, int reg) > { > volatile uint32_t *mmio; > @@ -722,6 +938,11 @@ igt_main > e->name, > 0); > > + igt_subtest_f("%s-execution-latency", e->name) > + execution_latency(device, > + e->exec_id | e->flags, > + e->name); > + > igt_subtest_f("%s-live-dispatch-queued", e->name) > latency_on_ring(device, > e->exec_id | e->flags, > @@ -741,12 +962,21 @@ igt_main > e->exec_id | e->flags, > e->name, CORK); > > + igt_subtest_f("%s-cs", e->name) > + context_switch(device, > + e->exec_id | e->flags, > + e->name, 0); > igt_subtest_group { > igt_fixture { > gem_require_contexts(device); > igt_require(gem_scheduler_has_preemption(device)); > } > > + igt_subtest_f("%s-cs-preempt", e->name) > + context_switch(device, > + e->exec_id | e->flags, > + e->name, PREEMPT); > + > igt_subtest_f("%s-preemption", e->name) > latency_from_ring(device, > e->exec_id | e->flags, > -- > 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx