Re: [PATCH i-g-t 1/2] i915/gem_exec_latency: Measure the latency of context switching

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> · Thu, 06 Jun 2019 17:16:50 +0100

Quoting Mika Kuoppala (2019-06-05 16:04:09)
> Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes:
> 
> > Measure the baseline latency between contexts in order to directly
> > compare that with the additional cost of preemption.
> >
> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> > ---
> >  tests/i915/gem_exec_latency.c | 230 ++++++++++++++++++++++++++++++++++
> >  1 file changed, 230 insertions(+)
> >
> > diff --git a/tests/i915/gem_exec_latency.c b/tests/i915/gem_exec_latency.c
> > index e56d62780..e88fbbc6a 100644
> > --- a/tests/i915/gem_exec_latency.c
> > +++ b/tests/i915/gem_exec_latency.c
> > @@ -410,6 +410,86 @@ static void latency_from_ring(int fd,
> >       }
> >  }
> >  
> > +static void execution_latency(int i915, unsigned int ring, const char *name)
> > +{
> > +     struct drm_i915_gem_exec_object2 obj = {
> > +             .handle = gem_create(i915, 4095),
> > +     };
> > +     struct drm_i915_gem_execbuffer2 execbuf = {
> > +             .buffers_ptr = to_user_pointer(&obj),
> > +             .buffer_count = 1,
> > +             .flags = ring | LOCAL_I915_EXEC_NO_RELOC | LOCAL_I915_EXEC_HANDLE_LUT,
> > +     };
> > +     const unsigned int mmio_base = 0x2000;
> > +     const unsigned int cs_timestamp = mmio_base + 0x358;
> > +     volatile uint32_t *timestamp;
> > +     uint32_t *cs, *result;
> > +
> > +     timestamp =
> > +             (volatile uint32_t *)((volatile char *)igt_global_mmio + cs_timestamp);
> > +
> > +     obj.handle = gem_create(i915, 4096);
> > +     obj.flags = EXEC_OBJECT_PINNED;
> > +     result = gem_mmap__wc(i915, obj.handle, 0, 4096, PROT_WRITE);
> > +
> > +     for (int i = 0; i < 16; i++) {
> > +             cs = result + 16 * i;
> > +             *cs++ = 0x24 << 23 | 2; /* SRM */
> > +             *cs++ = cs_timestamp;
> > +             *cs++ = 4096 - 16 * 4 + i * 4;
> > +             *cs++ = 0;
> > +             *cs++ = 0xa << 23;
> 
> Why not MI_BATCH_BUFFER_END? To emphasize that we have
> multiple batches inside a bo?

I wrote MI_BATCH_BUFFER_END... Oh, I see, no, I was just using the
values for no other reason than habit.

> > +     }
> > +
> > +     cs = result + 1024 - 16;
> > +
> > +     for (int length = 2; length <= 16; length <<= 1) {
> > +             struct igt_mean submit, batch, total;
> > +             int last = length - 1;
> > +
> > +             igt_mean_init(&submit);
> > +             igt_mean_init(&batch);
> > +             igt_mean_init(&total);
> > +
> > +             igt_until_timeout(2) {
> > +                     uint32_t now, end;
> > +
> > +                     cs[last] = 0;
> > +
> > +                     now = *timestamp;
> > +                     for (int i = 0; i < length; i++) {
> > +                             execbuf.batch_start_offset = 64 * i;
> > +                             gem_execbuf(i915, &execbuf);
> > +                     }
> > +                     while (!((volatile uint32_t *)cs)[last])
> > +                             ;
> > +                     end = *timestamp;
> > +
> > +                     igt_mean_add(&submit, (cs[0] - now) * rcs_clock);
> > +                     igt_mean_add(&batch, (cs[last] - cs[0]) * rcs_clock / last);
> 
> Just curious of what do you use of inter batch latency?

EPARSE.

> Oh and do we need to to take the rcs_clock resolution into account
> on result calculation. Prolly not as it seems to be ticking fast enough
> for 0.1us accuracy.

It just provides the lower granularity to results. It doesn't alter the
calculations or presentation, just that quantum beneath which we cannot
measure.

Considering that the CS cannot go faster than its clock, it doesn't
affect anything.

> > +                     igt_mean_add(&total, (end - now) * rcs_clock);
> > +             }
> > +
> > +             igt_info("%sx%d Submission latency: %.2f±%.2fus\n",
> > +                      name, length,
> > +                      1e-3 * igt_mean_get(&submit),
> > +                      1e-3 * sqrt(igt_mean_get_variance(&submit)));
> > +
> > +             igt_info("%sx%d Inter-batch latency: %.2f±%.2fus\n",
> > +                      name, length,
> > +                      1e-3 * igt_mean_get(&batch),
> > +                      1e-3 * sqrt(igt_mean_get_variance(&batch)));
> > +
> > +             igt_info("%sx%d End-to-end latency: %.2f±%.2fus\n",
> > +                      name, length,
> > +                      1e-3 * igt_mean_get(&total),
> > +                      1e-3 * sqrt(igt_mean_get_variance(&total)));
> > +     }
> > +
> > +     munmap(result, 4096);
> > +     gem_close(i915, obj.handle);
> > +}
> > +
> >  static void
> >  __submit_spin(int fd, igt_spin_t *spin, unsigned int flags)
> >  {
> > @@ -616,6 +696,142 @@ rthog_latency_on_ring(int fd, unsigned int engine, const char *name, unsigned in
> >       munmap(results, MMAP_SZ);
> >  }
> >  
> > +static void context_switch(int i915,
> > +                        unsigned int engine, const char *name,
> > +                        unsigned int flags)
> > +{
> > +     struct drm_i915_gem_exec_object2 obj[2];
> > +     struct drm_i915_gem_relocation_entry reloc[5];
> > +     struct drm_i915_gem_execbuffer2 eb;
> > +     uint32_t *cs, *bbe, *results, v;
> > +     unsigned int mmio_base;
> > +     struct igt_mean mean;
> > +     uint32_t ctx[2];
> > +
> > +     /* XXX i915_query()! */
> > +     switch (engine) {
> > +     case I915_EXEC_DEFAULT:
> > +     case I915_EXEC_RENDER:
> > +             mmio_base = 0x2000;
> > +             break;
> > +#if 0
> > +     case I915_EXEC_BSD:
> > +             mmio_base = 0x12000;
> > +             break;
> > +#endif
> 
> It seems to be there according to bspec. Timestamps don't
> work with it?

Engine assignment is random. Old uapi is a mess.

> 
> > +     case I915_EXEC_BLT:
> > +             mmio_base = 0x22000;
> > +             break;
> > +
> > +     case I915_EXEC_VEBOX:
> > +             if (intel_gen(intel_get_drm_devid(i915)) >= 11)
> > +                     mmio_base = 0x1d8000;
> > +             else
> > +                     mmio_base = 0x1a000;
> > +             break;
> > +
> > +     default:
> > +             igt_skip("mmio base not known\n");
> > +     }
> > +
> > +     for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> > +             ctx[i] = gem_context_create(i915);
> > +
> > +     if (flags & PREEMPT) {
> > +             gem_context_set_priority(i915, ctx[0], -1023);
> > +             gem_context_set_priority(i915, ctx[1], +1023);
> > +     }
> > +
> > +     memset(obj, 0, sizeof(obj));
> > +     obj[0].handle = gem_create(i915, 4096);
> > +     gem_set_caching(i915, obj[0].handle, 1);
> > +     results = gem_mmap__cpu(i915, obj[0].handle, 0, 4096, PROT_READ);
> > +     gem_set_domain(i915, obj[0].handle, I915_GEM_DOMAIN_CPU, 0);
> > +
> > +     obj[1].handle = gem_create(i915, 4096);
> > +     memset(reloc,0, sizeof(reloc));
> > +     obj[1].relocation_count = ARRAY_SIZE(reloc);
> > +     obj[1].relocs_ptr = to_user_pointer(reloc);
> > +     bbe = gem_mmap__wc(i915, obj[1].handle, 0, 4096, PROT_WRITE);
> > +     gem_set_domain(i915, obj[1].handle,
> > +                    I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
> > +
> > +     cs = bbe;
> > +     *cs++ = 0x5 << 23;
> 
> /* ARB_CHK */
> or even better MI_ARB_CHK;

Who doesn't know these commands?

> 
> > +     *cs++ = 0x24 << 23 | 2; /* SRM */
> > +     *cs++ = mmio_base + 0x358; /* TIMESTAMP */
> > +     reloc[0].target_handle = obj[0].handle;
> > +     reloc[0].offset = (cs - bbe) * sizeof(*cs);
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +     *cs++ = MI_BATCH_BUFFER_START | 1;
> > +     reloc[1].target_handle = obj[1].handle;
> > +     reloc[1].offset = (cs - bbe) * sizeof(*cs);
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +
> > +     cs = bbe + 64;
> > +     *cs++ = 0x24 << 23 | 2; /* SRM */
> > +     *cs++ = mmio_base + 0x358; /* TIMESTAMP */
> > +     reloc[2].target_handle = obj[0].handle;
> > +     reloc[2].offset = (cs - bbe) * sizeof(*cs);
> > +     *cs++ = reloc[2].delta = 4;
> > +     *cs++ = 0;
> > +     *cs++ = 0x29 << 23 | 2; /* LRM */
> > +     *cs++ = mmio_base + 0x600; /* GPR0 */
> > +     reloc[3].target_handle = obj[0].handle;
> > +     reloc[3].offset = (cs - bbe) * sizeof(*cs);
> > +     *cs++ = 0;
> > +     *cs++ = 0;
> > +     *cs++ = 0x24 << 23 | 2; /* SRM */
> > +     *cs++ = mmio_base + 0x600; /* GPR0 */
> > +     reloc[4].target_handle = obj[0].handle;
> > +     reloc[4].offset = (cs - bbe) * sizeof(*cs);
> > +     *cs++ = reloc[4].delta = 8;
> > +     *cs++ = 0;
> > +     *cs++ = 0xa << 23;
> > +
> > +     memset(&eb, 0, sizeof(eb));
> > +     eb.buffers_ptr = to_user_pointer(obj);
> > +     eb.buffer_count = ARRAY_SIZE(obj);
> > +     eb.flags = engine;
> > +     eb.flags |= LOCAL_I915_EXEC_NO_RELOC;
> > +
> > +     v = 0;
> > +     igt_mean_init(&mean);
> > +     igt_until_timeout(5) {
> > +             eb.rsvd1 = ctx[0];
> > +             eb.batch_start_offset = 0;
> > +             gem_execbuf(i915, &eb);
> > +
> > +             while (results[0] == v)
> > +                     igt_assert(gem_bo_busy(i915, obj[1].handle));
> 
> Is there a race there? Object will be busy short time
> after the result appears.

No. Object will be busy before results and until a short time after --
unless there is a gpu hang, in which case the results will never appear,
but the object will no longer be busy.

> 
> > +
> > +             eb.rsvd1 = ctx[1];
> > +             eb.batch_start_offset = 64 * sizeof(*cs);
> > +             gem_execbuf(i915, &eb);
> > +
> > +             *bbe = 0xa << 23;
> 
> __sync_syncronize or are you relying on the corresponding
> syscalls?

It is not timing critical, so a lazy wcb flush is fine.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx