Re: [igt-dev] [PATCH i-g-t 17/21] gem_wsim: Infinite batch support

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> · Fri, 10 May 2019 14:48:51 +0100

Quoting Tvrtko Ursulin (2019-05-08 13:10:54)
> From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> 
> For simulating frame split workloads it is useful to express a batch which
> ends at the same time as the parallel submission on the respective bonded
> engine. For this we add support for infinite batch durations and the batch
> terminate command ('T'). Syntax looks like this:
> 
>   1.RCS.*.0.0
>   T.-1
> 
> First step starts an infinite batch, and second command terminates the
> infinite batch with the usual relative workload step addressing.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> ---
>  benchmarks/gem_wsim.c                  | 119 +++++++++++++++++++------
>  benchmarks/wsim/README                 |   9 +-
>  benchmarks/wsim/frame-split-60fps.wsim |   6 +-
>  3 files changed, 102 insertions(+), 32 deletions(-)
> 
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index cc6f4a742c12..97821b723b02 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -86,6 +86,7 @@ enum w_type
>         ENGINE_MAP,
>         LOAD_BALANCE,
>         BOND,
> +       TERMINATE,
>  };
>  
>  struct deps
> @@ -113,6 +114,7 @@ struct w_step
>         unsigned int context;
>         unsigned int engine;
>         struct duration duration;
> +       bool unbound_duration;
>         struct deps data_deps;
>         struct deps fence_deps;
>         int emit_fence;
> @@ -143,7 +145,7 @@ struct w_step
>  
>         struct drm_i915_gem_execbuffer2 eb;
>         struct drm_i915_gem_exec_object2 *obj;
> -       struct drm_i915_gem_relocation_entry reloc[4];
> +       struct drm_i915_gem_relocation_entry reloc[5];
>         unsigned long bb_sz;
>         uint32_t bb_handle;
>         uint32_t *seqno_value;
> @@ -153,6 +155,7 @@ struct w_step
>         uint32_t *rt1_address;
>         uint32_t *latch_value;
>         uint32_t *latch_address;
> +       uint32_t *recursive_bb_start;
>  };
>  
>  DECLARE_EWMA(uint64_t, rt, 4, 2)
> @@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>  
>                                 step.type = ENGINE_MAP;
>                                 goto add_step;
> +                       } else if (!strcmp(field, "T")) {
> +                               int_field(TERMINATE, target,
> +                                         tmp >= 0 || ((int)nr_steps + tmp) < 0,
> +                                         "Invalid terminate target at step %u!\n");
>                         } else if (!strcmp(field, "X")) {
>                                 unsigned int nr = 0;
>                                 while ((field = strtok_r(fstart, ".", &fctx))) {
> @@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
>  
>                         fstart = NULL;
>  
> -                       tmpl = strtol(field, &sep, 10);
> -                       check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
> -                                 tmpl == LONG_MAX,
> -                                 "Invalid duration at step %u!\n", nr_steps);
> -                       step.duration.min = tmpl;
> -
> -                       if (sep && *sep == '-') {
> -                               tmpl = strtol(sep + 1, NULL, 10);
> -                               check_arg(tmpl <= 0 ||
> -                                         tmpl <= step.duration.min ||
> -                                         tmpl == LONG_MIN ||
> +                       if (field[0] == '*') {
> +                               step.unbound_duration = true;
> +                       } else {
> +                               tmpl = strtol(field, &sep, 10);
> +                               check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
>                                           tmpl == LONG_MAX,
> -                                         "Invalid duration range at step %u!\n",
> +                                         "Invalid duration at step %u!\n",
>                                           nr_steps);
> -                               step.duration.max = tmpl;
> -                       } else {
> -                               step.duration.max = step.duration.min;
> +                               step.duration.min = tmpl;
> +
> +                               if (sep && *sep == '-') {
> +                                       tmpl = strtol(sep + 1, NULL, 10);
> +                                       check_arg(tmpl <= 0 ||
> +                                               tmpl <= step.duration.min ||
> +                                               tmpl == LONG_MIN ||
> +                                               tmpl == LONG_MAX,
> +                                               "Invalid duration range at step %u!\n",
> +                                               nr_steps);
> +                                       step.duration.max = tmpl;
> +                               } else {
> +                                       step.duration.max = step.duration.min;
> +                               }
>                         }
>  
>                         valid++;
> @@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags)
>         unsigned int i;
>         uint32_t *ptr;
>  
> -       if (!arb_period)
> +       if (w->unbound_duration || !arb_period)
>                 return;
>  
>         gem_set_domain(fd, w->bb_handle,
> @@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         const uint32_t bbe = 0xa << 23;
>         unsigned long mmap_start, mmap_len;
>         unsigned long batch_start = w->bb_sz;
> +       unsigned int r = 0;
>         uint32_t *ptr, *cs;
>  
>         igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
> @@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         if (flags & RT)
>                 batch_start -= 12 * sizeof(uint32_t);
>  
> +       if (w->unbound_duration)
> +               batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
> +
>         mmap_start = rounddown(batch_start, PAGE_SIZE);
>         mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
>  
> @@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
>         cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
>  
> +       if (w->unbound_duration) {
> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
> +               batch_start += 4 * sizeof(uint32_t);
> +
> +               *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
> +               w->recursive_bb_start = cs;
> +               *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
> +               *cs++ = 0;
> +               *cs++ = 0;

Hmm. Have we previously checked for gen >= 8?

So preemption check interval is given by batch_start - mmap_start.
Which is limited to a max of 64 bytes. That might be a bit excessive on
the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad
more forgiving i.e. it has more bw and doesn't starve the cpu as much.

> +       }
> +
>         if (flags & SEQNO) {
> -               w->reloc[0].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>         }
>  
>         if (flags & RT) {
> -               w->reloc[1].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>                 w->rt0_value = cs;
>                 *cs++ = 0;
>  
> -               w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
> @@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
>                 *cs++ = 0;
>                 *cs++ = 0;
>  
> -               w->reloc[3].offset = batch_start + sizeof(uint32_t);
> +               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
>                 batch_start += 4 * sizeof(uint32_t);
>  
>                 *cs++ = MI_STORE_DWORD_IMM;
> @@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
>                 }
>         }
>  
> -       w->bb_sz = get_bb_sz(w->duration.max);
> -       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
> +       if (w->unbound_duration)
> +               /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
> +               w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
> +                          (1 + 3) * sizeof(uint32_t);
> +       else
> +               w->bb_sz = get_bb_sz(w->duration.max);
> +       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
>         init_bb(w, flags);
>         terminate_bb(w, flags);
>  
> -       if (flags & SEQNO) {
> +       if ((flags & SEQNO) || w->unbound_duration) {
>                 w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
> +               if (flags & SEQNO)
> +                       w->obj[j].relocation_count++;
>                 if (flags & RT)
> -                       w->obj[j].relocation_count = 4;
> -               else
> -                       w->obj[j].relocation_count = 1;
> +                       w->obj[j].relocation_count += 3;
> +               if (w->unbound_duration)
> +                       w->obj[j].relocation_count++;

Huh, I expected to see w->obj[j].relocation_count = r;
Already out of scope?

>                 for (i = 0; i < w->obj[j].relocation_count; i++)
>                         w->reloc[i].target_handle = 1;
> +               if (w->unbound_duration)
> +                       w->reloc[0].target_handle = j;

Ok, recursive BB_START.
>         }
>  
>         w->eb.buffers_ptr = to_user_pointer(w->obj);
> @@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
>         }
>  }
>  
> +static void
> +update_bb_start(struct w_step *w)
> +{
> +       if (!w->unbound_duration)
> +               return;
> +
> +       gem_set_domain(fd, w->bb_handle,
> +                      I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);

Hmm. A scary sync point. Do you just want to be sure you have flushed
the previous user?

> +       *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
> +}
> +
>  static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
>  {
>         if (target < 0)
> @@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
>         if (flags & RT)
>                 update_bb_rt(w, engine, seqno);
>  
> +       update_bb_start(w);
> +
>         w->eb.batch_start_offset =
> +               w->unbound_duration ?
> +               0 :
>                 ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
> -                       2 * sizeof(uint32_t));
> +                     2 * sizeof(uint32_t));
>  
>         for (i = 0; i < w->fence_deps.nr; i++) {
>                 int tgt = w->idx + w->fence_deps.list[i];
> @@ -2313,6 +2365,17 @@ static void *run_workload(void *data)
>                                                                     w->priority;
>                                 }
>                                 continue;
> +                       } else if (w->type == TERMINATE) {
> +                               unsigned int t_idx = i + w->target;
> +
> +                               igt_assert(t_idx >= 0 && t_idx < i);
> +                               igt_assert(wrk->steps[t_idx].type == BATCH);
> +                               igt_assert(wrk->steps[t_idx].unbound_duration);
> +
> +                               *wrk->steps[t_idx].recursive_bb_start =
> +                                       MI_BATCH_BUFFER_END;
> +                               __sync_synchronize();
> +                               continue;
>                         } else if (w->type == PREEMPTION ||
>                                    w->type == ENGINE_MAP ||
>                                    w->type == LOAD_BALANCE ||
> diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
> index 6aec718bc812..c94d01018419 100644
> --- a/benchmarks/wsim/README
> +++ b/benchmarks/wsim/README
> @@ -2,11 +2,11 @@ Workload descriptor format
>  ==========================
>  
>  ctx.engine.duration_us.dependency.wait,...
> -<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
> +<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
>  B.<uint>
>  M.<uint>.<str>[|<str>]...
>  P|X.<uint>.<int>
> -d|p|s|t|q|a.<int>,...
> +d|p|s|t|q|a|T.<int>,...
>  b.<uint>.<uint>.<str>
>  f
>  
> @@ -30,6 +30,7 @@ Additional workload steps are also supported:
>   'b' - Set up engine bonds.
>   'M' - Set up engine map.
>   'P' - Context priority.
> + 'T' - Terminate an infinite batch.
>   'X' - Context preemption control.
>  
>  Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
> @@ -77,6 +78,10 @@ Example:
>  
>  I this case the last step has a data dependency on both first and second steps.
>  
> +Batch durations can also be specified as infinite by using the '*' in the
> +duration field. Such batches must be ended by the terminate command ('T')
> +otherwise they will cause a GPU hang to be reported.
> +
>  Sync (fd) fences
>  ----------------
>  
> diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
> index cfbfcd39be7d..ea89da3add48 100644
> --- a/benchmarks/wsim/frame-split-60fps.wsim
> +++ b/benchmarks/wsim/frame-split-60fps.wsim
> @@ -6,10 +6,12 @@ M.2.VCS2
>  B.2
>  b.2.1.VCS1
>  f
> -1.DEFAULT.4000-6000.f-1.0
> +1.DEFAULT.*.f-1.0
>  2.DEFAULT.4000-6000.s-1.0
>  a.-3
> -3.RCS.2000-4000.-3/-2.0
> +s.-2
> +T.-4
> +3.RCS.2000-4000.-5/-4.0
>  3.VECS.2000.-1.0
>  4.BCS.1000.-1.0
>  s.-2

Usecase looks reasonable.

Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx