Quoting Tvrtko Ursulin (2019-05-08 13:10:54) > From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > > For simulating frame split workloads it is useful to express a batch which > ends at the same time as the parallel submission on the respective bonded > engine. For this we add support for infinite batch durations and the batch > terminate command ('T'). Syntax looks like this: > > 1.RCS.*.0.0 > T.-1 > > First step starts an infinite batch, and second command terminates the > infinite batch with the usual relative workload step addressing. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > benchmarks/gem_wsim.c | 119 +++++++++++++++++++------ > benchmarks/wsim/README | 9 +- > benchmarks/wsim/frame-split-60fps.wsim | 6 +- > 3 files changed, 102 insertions(+), 32 deletions(-) > > diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c > index cc6f4a742c12..97821b723b02 100644 > --- a/benchmarks/gem_wsim.c > +++ b/benchmarks/gem_wsim.c > @@ -86,6 +86,7 @@ enum w_type > ENGINE_MAP, > LOAD_BALANCE, > BOND, > + TERMINATE, > }; > > struct deps > @@ -113,6 +114,7 @@ struct w_step > unsigned int context; > unsigned int engine; > struct duration duration; > + bool unbound_duration; > struct deps data_deps; > struct deps fence_deps; > int emit_fence; > @@ -143,7 +145,7 @@ struct w_step > > struct drm_i915_gem_execbuffer2 eb; > struct drm_i915_gem_exec_object2 *obj; > - struct drm_i915_gem_relocation_entry reloc[4]; > + struct drm_i915_gem_relocation_entry reloc[5]; > unsigned long bb_sz; > uint32_t bb_handle; > uint32_t *seqno_value; > @@ -153,6 +155,7 @@ struct w_step > uint32_t *rt1_address; > uint32_t *latch_value; > uint32_t *latch_address; > + uint32_t *recursive_bb_start; > }; > > DECLARE_EWMA(uint64_t, rt, 4, 2) > @@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w) > > step.type = ENGINE_MAP; > goto add_step; > + } else if (!strcmp(field, "T")) { > + int_field(TERMINATE, target, > + tmp >= 0 || ((int)nr_steps + tmp) < 0, > + "Invalid terminate target at step %u!\n"); > } else if (!strcmp(field, "X")) { > unsigned int nr = 0; > while ((field = strtok_r(fstart, ".", &fctx))) { > @@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w) > > fstart = NULL; > > - tmpl = strtol(field, &sep, 10); > - check_arg(tmpl <= 0 || tmpl == LONG_MIN || > - tmpl == LONG_MAX, > - "Invalid duration at step %u!\n", nr_steps); > - step.duration.min = tmpl; > - > - if (sep && *sep == '-') { > - tmpl = strtol(sep + 1, NULL, 10); > - check_arg(tmpl <= 0 || > - tmpl <= step.duration.min || > - tmpl == LONG_MIN || > + if (field[0] == '*') { > + step.unbound_duration = true; > + } else { > + tmpl = strtol(field, &sep, 10); > + check_arg(tmpl <= 0 || tmpl == LONG_MIN || > tmpl == LONG_MAX, > - "Invalid duration range at step %u!\n", > + "Invalid duration at step %u!\n", > nr_steps); > - step.duration.max = tmpl; > - } else { > - step.duration.max = step.duration.min; > + step.duration.min = tmpl; > + > + if (sep && *sep == '-') { > + tmpl = strtol(sep + 1, NULL, 10); > + check_arg(tmpl <= 0 || > + tmpl <= step.duration.min || > + tmpl == LONG_MIN || > + tmpl == LONG_MAX, > + "Invalid duration range at step %u!\n", > + nr_steps); > + step.duration.max = tmpl; > + } else { > + step.duration.max = step.duration.min; > + } > } > > valid++; > @@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags) > unsigned int i; > uint32_t *ptr; > > - if (!arb_period) > + if (w->unbound_duration || !arb_period) > return; > > gem_set_domain(fd, w->bb_handle, > @@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags) > const uint32_t bbe = 0xa << 23; > unsigned long mmap_start, mmap_len; > unsigned long batch_start = w->bb_sz; > + unsigned int r = 0; > uint32_t *ptr, *cs; > > igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT)); > @@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags) > if (flags & RT) > batch_start -= 12 * sizeof(uint32_t); > > + if (w->unbound_duration) > + batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */ > + > mmap_start = rounddown(batch_start, PAGE_SIZE); > mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE); > > @@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags) > ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE); > cs = (uint32_t *)((char *)ptr + batch_start - mmap_start); > > + if (w->unbound_duration) { > + w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t); > + batch_start += 4 * sizeof(uint32_t); > + > + *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP; > + w->recursive_bb_start = cs; > + *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; > + *cs++ = 0; > + *cs++ = 0; Hmm. Have we previously checked for gen >= 8? So preemption check interval is given by batch_start - mmap_start. Which is limited to a max of 64 bytes. That might be a bit excessive on the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad more forgiving i.e. it has more bw and doesn't starve the cpu as much. > + } > + > if (flags & SEQNO) { > - w->reloc[0].offset = batch_start + sizeof(uint32_t); > + w->reloc[r++].offset = batch_start + sizeof(uint32_t); > batch_start += 4 * sizeof(uint32_t); > > *cs++ = MI_STORE_DWORD_IMM; > @@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags) > } > > if (flags & RT) { > - w->reloc[1].offset = batch_start + sizeof(uint32_t); > + w->reloc[r++].offset = batch_start + sizeof(uint32_t); > batch_start += 4 * sizeof(uint32_t); > > *cs++ = MI_STORE_DWORD_IMM; > @@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags) > w->rt0_value = cs; > *cs++ = 0; > > - w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t); > + w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t); > batch_start += 4 * sizeof(uint32_t); > > *cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */ > @@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags) > *cs++ = 0; > *cs++ = 0; > > - w->reloc[3].offset = batch_start + sizeof(uint32_t); > + w->reloc[r++].offset = batch_start + sizeof(uint32_t); > batch_start += 4 * sizeof(uint32_t); > > *cs++ = MI_STORE_DWORD_IMM; > @@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags) > } > } > > - w->bb_sz = get_bb_sz(w->duration.max); > - w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz); > + if (w->unbound_duration) > + /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */ > + w->bb_sz = max(64, get_bb_sz(w->preempt_us)) + > + (1 + 3) * sizeof(uint32_t); > + else > + w->bb_sz = get_bb_sz(w->duration.max); > + w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0)); > init_bb(w, flags); > terminate_bb(w, flags); > > - if (flags & SEQNO) { > + if ((flags & SEQNO) || w->unbound_duration) { > w->obj[j].relocs_ptr = to_user_pointer(&w->reloc); > + if (flags & SEQNO) > + w->obj[j].relocation_count++; > if (flags & RT) > - w->obj[j].relocation_count = 4; > - else > - w->obj[j].relocation_count = 1; > + w->obj[j].relocation_count += 3; > + if (w->unbound_duration) > + w->obj[j].relocation_count++; Huh, I expected to see w->obj[j].relocation_count = r; Already out of scope? > for (i = 0; i < w->obj[j].relocation_count; i++) > w->reloc[i].target_handle = 1; > + if (w->unbound_duration) > + w->reloc[0].target_handle = j; Ok, recursive BB_START. > } > > w->eb.buffers_ptr = to_user_pointer(w->obj); > @@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno) > } > } > > +static void > +update_bb_start(struct w_step *w) > +{ > + if (!w->unbound_duration) > + return; > + > + gem_set_domain(fd, w->bb_handle, > + I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC); Hmm. A scary sync point. Do you just want to be sure you have flushed the previous user? > + *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1; > +} > + > static void w_sync_to(struct workload *wrk, struct w_step *w, int target) > { > if (target < 0) > @@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine, > if (flags & RT) > update_bb_rt(w, engine, seqno); > > + update_bb_start(w); > + > w->eb.batch_start_offset = > + w->unbound_duration ? > + 0 : > ALIGN(w->bb_sz - get_bb_sz(get_duration(w)), > - 2 * sizeof(uint32_t)); > + 2 * sizeof(uint32_t)); > > for (i = 0; i < w->fence_deps.nr; i++) { > int tgt = w->idx + w->fence_deps.list[i]; > @@ -2313,6 +2365,17 @@ static void *run_workload(void *data) > w->priority; > } > continue; > + } else if (w->type == TERMINATE) { > + unsigned int t_idx = i + w->target; > + > + igt_assert(t_idx >= 0 && t_idx < i); > + igt_assert(wrk->steps[t_idx].type == BATCH); > + igt_assert(wrk->steps[t_idx].unbound_duration); > + > + *wrk->steps[t_idx].recursive_bb_start = > + MI_BATCH_BUFFER_END; > + __sync_synchronize(); > + continue; > } else if (w->type == PREEMPTION || > w->type == ENGINE_MAP || > w->type == LOAD_BALANCE || > diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README > index 6aec718bc812..c94d01018419 100644 > --- a/benchmarks/wsim/README > +++ b/benchmarks/wsim/README > @@ -2,11 +2,11 @@ Workload descriptor format > ========================== > > ctx.engine.duration_us.dependency.wait,... > -<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,... > +<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,... > B.<uint> > M.<uint>.<str>[|<str>]... > P|X.<uint>.<int> > -d|p|s|t|q|a.<int>,... > +d|p|s|t|q|a|T.<int>,... > b.<uint>.<uint>.<str> > f > > @@ -30,6 +30,7 @@ Additional workload steps are also supported: > 'b' - Set up engine bonds. > 'M' - Set up engine map. > 'P' - Context priority. > + 'T' - Terminate an infinite batch. > 'X' - Context preemption control. > > Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS > @@ -77,6 +78,10 @@ Example: > > I this case the last step has a data dependency on both first and second steps. > > +Batch durations can also be specified as infinite by using the '*' in the > +duration field. Such batches must be ended by the terminate command ('T') > +otherwise they will cause a GPU hang to be reported. > + > Sync (fd) fences > ---------------- > > diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim > index cfbfcd39be7d..ea89da3add48 100644 > --- a/benchmarks/wsim/frame-split-60fps.wsim > +++ b/benchmarks/wsim/frame-split-60fps.wsim > @@ -6,10 +6,12 @@ M.2.VCS2 > B.2 > b.2.1.VCS1 > f > -1.DEFAULT.4000-6000.f-1.0 > +1.DEFAULT.*.f-1.0 > 2.DEFAULT.4000-6000.s-1.0 > a.-3 > -3.RCS.2000-4000.-3/-2.0 > +s.-2 > +T.-4 > +3.RCS.2000-4000.-5/-4.0 > 3.VECS.2000.-1.0 > 4.BCS.1000.-1.0 > s.-2 Usecase looks reasonable. Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx