On 10/05/2019 14:48, Chris Wilson wrote:
Quoting Tvrtko Ursulin (2019-05-08 13:10:54)From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> For simulating frame split workloads it is useful to express a batch which ends at the same time as the parallel submission on the respective bonded engine. For this we add support for infinite batch durations and the batch terminate command ('T'). Syntax looks like this: 1.RCS.*.0.0 T.-1 First step starts an infinite batch, and second command terminates the infinite batch with the usual relative workload step addressing. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- benchmarks/gem_wsim.c | 119 +++++++++++++++++++------ benchmarks/wsim/README | 9 +- benchmarks/wsim/frame-split-60fps.wsim | 6 +- 3 files changed, 102 insertions(+), 32 deletions(-) diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c index cc6f4a742c12..97821b723b02 100644 --- a/benchmarks/gem_wsim.c +++ b/benchmarks/gem_wsim.c @@ -86,6 +86,7 @@ enum w_type ENGINE_MAP, LOAD_BALANCE, BOND, + TERMINATE, };struct deps@@ -113,6 +114,7 @@ struct w_step unsigned int context; unsigned int engine; struct duration duration; + bool unbound_duration; struct deps data_deps; struct deps fence_deps; int emit_fence; @@ -143,7 +145,7 @@ struct w_stepstruct drm_i915_gem_execbuffer2 eb;struct drm_i915_gem_exec_object2 *obj; - struct drm_i915_gem_relocation_entry reloc[4]; + struct drm_i915_gem_relocation_entry reloc[5]; unsigned long bb_sz; uint32_t bb_handle; uint32_t *seqno_value; @@ -153,6 +155,7 @@ struct w_step uint32_t *rt1_address; uint32_t *latch_value; uint32_t *latch_address; + uint32_t *recursive_bb_start; };DECLARE_EWMA(uint64_t, rt, 4, 2)@@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)step.type = ENGINE_MAP;goto add_step; + } else if (!strcmp(field, "T")) { + int_field(TERMINATE, target, + tmp >= 0 || ((int)nr_steps + tmp) < 0, + "Invalid terminate target at step %u!\n"); } else if (!strcmp(field, "X")) { unsigned int nr = 0; while ((field = strtok_r(fstart, ".", &fctx))) { @@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)fstart = NULL; - tmpl = strtol(field, &sep, 10);- check_arg(tmpl <= 0 || tmpl == LONG_MIN || - tmpl == LONG_MAX, - "Invalid duration at step %u!\n", nr_steps); - step.duration.min = tmpl; - - if (sep && *sep == '-') { - tmpl = strtol(sep + 1, NULL, 10); - check_arg(tmpl <= 0 || - tmpl <= step.duration.min || - tmpl == LONG_MIN || + if (field[0] == '*') { + step.unbound_duration = true; + } else { + tmpl = strtol(field, &sep, 10); + check_arg(tmpl <= 0 || tmpl == LONG_MIN || tmpl == LONG_MAX, - "Invalid duration range at step %u!\n", + "Invalid duration at step %u!\n", nr_steps); - step.duration.max = tmpl; - } else { - step.duration.max = step.duration.min; + step.duration.min = tmpl; + + if (sep && *sep == '-') { + tmpl = strtol(sep + 1, NULL, 10); + check_arg(tmpl <= 0 || + tmpl <= step.duration.min || + tmpl == LONG_MIN || + tmpl == LONG_MAX, + "Invalid duration range at step %u!\n", + nr_steps); + step.duration.max = tmpl; + } else { + step.duration.max = step.duration.min; + } }valid++;@@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags) unsigned int i; uint32_t *ptr;- if (!arb_period)+ if (w->unbound_duration || !arb_period) return;gem_set_domain(fd, w->bb_handle,@@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags) const uint32_t bbe = 0xa << 23; unsigned long mmap_start, mmap_len; unsigned long batch_start = w->bb_sz; + unsigned int r = 0; uint32_t *ptr, *cs;igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));@@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags) if (flags & RT) batch_start -= 12 * sizeof(uint32_t);+ if (w->unbound_duration)+ batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */ + mmap_start = rounddown(batch_start, PAGE_SIZE); mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);@@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags)ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE); cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);+ if (w->unbound_duration) {+ w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t); + batch_start += 4 * sizeof(uint32_t); + + *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP; + w->recursive_bb_start = cs; + *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; + *cs++ = 0; + *cs++ = 0;Hmm. Have we previously checked for gen >= 8?
No, will add.
So preemption check interval is given by batch_start - mmap_start. Which is limited to a max of 64 bytes. That might be a bit excessive on the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad more forgiving i.e. it has more bw and doesn't starve the cpu as much.
Nope, mmap_start is not controlling the batch buffer at all. It is just to find the calculated batch_start given that mmap() was given a round-down PAGE_ALIGN start address. Actual preemption check interval is one MI_NOOP. /o\ How much would you recommend to be safe?
+ } + if (flags & SEQNO) { - w->reloc[0].offset = batch_start + sizeof(uint32_t); + w->reloc[r++].offset = batch_start + sizeof(uint32_t); batch_start += 4 * sizeof(uint32_t);*cs++ = MI_STORE_DWORD_IMM;@@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags) }if (flags & RT) {- w->reloc[1].offset = batch_start + sizeof(uint32_t); + w->reloc[r++].offset = batch_start + sizeof(uint32_t); batch_start += 4 * sizeof(uint32_t);*cs++ = MI_STORE_DWORD_IMM;@@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags) w->rt0_value = cs; *cs++ = 0;- w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);+ w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t); batch_start += 4 * sizeof(uint32_t);*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */@@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags) *cs++ = 0; *cs++ = 0;- w->reloc[3].offset = batch_start + sizeof(uint32_t);+ w->reloc[r++].offset = batch_start + sizeof(uint32_t); batch_start += 4 * sizeof(uint32_t);*cs++ = MI_STORE_DWORD_IMM;@@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags) } }- w->bb_sz = get_bb_sz(w->duration.max);- w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz); + if (w->unbound_duration) + /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */ + w->bb_sz = max(64, get_bb_sz(w->preempt_us)) + + (1 + 3) * sizeof(uint32_t); + else + w->bb_sz = get_bb_sz(w->duration.max); + w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0)); init_bb(w, flags); terminate_bb(w, flags);- if (flags & SEQNO) {+ if ((flags & SEQNO) || w->unbound_duration) { w->obj[j].relocs_ptr = to_user_pointer(&w->reloc); + if (flags & SEQNO) + w->obj[j].relocation_count++; if (flags & RT) - w->obj[j].relocation_count = 4; - else - w->obj[j].relocation_count = 1; + w->obj[j].relocation_count += 3; + if (w->unbound_duration) + w->obj[j].relocation_count++;Huh, I expected to see w->obj[j].relocation_count = r; Already out of scope?
In a helper yes. Under danger that I got confused about what's what, I think I could make the helper return the count.
for (i = 0; i < w->obj[j].relocation_count; i++) w->reloc[i].target_handle = 1; + if (w->unbound_duration) + w->reloc[0].target_handle = j;Ok, recursive BB_START.}w->eb.buffers_ptr = to_user_pointer(w->obj);@@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno) } }+static void+update_bb_start(struct w_step *w) +{ + if (!w->unbound_duration) + return; + + gem_set_domain(fd, w->bb_handle, + I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);Hmm. A scary sync point. Do you just want to be sure you have flushed the previous user?
Yes. By definition one infinite batch runs max once per frame. If it has been terminated the code needs to re-instate the BB_START, so I thought I need to be sure it is not executing before I do that. I guess if someone forgot to terminate it this would hang on second loop. But I think that's better than just carrying on with a potentially no-op instead of infinite batch.
+ *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1; +} + static void w_sync_to(struct workload *wrk, struct w_step *w, int target) { if (target < 0) @@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine, if (flags & RT) update_bb_rt(w, engine, seqno);+ update_bb_start(w);+ w->eb.batch_start_offset = + w->unbound_duration ? + 0 : ALIGN(w->bb_sz - get_bb_sz(get_duration(w)), - 2 * sizeof(uint32_t)); + 2 * sizeof(uint32_t));for (i = 0; i < w->fence_deps.nr; i++) {int tgt = w->idx + w->fence_deps.list[i]; @@ -2313,6 +2365,17 @@ static void *run_workload(void *data) w->priority; } continue; + } else if (w->type == TERMINATE) { + unsigned int t_idx = i + w->target; + + igt_assert(t_idx >= 0 && t_idx < i); + igt_assert(wrk->steps[t_idx].type == BATCH); + igt_assert(wrk->steps[t_idx].unbound_duration); + + *wrk->steps[t_idx].recursive_bb_start = + MI_BATCH_BUFFER_END; + __sync_synchronize(); + continue; } else if (w->type == PREEMPTION || w->type == ENGINE_MAP || w->type == LOAD_BALANCE || diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README index 6aec718bc812..c94d01018419 100644 --- a/benchmarks/wsim/README +++ b/benchmarks/wsim/README @@ -2,11 +2,11 @@ Workload descriptor format ==========================ctx.engine.duration_us.dependency.wait,...-<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,... +<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,... B.<uint> M.<uint>.<str>[|<str>]... P|X.<uint>.<int> -d|p|s|t|q|a.<int>,... +d|p|s|t|q|a|T.<int>,... b.<uint>.<uint>.<str> f@@ -30,6 +30,7 @@ Additional workload steps are also supported:'b' - Set up engine bonds. 'M' - Set up engine map. 'P' - Context priority. + 'T' - Terminate an infinite batch. 'X' - Context preemption control.Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS@@ -77,6 +78,10 @@ Example:I this case the last step has a data dependency on both first and second steps. +Batch durations can also be specified as infinite by using the '*' in the+duration field. Such batches must be ended by the terminate command ('T') +otherwise they will cause a GPU hang to be reported. + Sync (fd) fences ----------------diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsimindex cfbfcd39be7d..ea89da3add48 100644 --- a/benchmarks/wsim/frame-split-60fps.wsim +++ b/benchmarks/wsim/frame-split-60fps.wsim @@ -6,10 +6,12 @@ M.2.VCS2 B.2 b.2.1.VCS1 f -1.DEFAULT.4000-6000.f-1.0 +1.DEFAULT.*.f-1.0 2.DEFAULT.4000-6000.s-1.0 a.-3 -3.RCS.2000-4000.-3/-2.0 +s.-2 +T.-4 +3.RCS.2000-4000.-5/-4.0 3.VECS.2000.-1.0 4.BCS.1000.-1.0 s.-2Usecase looks reasonable. Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Thanks, Tvrtko _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx