Re: [igt-dev] [PATCH i-g-t 17/21] gem_wsim: Infinite batch support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 10/05/2019 14:48, Chris Wilson wrote:
Quoting Tvrtko Ursulin (2019-05-08 13:10:54)
From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

For simulating frame split workloads it is useful to express a batch which
ends at the same time as the parallel submission on the respective bonded
engine. For this we add support for infinite batch durations and the batch
terminate command ('T'). Syntax looks like this:

   1.RCS.*.0.0
   T.-1

First step starts an infinite batch, and second command terminates the
infinite batch with the usual relative workload step addressing.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
  benchmarks/gem_wsim.c                  | 119 +++++++++++++++++++------
  benchmarks/wsim/README                 |   9 +-
  benchmarks/wsim/frame-split-60fps.wsim |   6 +-
  3 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index cc6f4a742c12..97821b723b02 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -86,6 +86,7 @@ enum w_type
         ENGINE_MAP,
         LOAD_BALANCE,
         BOND,
+       TERMINATE,
  };
struct deps
@@ -113,6 +114,7 @@ struct w_step
         unsigned int context;
         unsigned int engine;
         struct duration duration;
+       bool unbound_duration;
         struct deps data_deps;
         struct deps fence_deps;
         int emit_fence;
@@ -143,7 +145,7 @@ struct w_step
struct drm_i915_gem_execbuffer2 eb;
         struct drm_i915_gem_exec_object2 *obj;
-       struct drm_i915_gem_relocation_entry reloc[4];
+       struct drm_i915_gem_relocation_entry reloc[5];
         unsigned long bb_sz;
         uint32_t bb_handle;
         uint32_t *seqno_value;
@@ -153,6 +155,7 @@ struct w_step
         uint32_t *rt1_address;
         uint32_t *latch_value;
         uint32_t *latch_address;
+       uint32_t *recursive_bb_start;
  };
DECLARE_EWMA(uint64_t, rt, 4, 2)
@@ -491,6 +494,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
step.type = ENGINE_MAP;
                                 goto add_step;
+                       } else if (!strcmp(field, "T")) {
+                               int_field(TERMINATE, target,
+                                         tmp >= 0 || ((int)nr_steps + tmp) < 0,
+                                         "Invalid terminate target at step %u!\n");
                         } else if (!strcmp(field, "X")) {
                                 unsigned int nr = 0;
                                 while ((field = strtok_r(fstart, ".", &fctx))) {
@@ -605,23 +612,28 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
fstart = NULL; - tmpl = strtol(field, &sep, 10);
-                       check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
-                                 tmpl == LONG_MAX,
-                                 "Invalid duration at step %u!\n", nr_steps);
-                       step.duration.min = tmpl;
-
-                       if (sep && *sep == '-') {
-                               tmpl = strtol(sep + 1, NULL, 10);
-                               check_arg(tmpl <= 0 ||
-                                         tmpl <= step.duration.min ||
-                                         tmpl == LONG_MIN ||
+                       if (field[0] == '*') {
+                               step.unbound_duration = true;
+                       } else {
+                               tmpl = strtol(field, &sep, 10);
+                               check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
                                           tmpl == LONG_MAX,
-                                         "Invalid duration range at step %u!\n",
+                                         "Invalid duration at step %u!\n",
                                           nr_steps);
-                               step.duration.max = tmpl;
-                       } else {
-                               step.duration.max = step.duration.min;
+                               step.duration.min = tmpl;
+
+                               if (sep && *sep == '-') {
+                                       tmpl = strtol(sep + 1, NULL, 10);
+                                       check_arg(tmpl <= 0 ||
+                                               tmpl <= step.duration.min ||
+                                               tmpl == LONG_MIN ||
+                                               tmpl == LONG_MAX,
+                                               "Invalid duration range at step %u!\n",
+                                               nr_steps);
+                                       step.duration.max = tmpl;
+                               } else {
+                                       step.duration.max = step.duration.min;
+                               }
                         }
valid++;
@@ -781,7 +793,7 @@ init_bb(struct w_step *w, unsigned int flags)
         unsigned int i;
         uint32_t *ptr;
- if (!arb_period)
+       if (w->unbound_duration || !arb_period)
                 return;
gem_set_domain(fd, w->bb_handle,
@@ -801,6 +813,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
         const uint32_t bbe = 0xa << 23;
         unsigned long mmap_start, mmap_len;
         unsigned long batch_start = w->bb_sz;
+       unsigned int r = 0;
         uint32_t *ptr, *cs;
igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
@@ -811,6 +824,9 @@ terminate_bb(struct w_step *w, unsigned int flags)
         if (flags & RT)
                 batch_start -= 12 * sizeof(uint32_t);
+ if (w->unbound_duration)
+               batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
+
         mmap_start = rounddown(batch_start, PAGE_SIZE);
         mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
@@ -820,8 +836,19 @@ terminate_bb(struct w_step *w, unsigned int flags)
         ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
         cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
+ if (w->unbound_duration) {
+               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
+               batch_start += 4 * sizeof(uint32_t);
+
+               *cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
+               w->recursive_bb_start = cs;
+               *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
+               *cs++ = 0;
+               *cs++ = 0;

Hmm. Have we previously checked for gen >= 8?

No, will add.

So preemption check interval is given by batch_start - mmap_start.
Which is limited to a max of 64 bytes. That might be a bit excessive on
the frequency of doing MI_BB_START, certainly for gen7, gen8+ is a tad
more forgiving i.e. it has more bw and doesn't starve the cpu as much.

Nope, mmap_start is not controlling the batch buffer at all. It is just to find the calculated batch_start given that mmap() was given a round-down PAGE_ALIGN start address. Actual preemption check interval is one MI_NOOP. /o\ How much would you recommend to be safe?

+       }
+
         if (flags & SEQNO) {
-               w->reloc[0].offset = batch_start + sizeof(uint32_t);
+               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
                 batch_start += 4 * sizeof(uint32_t);
*cs++ = MI_STORE_DWORD_IMM;
@@ -833,7 +860,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
         }
if (flags & RT) {
-               w->reloc[1].offset = batch_start + sizeof(uint32_t);
+               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
                 batch_start += 4 * sizeof(uint32_t);
*cs++ = MI_STORE_DWORD_IMM;
@@ -843,7 +870,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
                 w->rt0_value = cs;
                 *cs++ = 0;
- w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
+               w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
                 batch_start += 4 * sizeof(uint32_t);
*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
@@ -852,7 +879,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
                 *cs++ = 0;
                 *cs++ = 0;
- w->reloc[3].offset = batch_start + sizeof(uint32_t);
+               w->reloc[r++].offset = batch_start + sizeof(uint32_t);
                 batch_start += 4 * sizeof(uint32_t);
*cs++ = MI_STORE_DWORD_IMM;
@@ -984,19 +1011,28 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
                 }
         }
- w->bb_sz = get_bb_sz(w->duration.max);
-       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
+       if (w->unbound_duration)
+               /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
+               w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
+                          (1 + 3) * sizeof(uint32_t);
+       else
+               w->bb_sz = get_bb_sz(w->duration.max);
+       w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
         init_bb(w, flags);
         terminate_bb(w, flags);
- if (flags & SEQNO) {
+       if ((flags & SEQNO) || w->unbound_duration) {
                 w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
+               if (flags & SEQNO)
+                       w->obj[j].relocation_count++;
                 if (flags & RT)
-                       w->obj[j].relocation_count = 4;
-               else
-                       w->obj[j].relocation_count = 1;
+                       w->obj[j].relocation_count += 3;
+               if (w->unbound_duration)
+                       w->obj[j].relocation_count++;

Huh, I expected to see w->obj[j].relocation_count = r;
Already out of scope?

In a helper yes. Under danger that I got confused about what's what, I think I could make the helper return the count.


                 for (i = 0; i < w->obj[j].relocation_count; i++)
                         w->reloc[i].target_handle = 1;
+               if (w->unbound_duration)
+                       w->reloc[0].target_handle = j;

Ok, recursive BB_START.
         }
w->eb.buffers_ptr = to_user_pointer(w->obj);
@@ -2036,6 +2072,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
         }
  }
+static void
+update_bb_start(struct w_step *w)
+{
+       if (!w->unbound_duration)
+               return;
+
+       gem_set_domain(fd, w->bb_handle,
+                      I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);

Hmm. A scary sync point. Do you just want to be sure you have flushed
the previous user?

Yes. By definition one infinite batch runs max once per frame. If it has been terminated the code needs to re-instate the BB_START, so I thought I need to be sure it is not executing before I do that. I guess if someone forgot to terminate it this would hang on second loop. But I think that's better than just carrying on with a potentially no-op instead of infinite batch.


+       *w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
+}
+
  static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
  {
         if (target < 0)
@@ -2171,9 +2219,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
         if (flags & RT)
                 update_bb_rt(w, engine, seqno);
+ update_bb_start(w);
+
         w->eb.batch_start_offset =
+               w->unbound_duration ?
+               0 :
                 ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
-                       2 * sizeof(uint32_t));
+                     2 * sizeof(uint32_t));
for (i = 0; i < w->fence_deps.nr; i++) {
                 int tgt = w->idx + w->fence_deps.list[i];
@@ -2313,6 +2365,17 @@ static void *run_workload(void *data)
                                                                     w->priority;
                                 }
                                 continue;
+                       } else if (w->type == TERMINATE) {
+                               unsigned int t_idx = i + w->target;
+
+                               igt_assert(t_idx >= 0 && t_idx < i);
+                               igt_assert(wrk->steps[t_idx].type == BATCH);
+                               igt_assert(wrk->steps[t_idx].unbound_duration);
+
+                               *wrk->steps[t_idx].recursive_bb_start =
+                                       MI_BATCH_BUFFER_END;
+                               __sync_synchronize();
+                               continue;
                         } else if (w->type == PREEMPTION ||
                                    w->type == ENGINE_MAP ||
                                    w->type == LOAD_BALANCE ||
diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
index 6aec718bc812..c94d01018419 100644
--- a/benchmarks/wsim/README
+++ b/benchmarks/wsim/README
@@ -2,11 +2,11 @@ Workload descriptor format
  ==========================
ctx.engine.duration_us.dependency.wait,...
-<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
+<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
  B.<uint>
  M.<uint>.<str>[|<str>]...
  P|X.<uint>.<int>
-d|p|s|t|q|a.<int>,...
+d|p|s|t|q|a|T.<int>,...
  b.<uint>.<uint>.<str>
  f
@@ -30,6 +30,7 @@ Additional workload steps are also supported:
   'b' - Set up engine bonds.
   'M' - Set up engine map.
   'P' - Context priority.
+ 'T' - Terminate an infinite batch.
   'X' - Context preemption control.
Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
@@ -77,6 +78,10 @@ Example:
I this case the last step has a data dependency on both first and second steps. +Batch durations can also be specified as infinite by using the '*' in the
+duration field. Such batches must be ended by the terminate command ('T')
+otherwise they will cause a GPU hang to be reported.
+
  Sync (fd) fences
  ----------------
diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
index cfbfcd39be7d..ea89da3add48 100644
--- a/benchmarks/wsim/frame-split-60fps.wsim
+++ b/benchmarks/wsim/frame-split-60fps.wsim
@@ -6,10 +6,12 @@ M.2.VCS2
  B.2
  b.2.1.VCS1
  f
-1.DEFAULT.4000-6000.f-1.0
+1.DEFAULT.*.f-1.0
  2.DEFAULT.4000-6000.s-1.0
  a.-3
-3.RCS.2000-4000.-3/-2.0
+s.-2
+T.-4
+3.RCS.2000-4000.-5/-4.0
  3.VECS.2000.-1.0
  4.BCS.1000.-1.0
  s.-2

Usecase looks reasonable.

Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>

Thanks,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux