[PATCH i-g-t 18/27] gem_wsim: Infinite batch support

Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx> · Mon, 20 May 2019 15:47:30 +0100

From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

For simulating frame split workloads it is useful to express a batch which
ends at the same time as the parallel submission on the respective bonded
engine. For this we add support for infinite batch durations and the batch
terminate command ('T'). Syntax looks like this:

  1.RCS.*.0.0
  T.-1

First step starts an infinite batch, and second command terminates the
infinite batch with the usual relative workload step addressing.

v2: (Chris)
 * Relax the recursive batch with 4096 nops between BB_START.
 * Check for at least gen8.
 * Simplify relocation entry building.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> # v1
---
 benchmarks/gem_wsim.c                  | 124 ++++++++++++++++++-------
 benchmarks/wsim/README                 |   9 +-
 benchmarks/wsim/frame-split-60fps.wsim |   6 +-
 3 files changed, 104 insertions(+), 35 deletions(-)

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 6fb6c8cef2c7..47a1ddaf8792 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -86,6 +86,7 @@ enum w_type
 	ENGINE_MAP,
 	LOAD_BALANCE,
 	BOND,
+	TERMINATE,
 };
 
 struct deps
@@ -113,6 +114,7 @@ struct w_step
 	unsigned int context;
 	unsigned int engine;
 	struct duration duration;
+	bool unbound_duration;
 	struct deps data_deps;
 	struct deps fence_deps;
 	int emit_fence;
@@ -143,7 +145,7 @@ struct w_step
 
 	struct drm_i915_gem_execbuffer2 eb;
 	struct drm_i915_gem_exec_object2 *obj;
-	struct drm_i915_gem_relocation_entry reloc[4];
+	struct drm_i915_gem_relocation_entry reloc[5];
 	unsigned long bb_sz;
 	uint32_t bb_handle;
 	uint32_t *seqno_value;
@@ -153,6 +155,7 @@ struct w_step
 	uint32_t *rt1_address;
 	uint32_t *latch_value;
 	uint32_t *latch_address;
+	uint32_t *recursive_bb_start;
 };
 
 DECLARE_EWMA(uint64_t, rt, 4, 2)
@@ -517,6 +520,10 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
 
 				step.type = ENGINE_MAP;
 				goto add_step;
+			} else if (!strcmp(field, "T")) {
+				int_field(TERMINATE, target,
+					  tmp >= 0 || ((int)nr_steps + tmp) < 0,
+					  "Invalid terminate target at step %u!\n");
 			} else if (!strcmp(field, "X")) {
 				unsigned int nr = 0;
 				while ((field = strtok_r(fstart, ".", &fctx))) {
@@ -632,23 +639,31 @@ parse_workload(struct w_arg *arg, unsigned int flags, struct workload *app_w)
 
 			fstart = NULL;
 
-			tmpl = strtol(field, &sep, 10);
-			check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
-				  tmpl == LONG_MAX,
-				  "Invalid duration at step %u!\n", nr_steps);
-			step.duration.min = tmpl;
-
-			if (sep && *sep == '-') {
-				tmpl = strtol(sep + 1, NULL, 10);
-				check_arg(tmpl <= 0 ||
-					  tmpl <= step.duration.min ||
-					  tmpl == LONG_MIN ||
-					  tmpl == LONG_MAX,
-					  "Invalid duration range at step %u!\n",
+			if (field[0] == '*') {
+				check_arg(intel_gen(intel_get_drm_devid(fd)) < 8,
+					  "Infinite batch at step %u needs Gen8+!\n",
 					  nr_steps);
-				step.duration.max = tmpl;
+				step.unbound_duration = true;
 			} else {
-				step.duration.max = step.duration.min;
+				tmpl = strtol(field, &sep, 10);
+				check_arg(tmpl <= 0 || tmpl == LONG_MIN ||
+					  tmpl == LONG_MAX,
+					  "Invalid duration at step %u!\n",
+					  nr_steps);
+				step.duration.min = tmpl;
+
+				if (sep && *sep == '-') {
+					tmpl = strtol(sep + 1, NULL, 10);
+					check_arg(tmpl <= 0 ||
+						tmpl <= step.duration.min ||
+						tmpl == LONG_MIN ||
+						tmpl == LONG_MAX,
+						"Invalid duration range at step %u!\n",
+						nr_steps);
+					step.duration.max = tmpl;
+				} else {
+					step.duration.max = step.duration.min;
+				}
 			}
 
 			valid++;
@@ -808,7 +823,7 @@ init_bb(struct w_step *w, unsigned int flags)
 	unsigned int i;
 	uint32_t *ptr;
 
-	if (!arb_period)
+	if (w->unbound_duration || !arb_period)
 		return;
 
 	gem_set_domain(fd, w->bb_handle,
@@ -822,12 +837,13 @@ init_bb(struct w_step *w, unsigned int flags)
 	munmap(ptr, mmap_len);
 }
 
-static void
+static unsigned int
 terminate_bb(struct w_step *w, unsigned int flags)
 {
 	const uint32_t bbe = 0xa << 23;
 	unsigned long mmap_start, mmap_len;
 	unsigned long batch_start = w->bb_sz;
+	unsigned int r = 0;
 	uint32_t *ptr, *cs;
 
 	igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
@@ -838,6 +854,9 @@ terminate_bb(struct w_step *w, unsigned int flags)
 	if (flags & RT)
 		batch_start -= 12 * sizeof(uint32_t);
 
+	if (w->unbound_duration)
+		batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
+
 	mmap_start = rounddown(batch_start, PAGE_SIZE);
 	mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
 
@@ -847,8 +866,19 @@ terminate_bb(struct w_step *w, unsigned int flags)
 	ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
 	cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
 
+	if (w->unbound_duration) {
+		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
+		batch_start += 4 * sizeof(uint32_t);
+
+		*cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
+		w->recursive_bb_start = cs;
+		*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
+		*cs++ = 0;
+		*cs++ = 0;
+	}
+
 	if (flags & SEQNO) {
-		w->reloc[0].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -860,7 +890,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
 	}
 
 	if (flags & RT) {
-		w->reloc[1].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -870,7 +900,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
 		w->rt0_value = cs;
 		*cs++ = 0;
 
-		w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
@@ -879,7 +909,7 @@ terminate_bb(struct w_step *w, unsigned int flags)
 		*cs++ = 0;
 		*cs++ = 0;
 
-		w->reloc[3].offset = batch_start + sizeof(uint32_t);
+		w->reloc[r++].offset = batch_start + sizeof(uint32_t);
 		batch_start += 4 * sizeof(uint32_t);
 
 		*cs++ = MI_STORE_DWORD_IMM;
@@ -891,6 +921,8 @@ terminate_bb(struct w_step *w, unsigned int flags)
 	}
 
 	*cs = bbe;
+
+	return r;
 }
 
 static const unsigned int eb_engine_map[NUM_ENGINES] = {
@@ -1011,19 +1043,22 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
 		}
 	}
 
-	w->bb_sz = get_bb_sz(w->duration.max);
-	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz);
+	if (w->unbound_duration)
+		/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
+		w->bb_sz = max(PAGE_SIZE, get_bb_sz(w->preempt_us)) +
+			   (1 + 3) * sizeof(uint32_t);
+	else
+		w->bb_sz = get_bb_sz(w->duration.max);
+	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
 	init_bb(w, flags);
-	terminate_bb(w, flags);
+	w->obj[j].relocation_count = terminate_bb(w, flags);
 
-	if (flags & SEQNO) {
+	if (w->obj[j].relocation_count) {
 		w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
-		if (flags & RT)
-			w->obj[j].relocation_count = 4;
-		else
-			w->obj[j].relocation_count = 1;
 		for (i = 0; i < w->obj[j].relocation_count; i++)
 			w->reloc[i].target_handle = 1;
+		if (w->unbound_duration)
+			w->reloc[0].target_handle = j;
 	}
 
 	w->eb.buffers_ptr = to_user_pointer(w->obj);
@@ -2113,6 +2148,18 @@ update_bb_rt(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
 	}
 }
 
+static void
+update_bb_start(struct w_step *w)
+{
+	if (!w->unbound_duration)
+		return;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+
+	*w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
+}
+
 static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
 {
 	if (target < 0)
@@ -2248,9 +2295,13 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
 	if (flags & RT)
 		update_bb_rt(w, engine, seqno);
 
+	update_bb_start(w);
+
 	w->eb.batch_start_offset =
+		w->unbound_duration ?
+		0 :
 		ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
-			2 * sizeof(uint32_t));
+		      2 * sizeof(uint32_t));
 
 	for (i = 0; i < w->fence_deps.nr; i++) {
 		int tgt = w->idx + w->fence_deps.list[i];
@@ -2390,6 +2441,17 @@ static void *run_workload(void *data)
 								    w->priority;
 				}
 				continue;
+			} else if (w->type == TERMINATE) {
+				unsigned int t_idx = i + w->target;
+
+				igt_assert(t_idx >= 0 && t_idx < i);
+				igt_assert(wrk->steps[t_idx].type == BATCH);
+				igt_assert(wrk->steps[t_idx].unbound_duration);
+
+				*wrk->steps[t_idx].recursive_bb_start =
+					MI_BATCH_BUFFER_END;
+				__sync_synchronize();
+				continue;
 			} else if (w->type == PREEMPTION ||
 				   w->type == ENGINE_MAP ||
 				   w->type == LOAD_BALANCE ||
diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
index c5107326a681..497d5cad2142 100644
--- a/benchmarks/wsim/README
+++ b/benchmarks/wsim/README
@@ -2,11 +2,11 @@ Workload descriptor format
 ==========================
 
 ctx.engine.duration_us.dependency.wait,...
-<uint>.<str>.<uint>[-<uint>].<int <= 0>[/<int <= 0>][...].<0|1>,...
+<uint>.<str>.<uint>[-<uint>]|*.<int <= 0>[/<int <= 0>][...].<0|1>,...
 B.<uint>
 M.<uint>.<str>[|<str>]...
 P|X.<uint>.<int>
-d|p|s|t|q|a.<int>,...
+d|p|s|t|q|a|T.<int>,...
 b.<uint>.<str>[|<str>].<str>
 f
 
@@ -30,6 +30,7 @@ Additional workload steps are also supported:
  'b' - Set up engine bonds.
  'M' - Set up engine map.
  'P' - Context priority.
+ 'T' - Terminate an infinite batch.
  'X' - Context preemption control.
 
 Engine ids: DEFAULT, RCS, BCS, VCS, VCS1, VCS2, VECS
@@ -77,6 +78,10 @@ Example:
 
 I this case the last step has a data dependency on both first and second steps.
 
+Batch durations can also be specified as infinite by using the '*' in the
+duration field. Such batches must be ended by the terminate command ('T')
+otherwise they will cause a GPU hang to be reported.
+
 Sync (fd) fences
 ----------------
 
diff --git a/benchmarks/wsim/frame-split-60fps.wsim b/benchmarks/wsim/frame-split-60fps.wsim
index 20fdcf8c8b4a..17490ddfaddd 100644
--- a/benchmarks/wsim/frame-split-60fps.wsim
+++ b/benchmarks/wsim/frame-split-60fps.wsim
@@ -6,10 +6,12 @@ M.2.VCS2
 B.2
 b.2.VCS2.VCS1
 f
-1.DEFAULT.4000-6000.f-1.0
+1.DEFAULT.*.f-1.0
 2.DEFAULT.4000-6000.s-1.0
 a.-3
-3.RCS.2000-4000.-3/-2.0
+s.-2
+T.-4
+3.RCS.2000-4000.-5/-4.0
 3.VECS.2000.-1.0
 4.BCS.1000.-1.0
 s.-2
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx