[PATCH i-g-t v4] benchmarks/gem_wsim: Command submission workload simulator

Tvrtko Ursulin <tursulin@xxxxxxxxxxx> · Thu, 20 Apr 2017 13:29:11 +0100

From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

v3:
 * Multiple parallel different workloads (-w -w ...).
 * Multi-context workloads.
 * Variable (random) batch length.
 * Load balancing (round robin and queue depth estimation).
 * Workloads delays and explicit sync steps.
 * Workload frequency (period) control.

v4:
 * Fixed queue-depth estimation by creating separate batches
   per engine when qd load balancing is on.
 * Dropped separate -s cmd line option. It can turn itself on
   automatically when needed.
 * Keep a single status page and lie about the write hazard
   as suggested by Chris.
 * Use batch_start_offset for controlling the batch duration.
   (Chris)
 * Set status page object cache level. (Chris)
 * Moved workload description to a README.
 * Tidied example workloads.
 * Some other cleanups and refactorings.

TODO list:

 * Fence support.
 * Better error handling.
 * Less 1980's workload parsing.
 * Proper workloads.
 * Threads?
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@xxxxxxxxx>
---

Comparing some test workloads under load balancing it seems that it is starting
to work, but it still needs more thorough verification. For example, round-
robin balancing:

# benchmarks/gem_wsim -n 585341 \
		      -w benchmarks/wsim/vcs1.wsim \
		      -w benchmarks/wsim/vcs_balanced.wsim \
		      -r 100 -b 0
Using 585341 nop calibration for 1000us delay.
2 clients.
1: 3.008s elapsed (33.243 workloads/s). 2500 (1250 + 1250) total VCS batches.
0: 4.455s elapsed (22.449 workloads/s). 0 (2500 + 0) total VCS batches.
4.455s elapsed (44.889 workloads/s)


Versus the queue-depth estimation:

# benchmarks/gem_wsim -n 585341 \
		      -w benchmarks/wsim/vcs1.wsim \
		      -w benchmarks/wsim/vcs_balanced.wsim \
		      -r 100 -b 1
Using 585341 nop calibration for 1000us delay.
2 clients.
1: 2.239s elapsed (44.659 workloads/s). 2500 (837 + 1663) total VCS batches. Average queue depths 27.575, 19.285.
0: 4.012s elapsed (24.928 workloads/s). 0 (2500 + 0) total VCS batches. Average queue depths -nan, -nan.
4.012s elapsed (49.845 workloads/s)

In both cases we run two workloads, one which only submits to VCS1 and one which
can be load-balanced. The latter gets a ~33% boost with queue-depth estimation,
and the non-balancing workload ~10%.

---
 benchmarks/Makefile.sources                  |    1 +
 benchmarks/gem_wsim.c                        | 1014 ++++++++++++++++++++++++++
 benchmarks/wsim/README                       |   54 ++
 benchmarks/wsim/media_17i7.wsim              |    7 +
 benchmarks/wsim/media_load_balance_17i7.wsim |    7 +
 benchmarks/wsim/vcs1.wsim                    |   25 +
 benchmarks/wsim/vcs_balanced.wsim            |   25 +
 7 files changed, 1133 insertions(+)
 create mode 100644 benchmarks/gem_wsim.c
 create mode 100644 benchmarks/wsim/README
 create mode 100644 benchmarks/wsim/media_17i7.wsim
 create mode 100644 benchmarks/wsim/media_load_balance_17i7.wsim
 create mode 100644 benchmarks/wsim/vcs1.wsim
 create mode 100644 benchmarks/wsim/vcs_balanced.wsim

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe36f2..3a941150abb3 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list =			\
 	gem_prw				\
 	gem_set_domain			\
 	gem_syslatency			\
+	gem_wsim			\
 	kms_vblank			\
 	prime_lookup			\
 	vgem_mmap			\
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 000000000000..adf2d6decf12
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,1014 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+
+#include "intel_chipset.h"
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+
+/*
+ * Engine identifiers used by this tool.
+ *
+ * The values index the per-engine tables (eb_engine_map, ring_str_map) and
+ * the per-engine arrays in struct workload.
+ */
+enum intel_engine_id {
+	RCS,
+	BCS,
+	VCS,	/* I915_EXEC_BSD without a ring selector; may be load balanced. */
+	VCS1,
+	VCS2,
+	VECS,
+	NUM_ENGINES	/* Not an engine; used to size the per-engine arrays. */
+};
+
+/*
+ * Batch duration range in microseconds. When min == max the duration is
+ * fixed, otherwise get_duration() picks a random value from [min, max].
+ */
+struct duration {
+	unsigned int min, max;
+};
+
+/* Kind of a workload step, selected by the first field of its record. */
+enum w_type
+{
+	BATCH,	/* ctx.engine.duration.dependency.wait: submit a batch buffer. */
+	SYNC,	/* 's': wait for the batch of an earlier step to complete. */
+	DELAY,	/* 'd': sleep for the given number of microseconds. */
+	PERIOD	/* 'p': sleep until the given time since the loop start passed. */
+};
+
+/* One step of a workload, parsed from a single record of the descriptor. */
+struct w_step
+{
+	/* Workload step metadata */
+	enum w_type type;
+	unsigned int context;		/* Index into workload ctx_id[]. */
+	unsigned int engine;		/* An enum intel_engine_id value. */
+	struct duration duration;	/* Batch duration in microseconds. */
+	int dependency;			/* <= 0: relative step index, 0 = none. */
+	/*
+	 * BATCH:        0|1, whether to wait for the batch after submitting.
+	 * DELAY/PERIOD: time in microseconds.
+	 * SYNC:         negative relative index of the step to wait for.
+	 */
+	int wait;
+
+	/* Implementation details */
+	unsigned int idx;		/* Position of the step in the workload. */
+
+	struct w_step_eb {
+		struct drm_i915_gem_execbuffer2 eb;
+		/*
+		 * alloc_step_batch() fills at most five entries: the step's
+		 * own data object, the status page, the data objects of both
+		 * per-VCS batches of a dependency, and the batch itself.
+		 */
+		struct drm_i915_gem_exec_object2 obj[5];
+		struct drm_i915_gem_relocation_entry reloc;
+		unsigned long bb_sz;		/* Batch object size in bytes. */
+		uint32_t bb_handle;
+		/* Mapping of the batch tail and the seqno dword within it. */
+		uint32_t *mapped_batch, *mapped_seqno;
+		unsigned int mapped_len;	/* Length of the mapping in bytes. */
+	} b[2]; /* One for each VCS when load balancing */
+};
+
+/* A parsed workload plus the per-client state needed to run it. */
+struct workload
+{
+	unsigned int nr_steps;
+	struct w_step *steps;
+
+	/* Time at which the current pass over the steps started. */
+	struct timespec repeat_start;
+
+	/* GEM context ids, indexed by the context number used in the steps. */
+	unsigned int nr_ctxs;
+	uint32_t *ctx_id;
+
+	/* Last seqno handed out per engine (only used for VCS1 and VCS2). */
+	unsigned long seqno[NUM_ENGINES];
+	uint32_t status_page_handle;
+	/* CPU mapping of the status page: dword 0 is VCS1, dword 1 is VCS2. */
+	uint32_t *status_page;
+	/* Index (0 or 1) of the VCS engine to use next for round-robin. */
+	unsigned int vcs_rr;
+
+	/* Statistics: sum of sampled queue depths and batches per engine. */
+	unsigned long qd_sum[NUM_ENGINES];
+	unsigned long nr_bb[NUM_ENGINES];
+};
+
+/* Execbuf engine selection flags, indexed by enum intel_engine_id. */
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+	[RCS] = I915_EXEC_RENDER,
+	[BCS] = I915_EXEC_BLT,
+	[VCS] = I915_EXEC_BSD,
+	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+	[VECS] = I915_EXEC_VEBOX
+};
+
+/*
+ * Calibration: nop_calibration is the number of dwords in a batch which
+ * executes for nop_calibration_us microseconds. It is either given with -n
+ * or measured by calibrate_nop().
+ */
+static const unsigned int nop_calibration_us = 1000;
+static unsigned long nop_calibration;
+
+static bool quiet;	/* -q: suppress informational and error messages. */
+static int fd;		/* DRM device file descriptor opened in main(). */
+
+/* Flags passed to prepare_workload(), alloc_step_batch() and run_workload(). */
+#define SWAPVCS	(1<<0)	/* Submit VCS1 steps to VCS2 and vice versa. */
+#define SEQNO	(1<<1)	/* VCS batches store a seqno into the status page. */
+#define BALANCE	(1<<2)	/* VCS steps get one batch per physical VCS engine. */
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...
+ *
+ * Besides batch records the parser also accepts d.<n>, p.<n> and s.<-n>
+ * records (delay, period and sync steps).
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+/*
+ * Engine names accepted in descriptors, indexed by enum intel_engine_id.
+ * parse_workload() compares them case-insensitively.
+ */
+static const char *ring_str_map[NUM_ENGINES] = {
+	[RCS] = "RCS",
+	[BCS] = "BCS",
+	[VCS] = "VCS",
+	[VCS1] = "VCS1",
+	[VCS2] = "VCS2",
+	[VECS] = "VECS",
+};
+
+/* Reports an invalid part of a step (unless quiet) and returns -1. */
+static int step_error(const char *what, unsigned int nr_steps)
+{
+	if (!quiet)
+		fprintf(stderr, "Invalid %s at step %u!\n", what, nr_steps);
+
+	return -1;
+}
+
+/*
+ * Parses one record (the text between two commas) into @step.
+ *
+ * @token is tokenised in place. @nr_steps is the index of the record and is
+ * only used for error messages. Fields beyond the ones a record needs are
+ * ignored.
+ *
+ * Returns 0 on success and -1 on a malformed record.
+ */
+static int parse_step(char *token, unsigned int nr_steps, struct w_step *step)
+{
+	char *field, *fctx = NULL, *sep = NULL;
+	unsigned int i;
+	long int tmpl;
+	int tmp;
+
+	memset(step, 0, sizeof(*step));
+
+	field = strtok_r(token, ".", &fctx);
+	if (!field)
+		return step_error("record", nr_steps);
+
+	if (!strcasecmp(field, "d") || !strcasecmp(field, "p")) {
+		const bool delay = !strcasecmp(field, "d");
+
+		/* A missing argument is rejected like an invalid one. */
+		field = strtok_r(NULL, ".", &fctx);
+		tmp = field ? atoi(field) : 0;
+		if (tmp <= 0)
+			return step_error(delay ? "delay" : "period", nr_steps);
+
+		step->type = delay ? DELAY : PERIOD;
+		step->wait = tmp;
+		return 0;
+	}
+
+	if (!strcasecmp(field, "s")) {
+		/* Sync targets are strictly negative relative indices. */
+		field = strtok_r(NULL, ".", &fctx);
+		tmp = field ? atoi(field) : 0;
+		if (tmp >= 0)
+			return step_error("sync target", nr_steps);
+
+		step->type = SYNC;
+		step->wait = tmp;
+		return 0;
+	}
+
+	/* Field 1: context id. */
+	tmp = atoi(field);
+	if (tmp < 0)
+		return step_error("ctx id", nr_steps);
+	step->context = tmp;
+
+	/* Field 2: engine name. */
+	field = strtok_r(NULL, ".", &fctx);
+	if (!field)
+		return step_error("record", nr_steps);
+	for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
+		if (!strcasecmp(field, ring_str_map[i]))
+			break;
+	}
+	if (i == ARRAY_SIZE(ring_str_map))
+		return step_error("engine id", nr_steps);
+	step->engine = i;
+
+	/* Field 3: duration, either a single value or a min-max range. */
+	field = strtok_r(NULL, ".", &fctx);
+	if (!field)
+		return step_error("record", nr_steps);
+	tmpl = strtol(field, &sep, 10);
+	if (tmpl <= 0 || (unsigned long)tmpl > UINT_MAX)
+		return step_error("duration", nr_steps);
+	step->duration.min = tmpl;
+	step->duration.max = tmpl;
+	if (*sep == '-') {
+		/*
+		 * The upper bound must not be below the lower one, otherwise
+		 * get_duration() would compute a bogus (possibly zero) span.
+		 */
+		tmpl = strtol(sep + 1, NULL, 10);
+		if (tmpl <= 0 || (unsigned long)tmpl > UINT_MAX ||
+		    (unsigned long)tmpl < step->duration.min)
+			return step_error("duration range", nr_steps);
+		step->duration.max = tmpl;
+	}
+
+	/* Field 4: dependency, a relative index pointing backwards. */
+	field = strtok_r(NULL, ".", &fctx);
+	if (!field)
+		return step_error("record", nr_steps);
+	tmp = atoi(field);
+	if (tmp > 0)
+		return step_error("forward dependency", nr_steps);
+	step->dependency = tmp;
+
+	/* Field 5: whether to wait for the batch after submitting it. */
+	field = strtok_r(NULL, ".", &fctx);
+	if (!field)
+		return step_error("record", nr_steps);
+	tmp = atoi(field);
+	if (tmp != 0 && tmp != 1)
+		return step_error("wait boolean", nr_steps);
+	step->wait = tmp;
+
+	step->type = BATCH;
+
+	return 0;
+}
+
+/*
+ * Parses a comma separated workload descriptor into a newly allocated
+ * workload which has only nr_steps and steps set (everything else is zero).
+ *
+ * @_desc is not modified; parsing works on a private copy.
+ *
+ * Returns NULL, after printing a message unless quiet, if any record is
+ * malformed or if the descriptor contains no records at all. Nothing is
+ * leaked on the error paths.
+ */
+static struct workload *parse_workload(char *_desc)
+{
+	struct workload *wrk;
+	unsigned int nr_steps = 0;
+	struct w_step step, *steps = NULL;
+	char *desc, *token, *tctx = NULL;
+
+	desc = strdup(_desc);
+	igt_assert(desc);
+
+	/*
+	 * Each record can be tokenised in place: the outer strtok_r has
+	 * already terminated it and keeps its own position in tctx.
+	 */
+	for (token = strtok_r(desc, ",", &tctx); token;
+	     token = strtok_r(NULL, ",", &tctx)) {
+		if (parse_step(token, nr_steps, &step))
+			goto err;
+
+		step.idx = nr_steps++;
+		steps = realloc(steps, sizeof(step) * nr_steps);
+		igt_assert(steps);
+
+		steps[nr_steps - 1] = step;
+	}
+
+	if (!nr_steps) {
+		if (!quiet)
+			fprintf(stderr, "Empty workload descriptor!\n");
+		goto err;
+	}
+
+	wrk = calloc(1, sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = nr_steps;
+	wrk->steps = steps;
+
+	free(desc);
+
+	return wrk;
+
+err:
+	free(steps);
+	free(desc);
+
+	return NULL;
+}
+
+/*
+ * Returns a new workload holding a private copy of the steps of @_wrk.
+ * All other members of the copy are zero.
+ */
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+	const size_t steps_sz = sizeof(struct w_step) * _wrk->nr_steps;
+	struct workload *wrk;
+
+	/* Zeroed allocation, so only the steps need filling in. */
+	wrk = calloc(1, sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = _wrk->nr_steps;
+	wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step));
+	igt_assert(wrk->steps);
+	memcpy(wrk->steps, _wrk->steps, steps_sz);
+
+	return wrk;
+}
+
+/*
+ * Rounds x down to a multiple of y. The arguments are parenthesised so that
+ * expressions can be passed safely; note that x is evaluated twice.
+ */
+#define rounddown(x, y) ((x) - ((x) % (y)))
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (4096)
+#endif
+
+/*
+ * Returns the duration to use for the next submission: the fixed value when
+ * min == max, otherwise a pseudo-random value from [min, max].
+ */
+static unsigned int get_duration(struct duration *dur)
+{
+	unsigned int span;
+
+	if (dur->min == dur->max)
+		return dur->min;
+
+	span = dur->max + 1 - dur->min;
+
+	return dur->min + rand() % span;
+}
+
+/*
+ * Converts a duration in microseconds to a batch size in bytes, using the
+ * nop calibration, aligned to a whole dword.
+ */
+static unsigned long get_bb_sz(unsigned int duration)
+{
+	const unsigned long bytes =
+		duration * nop_calibration * sizeof(uint32_t);
+
+	return ALIGN(bytes / nop_calibration_us, sizeof(uint32_t));
+}
+
+/*
+ * Writes the tail of the batch object of @b: an optional seqno store followed
+ * by the terminating command.
+ *
+ * Only the pages covering the tail are mapped. The mapping is left in place
+ * and recorded in @b (mapped_batch, mapped_len) together with a pointer to
+ * the seqno value dword (mapped_seqno), which update_bb_seqno() writes later.
+ * When SEQNO is set the relocation entry of @b is pointed at the address part
+ * of the store, with a delta selecting the status page dword of @engine.
+ */
+static void
+terminate_bb(struct w_step *w, struct w_step_eb *b, enum intel_engine_id engine,
+	     unsigned int flags)
+{
+	/* Presumably MI_BATCH_BUFFER_END; written as the last dword below. */
+	const uint32_t bbe = 0xa << 23;
+	unsigned long bb_sz = get_bb_sz(w->duration.max);
+	unsigned long mmap_start, cmd_offset, mmap_len;
+	uint32_t *ptr, *cs;
+
+	/* One dword for the terminator plus four for the seqno store. */
+	mmap_len = 1;
+	if (flags & SEQNO)
+		mmap_len += 4;
+	mmap_len *= sizeof(uint32_t);
+	cmd_offset = bb_sz - mmap_len;
+	/* The mapping starts on a page boundary, so grow it accordingly. */
+	mmap_start = rounddown(cmd_offset, PAGE_SIZE);
+	mmap_len += cmd_offset - mmap_start;
+
+	gem_set_domain(fd, b->bb_handle,
+		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+	ptr = gem_mmap__wc(fd, b->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+	cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
+
+	if (flags & SEQNO) {
+		/* The relocation covers the dword right after the command. */
+		b->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
+		b->reloc.delta = (engine - VCS1) * sizeof(uint32_t);
+
+		*cs++ = MI_STORE_DWORD_IMM;
+		*cs++ = 0;
+		*cs++ = 0;
+		/* Remember where the value lives so it can be rewritten. */
+		b->mapped_seqno = cs;
+		*cs++ = 0;
+	}
+
+	*cs = bbe;
+
+	b->mapped_batch = ptr;
+	b->mapped_len = mmap_len;
+}
+
+/*
+ * Creates the objects of one batch of step @w and fills in the execbuf
+ * structure of @b so that run_workload() can submit it repeatedly.
+ *
+ * The object list is built in this order:
+ *   - a new 4096 byte object marked as written; other steps list it to
+ *     express a dependency on this step and gem_sync() waits on it,
+ *   - the status page, when SEQNO is set,
+ *   - the obj[0] of each batch of the step this one depends on, if any,
+ *   - the batch itself, which is kept as the last entry.
+ *
+ * In the worst case (SEQNO set and a dependency which has two batches) this
+ * fills five entries of b->obj, so that array must have room for five.
+ */
+static void
+alloc_step_batch(struct workload *wrk, struct w_step *w, struct w_step_eb *b,
+		 enum intel_engine_id engine, unsigned int flags)
+{
+	unsigned int bb_i, j = 0;
+
+	b->obj[j].handle = gem_create(fd, 4096);
+	b->obj[j].flags = EXEC_OBJECT_WRITE;
+	j++;
+
+	if (flags & SEQNO) {
+		b->obj[j].handle = wrk->status_page_handle;
+		j++;
+	}
+
+	/* The batch is sized for the longest duration of the step. */
+	bb_i = j++;
+	b->bb_sz = get_bb_sz(w->duration.max);
+	b->bb_handle = b->obj[bb_i].handle = gem_create(fd, b->bb_sz);
+	terminate_bb(w, b, engine, flags);
+
+	igt_assert(w->dependency <= 0);
+	if (w->dependency) {
+		int dep_idx = w->idx + w->dependency;
+
+		igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+		igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+		/* Move the batch one slot up and put the dependency before it. */
+		b->obj[j].handle = b->obj[bb_i].handle;
+		bb_i = j;
+		b->obj[j - 1].handle = wrk->steps[dep_idx].b[0].obj[0].handle;
+		j++;
+
+		/* Same again for the second batch of a balanced dependency. */
+		if (wrk->steps[dep_idx].b[1].obj[0].handle) {
+			b->obj[j].handle = b->obj[bb_i].handle;
+			bb_i = j;
+			b->obj[j - 1].handle =
+					wrk->steps[dep_idx].b[1].obj[0].handle;
+			j++;
+		}
+	}
+
+	if (flags & SEQNO) {
+		b->reloc.presumed_offset = -1;
+		/*
+		 * Presumably an index into obj[] (the status page) rather
+		 * than a handle, since I915_EXEC_HANDLE_LUT is set below.
+		 */
+		b->reloc.target_handle = 1;
+		b->obj[bb_i].relocs_ptr = to_user_pointer(&b->reloc);
+		b->obj[bb_i].relocation_count = 1;
+	}
+
+	b->eb.buffers_ptr = to_user_pointer(b->obj);
+	b->eb.buffer_count = j;
+	b->eb.rsvd1 = wrk->ctx_id[w->context];
+
+	/* With SWAPVCS the two explicit VCS engines trade places. */
+	if (flags & SWAPVCS && engine == VCS1)
+		engine = VCS2;
+	else if (flags & SWAPVCS && engine == VCS2)
+		engine = VCS1;
+	b->eb.flags = eb_engine_map[engine];
+	b->eb.flags |= I915_EXEC_HANDLE_LUT;
+	/* Relocations are only skipped when there is no seqno store. */
+	if (!(flags & SEQNO))
+		b->eb.flags |= I915_EXEC_NO_RELOC;
+#ifdef DEBUG
+	printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+		w->idx, b->eb.buffer_count, b->obj[0].handle,
+		b->obj[1].handle, b->obj[2].handle, b->obj[3].handle,
+		b->bb_sz, b->eb.flags, b->bb_handle, bb_i,
+		w->context, wrk->ctx_id[w->context]);
+#endif
+}
+
+/*
+ * Creates all kernel objects needed to run @wrk: the status page (when SEQNO
+ * is set), one GEM context per context number used by the steps, and the
+ * batch(es) of every BATCH step.
+ *
+ * A step on the generic VCS engine gets one batch per physical VCS engine
+ * when BALANCE is set. SEQNO is dropped for steps on non-VCS engines and
+ * SWAPVCS is dropped for steps on the generic VCS engine.
+ */
+static void
+prepare_workload(struct workload *wrk, unsigned int flags)
+{
+	int max_ctx = -1;
+	struct w_step *w;
+	int i;
+
+	if (flags & SEQNO) {
+		/* One seqno dword for each of VCS1 and VCS2. */
+		const unsigned int status_sz = 2 * sizeof(uint32_t);
+		uint32_t handle = gem_create(fd, status_sz);
+
+		gem_set_caching(fd, handle, I915_CACHING_CACHED);
+		wrk->status_page_handle = handle;
+		wrk->status_page = gem_mmap__cpu(fd, handle, 0, status_sz,
+						 PROT_READ);
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		if ((int)w->context > max_ctx) {
+			int delta = w->context + 1 - wrk->nr_ctxs;
+
+			/* Grow the table and zero the new slots. */
+			wrk->nr_ctxs += delta;
+			wrk->ctx_id = realloc(wrk->ctx_id,
+					      wrk->nr_ctxs * sizeof(uint32_t));
+			igt_assert(wrk->ctx_id);
+			memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
+			       delta * sizeof(uint32_t));
+
+			max_ctx = w->context;
+		}
+
+		/* A zero id marks a context which was not created yet. */
+		if (!wrk->ctx_id[w->context]) {
+			struct drm_i915_gem_context_create arg = {};
+
+			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+			igt_assert(arg.ctx_id);
+
+			wrk->ctx_id[w->context] = arg.ctx_id;
+		}
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		unsigned int _flags = flags;
+		enum intel_engine_id engine = w->engine;
+
+		if (w->type != BATCH)
+			continue;
+
+		if (engine != VCS && engine != VCS1 && engine != VCS2)
+			_flags &= ~SEQNO;
+
+		if (engine == VCS)
+			_flags &= ~SWAPVCS;
+
+		if (engine == VCS && flags & BALANCE) {
+			alloc_step_batch(wrk, w, &w->b[0], VCS1, _flags);
+			alloc_step_batch(wrk, w, &w->b[1], VCS2, _flags);
+		} else {
+			alloc_step_batch(wrk, w, &w->b[0], engine, _flags);
+		}
+	}
+}
+
+/* Returns the time from @start to @end in seconds. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	const double secs = end->tv_sec - start->tv_sec;
+	const double nsecs = end->tv_nsec - start->tv_nsec;
+
+	return secs + nsecs / 1e9;
+}
+
+/* Returns the time from @start to @end in microseconds, truncated to int. */
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{
+	const double secs = elapsed(start, end);
+
+	return secs * 1e6;
+}
+
+/* Maps a VCS index (0 or 1) to VCS1 or VCS2; asserts on any other index. */
+static enum intel_engine_id get_vcs_engine(unsigned int n)
+{
+	static const enum intel_engine_id vcs_engines[] = {
+		VCS1,
+		VCS2,
+	};
+
+	igt_assert(n < ARRAY_SIZE(vcs_engines));
+
+	return vcs_engines[n];
+}
+
+
+/*
+ * Round-robin balancer: returns the VCS engine selected by wrk->vcs_rr and
+ * flips the selector for the next call. @w is not used.
+ */
+static enum intel_engine_id
+rr_balance(struct workload *wrk, struct w_step *w)
+{
+	const enum intel_engine_id picked = get_vcs_engine(wrk->vcs_rr);
+
+	wrk->vcs_rr = wrk->vcs_rr ^ 1;
+
+	return picked;
+}
+
+/*
+ * Queue-depth balancer: estimates the queue depth of each VCS engine as the
+ * last seqno submitted to it minus the value currently in its status page
+ * dword, and returns the engine with the shallower queue. On a tie the
+ * round-robin selector decides. The selector is then pointed at the engine
+ * which was not picked, and both depths are added to the running sums.
+ */
+static enum intel_engine_id
+qd_balance(struct workload *wrk, struct w_step *w)
+{
+	enum intel_engine_id engine = w->engine;
+	long qd[NUM_ENGINES];
+	unsigned int n;
+
+	igt_assert(engine == VCS);
+
+	/* Sample VCS1 (status dword 0) first, then VCS2 (status dword 1). */
+	for (n = 0; n < 2; n++) {
+		const unsigned int id = VCS1 + n;
+
+		qd[id] = wrk->seqno[id] - wrk->status_page[n];
+		wrk->qd_sum[id] += qd[id];
+	}
+
+	if (qd[VCS1] == qd[VCS2])
+		n = wrk->vcs_rr;
+	else
+		n = qd[VCS1] < qd[VCS2] ? 0 : 1;
+
+	engine = get_vcs_engine(n);
+	wrk->vcs_rr = n ^ 1;
+
+#ifdef DEBUG
+	printf("qd_balance: 1:%ld 2:%ld rr:%u = %u\t(%lu - %u) (%lu - %u)\n",
+	       qd[VCS1], qd[VCS2], wrk->vcs_rr, engine,
+	       wrk->seqno[VCS1], wrk->status_page[0],
+	       wrk->seqno[VCS2], wrk->status_page[1]);
+#endif
+	return engine;
+}
+
+/*
+ * Prepares a batch for its next submission on @engine: selects the status
+ * page dword of that engine via the relocation delta and stores @seqno into
+ * the mapped value dword of the batch.
+ */
+static void
+update_bb_seqno(struct w_step_eb *b, enum intel_engine_id engine,
+		uint32_t seqno)
+{
+	const unsigned int slot = engine - VCS1;
+
+	b->reloc.delta = slot * sizeof(uint32_t);
+	*b->mapped_seqno = seqno;
+}
+
+/*
+ * Runs all steps of @wrk @repeat times and prints per-client statistics
+ * unless quiet.
+ *
+ * @id:      client number, only used in the messages.
+ * @balance: load balancer used for steps on the generic VCS engine, or NULL
+ *           to submit them unbalanced.
+ * @flags:   when SEQNO is set, balanced batches get a new seqno before each
+ *           submission.
+ *
+ * Before returning, waits for the last batch step of the workload.
+ */
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
+	     enum intel_engine_id (*balance)(struct workload *wrk,
+					     struct w_step *w),
+	     unsigned int flags)
+{
+	struct timespec t_start, t_end;
+	struct w_step *w;
+	double t;
+	int i, j;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	srand(t_start.tv_nsec);
+
+	for (j = 0; j < repeat; j++) {
+		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+			enum intel_engine_id engine = w->engine;
+			struct w_step_eb *b = &w->b[0];
+
+			if (i == 0)
+				clock_gettime(CLOCK_MONOTONIC,
+					      &wrk->repeat_start);
+
+			if (w->type == DELAY) {
+				usleep(w->wait);
+				continue;
+			} else if (w->type == PERIOD) {
+				struct timespec now;
+				int do_sleep;
+
+				clock_gettime(CLOCK_MONOTONIC, &now);
+				do_sleep = w->wait -
+					   elapsed_us(&wrk->repeat_start, &now);
+				/*
+				 * Never sleep for a negative time and never
+				 * fall through to the submission code: this
+				 * step has no batch.
+				 */
+				if (do_sleep > 0)
+					usleep(do_sleep);
+				else if (do_sleep < 0 && !quiet)
+					printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+					       id, j, i, do_sleep);
+				continue;
+			} else if (w->type == SYNC) {
+				int s_idx = i + w->wait;
+
+				/* The target must be an earlier batch step. */
+				igt_assert(s_idx >= 0 && s_idx < i);
+				igt_assert(wrk->steps[s_idx].type == BATCH);
+				gem_sync(fd, wrk->steps[s_idx].b[0].obj[0].handle);
+				if (wrk->steps[s_idx].b[1].obj[0].handle)
+					gem_sync(fd, wrk->steps[s_idx].b[1].obj[0].handle);
+				continue;
+			}
+
+			/* Counted under the requested engine ... */
+			wrk->nr_bb[engine]++;
+
+			if (engine == VCS && balance) {
+				/* ... and under the engine picked for it. */
+				engine = balance(wrk, w);
+				wrk->nr_bb[engine]++;
+				b = &w->b[engine - VCS1];
+
+				if (flags & SEQNO)
+					update_bb_seqno(b, engine,
+							++wrk->seqno[engine]);
+			}
+
+			if (w->duration.min != w->duration.max) {
+				unsigned int d = get_duration(&w->duration);
+				unsigned long offset;
+
+				/*
+				 * The batch is sized for the longest duration;
+				 * shorter ones start further into it.
+				 */
+				offset = ALIGN(b->bb_sz - get_bb_sz(d),
+					       2 * sizeof(uint32_t));
+				b->eb.batch_start_offset = offset;
+			}
+
+			gem_execbuf(fd, &b->eb);
+
+			if (w->wait)
+				gem_sync(fd, b->obj[0].handle);
+		}
+	}
+
+	/*
+	 * Wait for the last batch step. The last step of the workload is not
+	 * necessarily a batch (it can be a delay, period or sync step), in
+	 * which case it has no objects to wait on.
+	 */
+	for (i = (int)wrk->nr_steps - 1; i >= 0; i--) {
+		if (wrk->steps[i].type == BATCH)
+			break;
+	}
+	if (i >= 0) {
+		gem_sync(fd, wrk->steps[i].b[0].obj[0].handle);
+		if (wrk->steps[i].b[1].obj[0].handle)
+			gem_sync(fd, wrk->steps[i].b[1].obj[0].handle);
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet && !balance)
+		printf("%u: %.3fs elapsed (%.3f workloads/s)\n", id, t, repeat / t);
+	if (!quiet && balance == rr_balance)
+		printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n",
+		       id, t, repeat / t,
+		       wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]);
+	if (!quiet && balance == qd_balance) {
+		/* Report zero instead of 0/0 when nothing was balanced. */
+		const unsigned long nr = wrk->nr_bb[VCS];
+		const double qd1 = nr ? (double)wrk->qd_sum[VCS1] / nr : 0.0;
+		const double qd2 = nr ? (double)wrk->qd_sum[VCS2] / nr : 0.0;
+
+		printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. Average queue depths %.3f, %.3f.\n",
+		       id, t, repeat / t,
+		       wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2],
+		       qd1, qd2);
+	}
+}
+
+/* Frees the step array of @wrk and then @wrk itself. */
+static void fini_workload(struct workload *wrk)
+{
+	struct w_step *steps = wrk->steps;
+
+	free(steps);
+	free(wrk);
+}
+
+/*
+ * Measures how large a batch has to be in order to execute for
+ * nop_calibration_us microseconds.
+ *
+ * A batch of the current size estimate is created, submitted once and waited
+ * for, then submitted `loops` times while timing the whole run. The measured
+ * time gives the next size estimate. This repeats for at least five seconds
+ * and until two consecutive estimates differ by no more than @tolerance_pct
+ * percent.
+ *
+ * Returns the calibrated batch length in dwords.
+ */
+static unsigned long calibrate_nop(unsigned int tolerance_pct)
+{
+	const uint32_t bbe = 0xa << 23;
+	unsigned int loops = 17;
+	unsigned int usecs = nop_calibration_us;
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	long size, last_size;
+	struct timespec t_0, t_end;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+	size = 256 * 1024;
+	do {
+		struct timespec t_start;
+
+		/* Only the final dword of the new object is written. */
+		obj.handle = gem_create(fd, size);
+		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+		/* One untimed submission before the measurement. */
+		gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+
+		clock_gettime(CLOCK_MONOTONIC, &t_start);
+		for (int loop = 0; loop < loops; loop++)
+			gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+		clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+		gem_close(fd, obj.handle);
+
+		last_size = size;
+		size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
+		size = ALIGN(size, sizeof(uint32_t));
+		/* labs(): the sizes are long, abs() would truncate to int. */
+	} while (elapsed(&t_0, &t_end) < 5 ||
+		 labs(size - last_size) > (size * (long)tolerance_pct / 100));
+
+	return size / sizeof(uint32_t);
+}
+
+/* Prints the usage text to stdout. */
+static void print_help(void)
+{
+	puts(
+"Usage: gem_wsim [OPTIONS]\n"
+"\n"
+"Runs a simulated workload on the GPU.\n"
+"When run without -n, performs a GPU calibration, the result of which needs\n"
+"to be provided with -n when running the simulation in later invocations.\n"
+"\n"
+"Options:\n"
+"	-h		This text.\n"
+"	-q		Be quiet - do not output anything to stdout.\n"
+"	-n <n>		Nop calibration value.\n"
+"	-t <n>		Nop calibration tolerance percentage.\n"
+"			Use when there is difficulty obtaining calibration\n"
+"			with the default settings.\n"
+"	-w <desc|path>	Filename or a workload descriptor.\n"
+"			Can be given multiple times.\n"
+"	-r <n>		How many times to emit the workload.\n"
+"	-c <n>		Fork n clients emitting the workload simultaneously.\n"
+"	-x		Swap VCS1 and VCS2 engines in every other client.\n"
+"	-b <n>		Load balancing to use. (0: rr, 1: qd)\n"
+	);
+}
+
+/*
+ * Resolves a -w argument to a workload descriptor string.
+ *
+ * If @filename does not name a regular file it is taken to be the descriptor
+ * itself and is returned unchanged. Otherwise the file is read into a newly
+ * allocated, NUL terminated buffer, every newline is replaced by a comma and
+ * trailing commas are removed.
+ *
+ * Returns NULL if the file holds no records (it is empty or consists only of
+ * newlines). Asserts if the file is 1 MiB or larger or cannot be read.
+ */
+static char *load_workload_descriptor(char *filename)
+{
+	struct stat sbuf;
+	char *buf;
+	int infd, ret, i;
+	ssize_t len;
+
+	ret = stat(filename, &sbuf);
+	if (ret || !S_ISREG(sbuf.st_mode))
+		return filename;
+
+	igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
+	/* One extra byte so that the descriptor is always terminated. */
+	buf = malloc(sbuf.st_size + 1);
+	igt_assert(buf);
+
+	infd = open(filename, O_RDONLY);
+	igt_assert(infd >= 0);
+	len = read(infd, buf, sbuf.st_size);
+	igt_assert(len == sbuf.st_size);
+	close(infd);
+	buf[len] = 0;
+
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\n')
+			buf[i] = ',';
+	}
+
+	/* Drop trailing separators without running off the buffer start. */
+	while (len > 0 && buf[len - 1] == ',')
+		buf[--len] = 0;
+
+	if (!len) {
+		free(buf);
+		return NULL;
+	}
+
+	return buf;
+}
+
+/*
+ * Grows @w_args to hold @nr_args pointers, stores @w_arg in the last slot
+ * and returns the (possibly moved) array.
+ */
+static char **
+add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
+{
+	const size_t sz = sizeof(char *) * nr_args;
+
+	w_args = realloc(w_args, sz);
+	igt_assert(w_args);
+
+	w_args[nr_args - 1] = w_arg;
+	return w_args;
+}
+
+/*
+ * Entry point.
+ *
+ * Without a calibration value (-n) the tool only calibrates, prints the
+ * result and exits. Otherwise every -w argument is loaded and parsed, one
+ * workload instance is prepared per client, the clients are forked and run
+ * in parallel, and the aggregate throughput is printed.
+ *
+ * With several -w arguments there is one client per workload; with a single
+ * one the workload is cloned -c times.
+ *
+ * Returns 0 on success and 1 on invalid usage or an unparsable workload.
+ */
+int main(int argc, char **argv)
+{
+	unsigned int repeat = 1;
+	unsigned int clients = 1;
+	unsigned int flags = 0;
+	struct timespec t_start, t_end;
+	struct workload **w, **wrk = NULL;
+	unsigned int nr_w_args = 0;
+	char **w_args = NULL;
+	unsigned int tolerance_pct = 1;
+	enum intel_engine_id (*balance)(struct workload *, struct w_step *) = NULL;
+	double t;
+	int i, c;
+
+	fd = drm_open_driver(DRIVER_INTEL);
+
+	while ((c = getopt(argc, argv, "c:n:r:qxw:t:b:h")) != -1) {
+		switch (c) {
+		case 'w':
+			w_args = add_workload_arg(w_args, ++nr_w_args, optarg);
+			break;
+		case 'c':
+			clients = strtol(optarg, NULL, 0);
+			break;
+		case 't':
+			tolerance_pct = strtol(optarg, NULL, 0);
+			break;
+		case 'n':
+			nop_calibration = strtol(optarg, NULL, 0);
+			break;
+		case 'r':
+			repeat = strtol(optarg, NULL, 0);
+			break;
+		case 'q':
+			quiet = true;
+			break;
+		case 'x':
+			flags |= SWAPVCS;
+			break;
+		case 'b':
+			switch (strtol(optarg, NULL, 0)) {
+			case 0:
+				balance = rr_balance;
+				flags |= BALANCE;
+				break;
+			case 1:
+				/* Queue-depth balancing needs the seqno store. */
+				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+					   8);
+				balance = qd_balance;
+				flags |= SEQNO | BALANCE;
+				break;
+			default:
+				if (!quiet)
+					fprintf(stderr,
+						"Unknown balancing mode '%s'!\n",
+						optarg);
+				return 1;
+			}
+			break;
+		case 'h':
+			print_help();
+			return 0;
+		default:
+			return 1;
+		}
+	}
+
+	if (!nop_calibration) {
+		if (!quiet)
+			printf("Calibrating nop delay with %u%% tolerance...\n",
+				tolerance_pct);
+		nop_calibration = calibrate_nop(tolerance_pct);
+		if (!quiet)
+			printf("Nop calibration for %uus delay is %lu.\n",
+			       nop_calibration_us, nop_calibration);
+
+		return 0;
+	}
+
+	if (!nr_w_args) {
+		if (!quiet)
+			fprintf(stderr, "No workload descriptor(s)!\n");
+		return 1;
+	}
+
+	if (nr_w_args > 1 && clients > 1) {
+		if (!quiet)
+			fprintf(stderr,
+				"Cloned clients cannot be combined with multiple workloads!\n");
+		return 1;
+	}
+
+	wrk = calloc(nr_w_args, sizeof(*wrk));
+	igt_assert(wrk);
+
+	for (i = 0; i < nr_w_args; i++) {
+		w_args[i] = load_workload_descriptor(w_args[i]);
+		if (!w_args[i]) {
+			if (!quiet)
+				fprintf(stderr,
+					"Failed to load workload descriptor %u!\n",
+					i);
+			return 1;
+		}
+
+		wrk[i] = parse_workload(w_args[i]);
+		if (!wrk[i]) {
+			if (!quiet)
+				fprintf(stderr,
+					"Failed to parse workload %u!\n", i);
+			return 1;
+		}
+	}
+
+	/*
+	 * One client per workload. This must not depend on the verbosity,
+	 * otherwise -q would run only the first of several workloads.
+	 */
+	if (nr_w_args > 1)
+		clients = nr_w_args;
+
+	if (!quiet) {
+		printf("Using %lu nop calibration for %uus delay.\n",
+		       nop_calibration, nop_calibration_us);
+		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+		if (flags & SWAPVCS)
+			printf("Swapping VCS rings between clients.\n");
+	}
+
+	w = calloc(clients, sizeof(struct workload *));
+	igt_assert(w);
+
+	for (i = 0; i < clients; i++) {
+		unsigned int flags_ = flags;
+
+		w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]);
+
+		/* Only every other client swaps the VCS engines. */
+		if (flags & SWAPVCS && i & 1)
+			flags_ &= ~SWAPVCS;
+
+		prepare_workload(w[i], flags_);
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	igt_fork(child, clients)
+		run_workload(child, w[child], repeat, balance, flags);
+
+	igt_waitchildren();
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet)
+		printf("%.3fs elapsed (%.3f workloads/s)\n",
+		       t, clients * repeat / t);
+
+	for (i = 0; i < clients; i++)
+		fini_workload(w[i]);
+	free(w);
+	for (i = 0; i < nr_w_args; i++)
+		fini_workload(wrk[i]);
+	free(wrk);
+	free(w_args);
+
+	return 0;
+}
diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
new file mode 100644
index 000000000000..b55e620c61c2
--- /dev/null
+++ b/benchmarks/wsim/README
@@ -0,0 +1,54 @@
+Workload descriptor format
+==========================
+
+ctx.engine.duration_us.dependency.wait,...
+<uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...
+d|p.<uint>,... or s.<int < 0>,...
+
+For duration a range can be given from which a random value will be picked
+before every submit. Since this and seqno management requires CPU access to
+objects, care needs to be taken in order to ensure the submit queue is deep
+enough that these operations do not affect the execution speed, unless that
+is desired.
+
+Additional workload steps are also supported:
+
+ 'd' - Adds a delay (in microseconds).
+ 'p' - Adds a delay relative to the start of the current loop iteration so
+       that each loop starts execution with a given period.
+ 's' - Synchronises the pipeline to a batch relative to the step.
+
+Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+
+Example (leading spaces must not be present in the actual file):
+----------------------------------------------------------------
+
+  1.VCS1.3000.0.1
+  1.RCS.500-1000.-1.0
+  1.RCS.3700.0.0
+  1.RCS.1000.-2.0
+  1.VCS2.2300.-2.0
+  1.RCS.4700.-1.0
+  1.VCS2.600.-1.1
+  p.16000
+
+The above workload described in human language works like this:
+
+  1.   A batch is sent to the VCS1 engine which will be executing for 3ms on the
+       GPU and userspace will wait until it is finished before proceeding.
+  2-4. Now three batches are sent to RCS with durations of 0.5-1ms (random
+       duration range), 3.7ms and 1ms respectively. The first batch has a data
+       dependency on the preceding VCS1 batch, and the last of the group depends
+       on the first from the group.
+  5.   Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms
+       RCS batch.
+  6.   This is followed by a 4.7ms RCS batch with a data dependency on the 2.3ms
+       VCS2 batch.
+  7.   Then a 0.6ms VCS2 batch is sent depending on the previous RCS one. In the
+       same step the tool is told to wait until the batch completes before
+       proceeding.
+  8.   Finally the tool is told to wait long enough to ensure the next iteration
+       starts 16ms after the previous one has started.
+
+When workload descriptors are provided on the command line, commas must be used
+instead of new lines.
diff --git a/benchmarks/wsim/media_17i7.wsim b/benchmarks/wsim/media_17i7.wsim
new file mode 100644
index 000000000000..5f533d8e168b
--- /dev/null
+++ b/benchmarks/wsim/media_17i7.wsim
@@ -0,0 +1,7 @@
+1.VCS1.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS2.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS2.600.-1.1
diff --git a/benchmarks/wsim/media_load_balance_17i7.wsim b/benchmarks/wsim/media_load_balance_17i7.wsim
new file mode 100644
index 000000000000..25a692032eae
--- /dev/null
+++ b/benchmarks/wsim/media_load_balance_17i7.wsim
@@ -0,0 +1,7 @@
+1.VCS.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS.600.-1.1
diff --git a/benchmarks/wsim/vcs1.wsim b/benchmarks/wsim/vcs1.wsim
new file mode 100644
index 000000000000..e1986aadd65c
--- /dev/null
+++ b/benchmarks/wsim/vcs1.wsim
@@ -0,0 +1,25 @@
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
diff --git a/benchmarks/wsim/vcs_balanced.wsim b/benchmarks/wsim/vcs_balanced.wsim
new file mode 100644
index 000000000000..9a4b3d785db1
--- /dev/null
+++ b/benchmarks/wsim/vcs_balanced.wsim
@@ -0,0 +1,25 @@
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx