On 05/04/2017 17:48, Chris Wilson wrote:
On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
+/*
+ * Rewrite the tail of @w's batch buffer in place.
+ *
+ * Last dwords of the batch (gen8+ layout when @seqnos is set):
+ *   [-5] MI_STORE_DWORD_IMM opcode, or 0 (no-op) when disarmed
+ *   [-4] store address low  -> patched via w->reloc (reloc.offset)
+ *   [-3] store address high
+ *   [-2] @seqno immediate   -> recorded in w->seqno_offset so
+ *        update_bb_seqno() can re-patch it per iteration
+ *   [-1] MI_BATCH_BUFFER_END when @terminate, else 0 (no-op)
+ * Without @seqnos only the final dword is touched.
+ */
+static void
+__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
+{
+ const uint32_t bbe = 0xa << 23;
+ unsigned long bb_sz = get_bb_sz(&w->duration);
+ unsigned long mmap_start, cmd_offset, mmap_len;
+ uint32_t *ptr, *cs;
+
+ /* CPU-map only from the page containing the first dword we touch. */
+ mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
+ cmd_offset = bb_sz - mmap_len;
+ mmap_start = rounddown(cmd_offset, PAGE_SIZE);
+ mmap_len += cmd_offset - mmap_start;
+
+ gem_set_domain(fd, w->bb_handle,
+ I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+ ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+ cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
+
+ if (seqnos) {
+ const int gen = intel_gen(intel_get_drm_devid(fd));
+
+ /* MI_STORE_DWORD_IMM is emitted in its gen8+ 4-dword form only. */
+ igt_assert(gen >= 8);
+
+ w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
+ w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
+
+ *cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
+ *cs++ = 0;
+ *cs++ = 0;
+ *cs++ = seqno;
+ }
+
+ *cs = terminate ? bbe : 0;
+
+ munmap(ptr, mmap_len);
+}
+
+/*
+ * Arm the batch tail: final dword becomes MI_BATCH_BUFFER_END and, when
+ * @seqnos is set, the preceding MI_STORE_DWORD_IMM of @seqno is enabled.
+ */
+static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
+{
+ __emit_bb_end(w, true, seqnos, seqno);
+}
+
+/*
+ * Disarm the batch tail: the BBE (and the seqno store, if enabled) are
+ * overwritten with no-ops so the tail can be re-terminated elsewhere.
+ */
+static void unterminate_bb(struct w_step *w, bool seqnos)
+{
+ __emit_bb_end(w, false, seqnos, 0);
+}
+
+/*
+ * One-time GPU state setup for a workload: per-engine status pages for
+ * seqno tracking (when @seqnos), a context for every w->context index
+ * referenced by the steps, and for each BATCH step its objects, batch
+ * buffer and pre-filled execbuf parameters.
+ */
+static void
+prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
+{
+ int max_ctx = -1;
+ struct w_step *w;
+ int i;
+
+ if (seqnos) {
+ const unsigned int status_sz = sizeof(uint32_t);
+
+ /* One tiny BO per engine; mapped read-only for queue-depth polling. */
+ for (i = 0; i < NUM_ENGINES; i++) {
+ wrk->status_page_handle[i] = gem_create(fd, status_sz);
Need to set_cache_level(CACHED) for llc.
You can use one page for all engines. Just use a different cacheline
for each, for safety.
+ wrk->status_page[i] =
+ gem_mmap__cpu(fd, wrk->status_page_handle[i],
+ 0, status_sz, PROT_READ);
+ }
+ }
+
+ /* First pass: create a context for every context index referenced. */
+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+ /* Grow ctx_id[] to cover the highest index seen, zeroing the new tail. */
+ if ((int)w->context > max_ctx) {
+ int delta = w->context + 1 - wrk->nr_ctxs;
+
+ wrk->nr_ctxs += delta;
+ wrk->ctx_id = realloc(wrk->ctx_id,
+ wrk->nr_ctxs * sizeof(uint32_t));
+ memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
+ delta * sizeof(uint32_t));
+
+ max_ctx = w->context;
+ }
+
+ if (!wrk->ctx_id[w->context]) {
+ struct drm_i915_gem_context_create arg = {};
+
+ drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+ igt_assert(arg.ctx_id);
+
+ wrk->ctx_id[w->context] = arg.ctx_id;
+ }
+ }
+
+ /* Second pass: per-BATCH-step objects, batch buffer and execbuf setup. */
+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+ enum intel_engine_id engine = w->engine;
+ unsigned int bb_i, j = 0;
+
+ if (w->type != BATCH)
+ continue;
+
+ /* obj[0]: scratch BO, declared written so later steps can depend on it. */
+ w->obj[j].handle = gem_create(fd, 4096);
+ w->obj[j].flags = EXEC_OBJECT_WRITE;
+ j++;
+
+ /* obj[1]: this engine's status page, target of the seqno store. */
+ if (seqnos) {
+ w->obj[j].handle = wrk->status_page_handle[engine];
+ w->obj[j].flags = EXEC_OBJECT_WRITE;
The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.
Yeah thats why I have per-engine objects. Which I don't mind since it is
not like they are wasting any resources compared to everything else. But
not admitting the write sounds still interesting. What would the
repercussions of that be - limit us to llc platforms or something?
+ j++;
+ }
+
+ /* Batch buffer takes the next slot, sized for the maximum duration. */
+ bb_i = j++;
+ w->duration.cur = w->duration.max;
+ w->bb_sz = get_bb_sz(&w->duration);
+ w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
+ terminate_bb(w, seqnos, 0);
+ if (seqnos) {
+ /* Seqno store address: with HANDLE_LUT, target 1 is the status page. */
+ w->reloc.presumed_offset = -1;
+ w->reloc.target_handle = 1;
+ w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+ w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;
Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't want
to set write_domain here anyway.
I think I copy-pasted this from another IGT. So you say cheat here as
well and set zero for both domains?
+ }
+
+ /*
+ * Dependencies are backward references only.  Shuffle the object
+ * list so the dependency's scratch BO sits before the batch and
+ * the batch buffer moves to the (new) last slot.
+ */
+ igt_assert(w->dependency <= 0);
+ if (w->dependency) {
+ int dep_idx = i + w->dependency;
+
+ igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+ igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+ w->obj[j].handle = w->obj[bb_i].handle;
+ bb_i = j;
+ w->obj[j - 1].handle =
+ wrk->steps[dep_idx].obj[0].handle;
+ j++;
+ }
+
+ if (seqnos) {
+ w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
+ w->obj[bb_i].relocation_count = 1;
+ }
+
+ w->eb.buffers_ptr = to_user_pointer(w->obj);
+ w->eb.buffer_count = j;
+ w->eb.rsvd1 = wrk->ctx_id[w->context];
+
+ /* Optional static VCS1<->VCS2 swap (no runtime balancing). */
+ if (swap_vcs && engine == VCS1)
+ engine = VCS2;
+ else if (swap_vcs && engine == VCS2)
+ engine = VCS1;
+ w->eb.flags = eb_engine_map[engine];
+ w->eb.flags |= I915_EXEC_HANDLE_LUT;
+ if (!seqnos)
+ w->eb.flags |= I915_EXEC_NO_RELOC;
Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good one to check (just to say don't do
that)
I am not following here? You are saying don't do relocations at all? How
do I make sure things stay fixed and even how to find out where they are
in the first pass?
+#ifdef DEBUG
+ printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+ i, w->eb.buffer_count, w->obj[0].handle,
+ w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
+ w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
+ w->context, wrk->ctx_id[w->context]);
+#endif
+ }
+}
+
+/* Seconds between two CLOCK_MONOTONIC timestamps, as a double. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+ return (end->tv_sec - start->tv_sec) +
+ (end->tv_nsec - start->tv_nsec) / 1e9;
+}
+
+/* Microseconds between two CLOCK_MONOTONIC timestamps (can be negative). */
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{
return 1e6 * elapsed(); might as well use gcc for something!
+ return (1e9 * (end->tv_sec - start->tv_sec) +
+ (end->tv_nsec - start->tv_nsec)) / 1e3;
+}
+
+/*
+ * Plain round-robin balancer: alternate between VCS1 and VCS2 on every
+ * call, toggling wrk->vcs_rr.  @w is unused by this policy.
+ */
+static enum intel_engine_id
+rr_balance(struct workload *wrk, struct w_step *w)
+{
+ unsigned int engine;
+
+ if (wrk->vcs_rr)
+ engine = VCS2;
+ else
+ engine = VCS1;
+
+ wrk->vcs_rr ^= 1;
+
+ return engine;
+}
+
+/*
+ * Queue-depth balancer: estimate each VCS ring's backlog as the gap
+ * between the last seqno submitted to it (wrk->seqno[]) and the last
+ * seqno the ring wrote out to its status page, then pick the shallower
+ * ring.  Equal depths fall back to round-robin via wrk->vcs_rr.
+ *
+ * NOTE(review): the status page is read without any synchronisation
+ * against the GPU, so the depth estimate is inherently racy.
+ */
+static enum intel_engine_id
+qd_balance(struct workload *wrk, struct w_step *w)
+{
+ unsigned long qd[NUM_ENGINES];
+ enum intel_engine_id engine = w->engine;
+
+ /* Only steps targeting the virtual VCS engine get balanced. */
+ igt_assert(engine == VCS);
+
+ qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
+ wrk->qd_sum[VCS1] += qd[VCS1];
+
+ qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
+ wrk->qd_sum[VCS2] += qd[VCS2];
+
+ if (qd[VCS1] < qd[VCS2]) {
+ engine = VCS1;
+ wrk->vcs_rr = 0;
+ } else if (qd[VCS2] < qd[VCS1]) {
+ engine = VCS2;
+ wrk->vcs_rr = 1;
+ } else {
+ unsigned int vcs = wrk->vcs_rr ^ 1;
+
+ wrk->vcs_rr = vcs;
+
+ if (vcs == 0)
+ engine = VCS1;
+ else
+ engine = VCS2;
+ }
Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!
Definitely not as exciting. :) Perhaps there will be other benefits from
this tool than the load balancing, but we'll see.
+// printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
+
+ return engine;
+}
+
+/*
+ * Re-patch the immediate dword of the batch's MI_STORE_DWORD_IMM
+ * (located at w->seqno_offset, set up by __emit_bb_end) with a fresh
+ * @seqno.  Only the page containing that dword is CPU-mapped.
+ */
+static void update_bb_seqno(struct w_step *w, uint32_t seqno)
+{
+ unsigned long mmap_start, mmap_offset, mmap_len;
+ void *ptr;
+
+ mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
+ mmap_offset = w->seqno_offset - mmap_start;
+ mmap_len = sizeof(uint32_t) + mmap_offset;
+
+ gem_set_domain(fd, w->bb_handle,
+ I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+ ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+
+ *(uint32_t *)((char *)ptr + mmap_offset) = seqno;
Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized to the gpu so I wonder what this is for.
To update the seqno inside the store_dword_imm. It is called every time
before a batch is executed so I was thinking whether a gem_sync should
be preceding it. But then I was thinking it is problematic in general if
we queue up multiple same batches before they get executed. :( Sounds
like I would need a separate batch for every iteration for this to work
correctly. But that sounds too costly. So I don't know at the moment.
+
+ munmap(ptr, mmap_len);
+}
+
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
+ enum intel_engine_id (*balance)(struct workload *wrk,
+ struct w_step *w), bool seqnos)
+{
+ struct timespec t_start, t_end;
+ struct w_step *w;
+ double t;
+ int i, j;
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+ srand(t_start.tv_nsec);
+
+ for (j = 0; j < repeat; j++) {
+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+ enum intel_engine_id engine = w->engine;
+ uint32_t seqno;
+ bool seqno_updated = false;
+ int do_sleep = 0;
+
+ if (i == 0)
+ clock_gettime(CLOCK_MONOTONIC,
+ &wrk->repeat_start);
+
+ if (w->type == DELAY) {
+ do_sleep = w->wait;
+ } else if (w->type == PERIOD) {
+ struct timespec now;
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ do_sleep = w->wait -
+ elapsed_us(&wrk->repeat_start, &now);
+ if (do_sleep < 0) {
+ if (!quiet) {
+ printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+ id, j, i, do_sleep);
+ continue;
+ }
+ }
+ } else if (w->type == SYNC) {
+ unsigned int s_idx = i + w->wait;
+
+ igt_assert(i > 0 && i < wrk->nr_steps);
+ igt_assert(wrk->steps[s_idx].type == BATCH);
+ gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
+ continue;
+ }
+
+ if (do_sleep) {
+ usleep(do_sleep);
+ continue;
+ }
+
+ wrk->nr_bb[engine]++;
+
+ if (engine == VCS && balance) {
+ engine = balance(wrk, w);
+ wrk->nr_bb[engine]++;
+
+ w->obj[1].handle = wrk->status_page_handle[engine];
+
+ w->eb.flags = eb_engine_map[engine];
+ w->eb.flags |= I915_EXEC_HANDLE_LUT;
+ }
+
+ seqno = ++wrk->seqno[engine];
+
+ if (w->duration.min != w->duration.max) {
+ unsigned int cur = get_duration(&w->duration);
+
+ if (cur != w->duration.cur) {
+ unterminate_bb(w, seqnos);
Ah, you said this was for adjusting runlength of the batches. I suggest
using batch_start_offset to change the number of nops rather than
rewrite the batch.
Yeah thanks for that suggestion, I did not think of that.
I need to study this a bit more...
Yes please, especially the bit about how to get accurate seqnos written
out in each step without needing separate execbuf batches.
I've heard recursive batches mentioned in the past so maybe each
iteration could have its own small batch which would jump to the
nop/delay one (shared between all iterations) and write the unique
seqno. No idea if that is possible/supported at the moment - I'll go and
dig a bit.
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx