Re: [PATCH i-g-t v3] benchmarks/gem_wsim: Command submission workload simulator

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 05/04/2017 17:48, Chris Wilson wrote:
On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
+static void
+__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
+{
+	const uint32_t bbe = 0xa << 23;
+	unsigned long bb_sz = get_bb_sz(&w->duration);
+	unsigned long mmap_start, cmd_offset, mmap_len;
+	uint32_t *ptr, *cs;
+
+	mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
+	cmd_offset = bb_sz - mmap_len;
+	mmap_start = rounddown(cmd_offset, PAGE_SIZE);
+	mmap_len += cmd_offset - mmap_start;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+	cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
+
+	if (seqnos) {
+		const int gen = intel_gen(intel_get_drm_devid(fd));
+
+		igt_assert(gen >= 8);
+
+		w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
+		w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
+
+		*cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = seqno;
+	}
+
+	*cs = terminate ? bbe : 0;
+
+	munmap(ptr, mmap_len);
+}
+
+static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
+{
+	__emit_bb_end(w, true, seqnos, seqno);
+}
+
+static void unterminate_bb(struct w_step *w, bool seqnos)
+{
+	__emit_bb_end(w, false, seqnos, 0);
+}
+
+static void
+prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
+{
+	int max_ctx = -1;
+	struct w_step *w;
+	int i;
+
+	if (seqnos) {
+		const unsigned int status_sz = sizeof(uint32_t);
+
+		for (i = 0; i < NUM_ENGINES; i++) {
+			wrk->status_page_handle[i] = gem_create(fd, status_sz);

Need to set_cache_level(CACHED) for llc.

You can use one page for all engines. Just use a different cacheline
for each, for safety.

+			wrk->status_page[i] =
+				gem_mmap__cpu(fd, wrk->status_page_handle[i],
+					      0, status_sz, PROT_READ);
+		}
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		if ((int)w->context > max_ctx) {
+			int delta = w->context + 1 - wrk->nr_ctxs;
+
+			wrk->nr_ctxs += delta;
+			wrk->ctx_id = realloc(wrk->ctx_id,
+					      wrk->nr_ctxs * sizeof(uint32_t));
+			memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
+			       delta * sizeof(uint32_t));
+
+			max_ctx = w->context;
+		}
+
+		if (!wrk->ctx_id[w->context]) {
+			struct drm_i915_gem_context_create arg = {};
+
+			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+			igt_assert(arg.ctx_id);
+
+			wrk->ctx_id[w->context] = arg.ctx_id;
+		}
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		enum intel_engine_id engine = w->engine;
+		unsigned int bb_i, j = 0;
+
+		if (w->type != BATCH)
+			continue;
+
+		w->obj[j].handle = gem_create(fd, 4096);
+		w->obj[j].flags = EXEC_OBJECT_WRITE;
+		j++;
+
+		if (seqnos) {
+			w->obj[j].handle = wrk->status_page_handle[engine];
+			w->obj[j].flags = EXEC_OBJECT_WRITE;

The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.

Yeah thats why I have per-engine objects. Which I don't mind since it is not like they are wasting any resources compared to everything else. But not admitting the write sounds still interesting. What would the repercussions of that be - limit us to llc platforms or something?

+			j++;
+		}
+
+		bb_i = j++;
+		w->duration.cur = w->duration.max;
+		w->bb_sz = get_bb_sz(&w->duration);
+		w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
+		terminate_bb(w, seqnos, 0);
+		if (seqnos) {
+			w->reloc.presumed_offset = -1;
+			w->reloc.target_handle = 1;
+			w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+			w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;

Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't want
to set write_domain here anyway.

I think I copy-pasted this from another IGT. So you say cheat here as well and set zero for both domains?


+		}
+
+		igt_assert(w->dependency <= 0);
+		if (w->dependency) {
+			int dep_idx = i + w->dependency;
+
+			igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+			igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+			w->obj[j].handle = w->obj[bb_i].handle;
+			bb_i = j;
+			w->obj[j - 1].handle =
+					wrk->steps[dep_idx].obj[0].handle;
+			j++;
+		}
+
+		if (seqnos) {
+			w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
+			w->obj[bb_i].relocation_count = 1;
+		}
+
+		w->eb.buffers_ptr = to_user_pointer(w->obj);
+		w->eb.buffer_count = j;
+		w->eb.rsvd1 = wrk->ctx_id[w->context];
+
+		if (swap_vcs && engine == VCS1)
+			engine = VCS2;
+		else if (swap_vcs && engine == VCS2)
+			engine = VCS1;
+		w->eb.flags = eb_engine_map[engine];
+		w->eb.flags |= I915_EXEC_HANDLE_LUT;
+		if (!seqnos)
+			w->eb.flags |= I915_EXEC_NO_RELOC;

Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good one to check (just to say don't do
that)

I am not following here? You are saying don't do relocations at all? How do I make sure things stay fixed and even how to find out where they are in the first pass?

+#ifdef DEBUG
+		printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+		       i, w->eb.buffer_count, w->obj[0].handle,
+		       w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
+		       w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
+		       w->context, wrk->ctx_id[w->context]);
+#endif
+	}
+}
+
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	return (end->tv_sec - start->tv_sec) +
+	       (end->tv_nsec - start->tv_nsec) / 1e9;
+}
+
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{

return 1e6 * elapsed(); might as well use gcc for something!

+	return (1e9 * (end->tv_sec - start->tv_sec) +
+	       (end->tv_nsec - start->tv_nsec)) / 1e3;
+}
+
+static enum intel_engine_id
+rr_balance(struct workload *wrk, struct w_step *w)
+{
+	unsigned int engine;
+
+	if (wrk->vcs_rr)
+		engine = VCS2;
+	else
+		engine = VCS1;
+
+	wrk->vcs_rr ^= 1;
+
+	return engine;
+}
+
+static enum intel_engine_id
+qd_balance(struct workload *wrk, struct w_step *w)
+{
+	unsigned long qd[NUM_ENGINES];
+	enum intel_engine_id engine = w->engine;
+
+	igt_assert(engine == VCS);
+
+	qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
+	wrk->qd_sum[VCS1] += qd[VCS1];
+
+	qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
+	wrk->qd_sum[VCS2] += qd[VCS2];
+
+	if (qd[VCS1] < qd[VCS2]) {
+		engine = VCS1;
+		wrk->vcs_rr = 0;
+	} else if (qd[VCS2] < qd[VCS1]) {
+		engine = VCS2;
+		wrk->vcs_rr = 1;
+	} else {
+		unsigned int vcs = wrk->vcs_rr ^ 1;
+
+		wrk->vcs_rr = vcs;
+
+		if (vcs == 0)
+			engine = VCS1;
+		else
+			engine = VCS2;
+	}

Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!

Definitely not as exciting. :) Perhaps there will be other benefits from this tool than the load balancing, but we'll see.

+// printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
+
+	return engine;
+}
+
+static void update_bb_seqno(struct w_step *w, uint32_t seqno)
+{
+	unsigned long mmap_start, mmap_offset, mmap_len;
+	void *ptr;
+
+	mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
+	mmap_offset = w->seqno_offset - mmap_start;
+	mmap_len = sizeof(uint32_t) + mmap_offset;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+
+	*(uint32_t *)((char *)ptr + mmap_offset) = seqno;

Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized to the gpu so I wonder what this is for.

To update the seqno inside the store_dword_imm. It is called every time before a batch is executed so I was thinking whether a gem_sync should be preceding it. But then I was thinking it is problematic in general if we queue up multiple same batches before they get executed. :( Sounds like I would need a separate batch for every iteration for this to work correctly. But that sounds too costly. So I don't know at the moment.

+
+	munmap(ptr, mmap_len);
+}
+
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
+	     enum intel_engine_id (*balance)(struct workload *wrk,
+					     struct w_step *w), bool seqnos)
+{
+	struct timespec t_start, t_end;
+	struct w_step *w;
+	double t;
+	int i, j;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	srand(t_start.tv_nsec);
+
+	for (j = 0; j < repeat; j++) {
+		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+			enum intel_engine_id engine = w->engine;
+			uint32_t seqno;
+			bool seqno_updated = false;
+			int do_sleep = 0;
+
+			if (i == 0)
+				clock_gettime(CLOCK_MONOTONIC,
+					      &wrk->repeat_start);
+
+			if (w->type == DELAY) {
+				do_sleep = w->wait;
+			} else if (w->type == PERIOD) {
+				struct timespec now;
+
+				clock_gettime(CLOCK_MONOTONIC, &now);
+				do_sleep = w->wait -
+					   elapsed_us(&wrk->repeat_start, &now);
+				if (do_sleep < 0) {
+					if (!quiet) {
+						printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+						       id, j, i, do_sleep);
+						continue;
+					}
+				}
+			} else if (w->type == SYNC) {
+				unsigned int s_idx = i + w->wait;
+
+				igt_assert(i > 0 && i < wrk->nr_steps);
+				igt_assert(wrk->steps[s_idx].type == BATCH);
+				gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
+				continue;
+			}
+
+			if (do_sleep) {
+				usleep(do_sleep);
+				continue;
+			}
+
+			wrk->nr_bb[engine]++;
+
+			if (engine == VCS && balance) {
+				engine = balance(wrk, w);
+				wrk->nr_bb[engine]++;
+
+				w->obj[1].handle = wrk->status_page_handle[engine];
+
+				w->eb.flags = eb_engine_map[engine];
+				w->eb.flags |= I915_EXEC_HANDLE_LUT;
+			}
+
+			seqno = ++wrk->seqno[engine];
+
+			if (w->duration.min != w->duration.max) {
+				unsigned int cur = get_duration(&w->duration);
+
+				if (cur != w->duration.cur) {
+					unterminate_bb(w, seqnos);

Ah, you said this was for adjusting runlength of the batches. I suggest
using batch_start_offset to change the number of nops rather than
rewrite the batch.

Yeah thanks for that suggestion, I did not think of that.

I need to study this a bit more...

Yes please, especially the bit about how to get accurate seqnos written out in each step without needing separate execbuf batches.

I've heard recursive batches mentioned in the past so maybe each iteration could have it's own small batch which would jump to the nop/delay one (shared between all iterations) and write the unique seqno. No idea if that is possible/supported at the moment - I'll go and dig a bit.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux