On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
> +static void
> +__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
> +{
> +	const uint32_t bbe = 0xa << 23;
> +	unsigned long bb_sz = get_bb_sz(&w->duration);
> +	unsigned long mmap_start, cmd_offset, mmap_len;
> +	uint32_t *ptr, *cs;
> +
> +	mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
> +	cmd_offset = bb_sz - mmap_len;
> +	mmap_start = rounddown(cmd_offset, PAGE_SIZE);
> +	mmap_len += cmd_offset - mmap_start;
> +
> +	gem_set_domain(fd, w->bb_handle,
> +		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +	cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
> +
> +	if (seqnos) {
> +		const int gen = intel_gen(intel_get_drm_devid(fd));
> +
> +		igt_assert(gen >= 8);
> +
> +		w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
> +		w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
> +
> +		*cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
> +		*cs++ = 0;
> +		*cs++ = 0;
> +		*cs++ = seqno;
> +	}
> +
> +	*cs = terminate ? bbe : 0;
> +
> +	munmap(ptr, mmap_len);
> +}
> +
> +static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
> +{
> +	__emit_bb_end(w, true, seqnos, seqno);
> +}
> +
> +static void unterminate_bb(struct w_step *w, bool seqnos)
> +{
> +	__emit_bb_end(w, false, seqnos, 0);
> +}
> +
> +static void
> +prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
> +{
> +	int max_ctx = -1;
> +	struct w_step *w;
> +	int i;
> +
> +	if (seqnos) {
> +		const unsigned int status_sz = sizeof(uint32_t);
> +
> +		for (i = 0; i < NUM_ENGINES; i++) {
> +			wrk->status_page_handle[i] = gem_create(fd, status_sz);

Need to set_cache_level(CACHED) for llc. You can use one page for all
engines. Just use a different cacheline for each, for safety.
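Something along these lines, perhaps (an untested sketch; the
status_page_offset[] field is made up for illustration, and I'm
assuming 64-byte cachelines):

	/* One cached page shared by all engines, a cacheline apiece. */
	wrk->status_page_handle = gem_create(fd, 4096);
	gem_set_caching(fd, wrk->status_page_handle, I915_CACHING_CACHED);

	wrk->status_page = gem_mmap__cpu(fd, wrk->status_page_handle,
					 0, 4096, PROT_READ);

	for (i = 0; i < NUM_ENGINES; i++)
		wrk->status_page_offset[i] = i * 64;

with the seqno for each engine then read from
wrk->status_page[wrk->status_page_offset[engine] / sizeof(uint32_t)]
instead of from a separate page per engine.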
> +			wrk->status_page[i] =
> +				gem_mmap__cpu(fd, wrk->status_page_handle[i],
> +					      0, status_sz, PROT_READ);
> +		}
> +	}
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		if ((int)w->context > max_ctx) {
> +			int delta = w->context + 1 - wrk->nr_ctxs;
> +
> +			wrk->nr_ctxs += delta;
> +			wrk->ctx_id = realloc(wrk->ctx_id,
> +					      wrk->nr_ctxs * sizeof(uint32_t));
> +			memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
> +			       delta * sizeof(uint32_t));
> +
> +			max_ctx = w->context;
> +		}
> +
> +		if (!wrk->ctx_id[w->context]) {
> +			struct drm_i915_gem_context_create arg = {};
> +
> +			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> +			igt_assert(arg.ctx_id);
> +
> +			wrk->ctx_id[w->context] = arg.ctx_id;
> +		}
> +	}
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		enum intel_engine_id engine = w->engine;
> +		unsigned int bb_i, j = 0;
> +
> +		if (w->type != BATCH)
> +			continue;
> +
> +		w->obj[j].handle = gem_create(fd, 4096);
> +		w->obj[j].flags = EXEC_OBJECT_WRITE;
> +		j++;
> +
> +		if (seqnos) {
> +			w->obj[j].handle = wrk->status_page_handle[engine];
> +			w->obj[j].flags = EXEC_OBJECT_WRITE;

The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.

> +			j++;
> +		}
> +
> +		bb_i = j++;
> +		w->duration.cur = w->duration.max;
> +		w->bb_sz = get_bb_sz(&w->duration);
> +		w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
> +		terminate_bb(w, seqnos, 0);
> +		if (seqnos) {
> +			w->reloc.presumed_offset = -1;
> +			w->reloc.target_handle = 1;
> +			w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> +			w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;

Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't
want to set write_domain here anyway.

> +		}
> +
> +		igt_assert(w->dependency <= 0);
> +		if (w->dependency) {
> +			int dep_idx = i + w->dependency;
> +
> +			igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> +			igt_assert(wrk->steps[dep_idx].type == BATCH);
> +
> +			w->obj[j].handle = w->obj[bb_i].handle;
> +			bb_i = j;
> +			w->obj[j - 1].handle =
> +				wrk->steps[dep_idx].obj[0].handle;
> +			j++;
> +		}
> +
> +		if (seqnos) {
> +			w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
> +			w->obj[bb_i].relocation_count = 1;
> +		}
> +
> +		w->eb.buffers_ptr = to_user_pointer(w->obj);
> +		w->eb.buffer_count = j;
> +		w->eb.rsvd1 = wrk->ctx_id[w->context];
> +
> +		if (swap_vcs && engine == VCS1)
> +			engine = VCS2;
> +		else if (swap_vcs && engine == VCS2)
> +			engine = VCS1;
> +		w->eb.flags = eb_engine_map[engine];
> +		w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +		if (!seqnos)
> +			w->eb.flags |= I915_EXEC_NO_RELOC;

Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good one to check (just to say don't do
that).

> +#ifdef DEBUG
> +		printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
> +		       i, w->eb.buffer_count, w->obj[0].handle,
> +		       w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
> +		       w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
> +		       w->context, wrk->ctx_id[w->context]);
> +#endif
> +	}
> +}
> +
> +static double elapsed(const struct timespec *start, const struct timespec *end)
> +{
> +	return (end->tv_sec - start->tv_sec) +
> +	       (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
> +static int elapsed_us(const struct timespec *start, const struct timespec *end)
> +{

return 1e6 * elapsed(); might as well use gcc for something!
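i.e.

	static int elapsed_us(const struct timespec *start,
			      const struct timespec *end)
	{
		return 1e6 * elapsed(start, end);
	}

and let the implicit conversion truncate back to int for you.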
> +	return (1e9 * (end->tv_sec - start->tv_sec) +
> +		(end->tv_nsec - start->tv_nsec)) / 1e3;
> +}
> +
> +static enum intel_engine_id
> +rr_balance(struct workload *wrk, struct w_step *w)
> +{
> +	unsigned int engine;
> +
> +	if (wrk->vcs_rr)
> +		engine = VCS2;
> +	else
> +		engine = VCS1;
> +
> +	wrk->vcs_rr ^= 1;
> +
> +	return engine;
> +}
> +
> +static enum intel_engine_id
> +qd_balance(struct workload *wrk, struct w_step *w)
> +{
> +	unsigned long qd[NUM_ENGINES];
> +	enum intel_engine_id engine = w->engine;
> +
> +	igt_assert(engine == VCS);
> +
> +	qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
> +	wrk->qd_sum[VCS1] += qd[VCS1];
> +
> +	qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
> +	wrk->qd_sum[VCS2] += qd[VCS2];
> +
> +	if (qd[VCS1] < qd[VCS2]) {
> +		engine = VCS1;
> +		wrk->vcs_rr = 0;
> +	} else if (qd[VCS2] < qd[VCS1]) {
> +		engine = VCS2;
> +		wrk->vcs_rr = 1;
> +	} else {
> +		unsigned int vcs = wrk->vcs_rr ^ 1;
> +
> +		wrk->vcs_rr = vcs;
> +
> +		if (vcs == 0)
> +			engine = VCS1;
> +		else
> +			engine = VCS2;
> +	}

Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!

> +//	printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
> +
> +	return engine;
> +}
> +
> +static void update_bb_seqno(struct w_step *w, uint32_t seqno)
> +{
> +	unsigned long mmap_start, mmap_offset, mmap_len;
> +	void *ptr;
> +
> +	mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
> +	mmap_offset = w->seqno_offset - mmap_start;
> +	mmap_len = sizeof(uint32_t) + mmap_offset;
> +
> +	gem_set_domain(fd, w->bb_handle,
> +		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +
> +	*(uint32_t *)((char *)ptr + mmap_offset) = seqno;

Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized to the gpu, so I wonder what this is for.

> +
> +	munmap(ptr, mmap_len);
> +}
> +
> +static void
> +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
> +	     enum intel_engine_id (*balance)(struct workload *wrk,
> +					     struct w_step *w), bool seqnos)
> +{
> +	struct timespec t_start, t_end;
> +	struct w_step *w;
> +	double t;
> +	int i, j;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> +	srand(t_start.tv_nsec);
> +
> +	for (j = 0; j < repeat; j++) {
> +		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +			enum intel_engine_id engine = w->engine;
> +			uint32_t seqno;
> +			bool seqno_updated = false;
> +			int do_sleep = 0;
> +
> +			if (i == 0)
> +				clock_gettime(CLOCK_MONOTONIC,
> +					      &wrk->repeat_start);
> +
> +			if (w->type == DELAY) {
> +				do_sleep = w->wait;
> +			} else if (w->type == PERIOD) {
> +				struct timespec now;
> +
> +				clock_gettime(CLOCK_MONOTONIC, &now);
> +				do_sleep = w->wait -
> +					   elapsed_us(&wrk->repeat_start, &now);
> +				if (do_sleep < 0) {
> +					if (!quiet) {
> +						printf("%u: Dropped period @ %u/%u (%dus late)!\n",
> +						       id, j, i, do_sleep);
> +						continue;
> +					}
> +				}
> +			} else if (w->type == SYNC) {
> +				unsigned int s_idx = i + w->wait;
> +
> +				igt_assert(i > 0 && i < wrk->nr_steps);
> +				igt_assert(wrk->steps[s_idx].type == BATCH);
> +				gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
> +				continue;
> +			}
> +
> +			if (do_sleep) {
> +				usleep(do_sleep);
> +				continue;
> +			}
> +
> +			wrk->nr_bb[engine]++;
> +
> +			if (engine == VCS && balance) {
> +				engine = balance(wrk, w);
> +				wrk->nr_bb[engine]++;
> +
> +				w->obj[1].handle = wrk->status_page_handle[engine];
> +
> +				w->eb.flags = eb_engine_map[engine];
> +				w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +			}
> +
> +			seqno = ++wrk->seqno[engine];
> +
> +			if (w->duration.min != w->duration.max) {
> +				unsigned int cur = get_duration(&w->duration);
> +
> +				if (cur != w->duration.cur) {
> +					unterminate_bb(w, seqnos);

Ah, you said this was for adjusting the runlength of the batches. I
suggest using batch_start_offset to change the number of nops rather
than rewriting the batch.

I need to study this a bit more...
-Chris
-- 
Chris Wilson, Intel Open Source Technology Centre
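P.S. To sketch the batch_start_offset idea (untested; set_bb_duration()
and get_bb_sz_for() are made-up names, and it assumes the batch is a
block of MI_NOOPs with the seqno write + MI_BATCH_BUFFER_END at the
very end, so starting later simply executes fewer nops):

	static void set_bb_duration(struct w_step *w, unsigned int cur)
	{
		unsigned long len = get_bb_sz_for(cur);

		/* execbuf wants batch_start_offset 8-byte aligned */
		w->eb.batch_start_offset = (w->bb_sz - len) & ~7UL;
	}

No mmap-and-rewrite per batch; the buffer contents never change after
terminate_bb().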