On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
> +static void
> +__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
> +{
> +	const uint32_t bbe = 0xa << 23;
> +	unsigned long bb_sz = get_bb_sz(&w->duration);
> +	unsigned long mmap_start, cmd_offset, mmap_len;
> +	uint32_t *ptr, *cs;
> +
> +	mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
> +	cmd_offset = bb_sz - mmap_len;
> +	mmap_start = rounddown(cmd_offset, PAGE_SIZE);
> +	mmap_len += cmd_offset - mmap_start;
> +
> +	gem_set_domain(fd, w->bb_handle,
> +		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +	cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
> +
> +	if (seqnos) {
> +		const int gen = intel_gen(intel_get_drm_devid(fd));
> +
> +		igt_assert(gen >= 8);
> +
> +		w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
> +		w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
> +
> +		*cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
> +		*cs++ = 0;
> +		*cs++ = 0;
> +		*cs++ = seqno;
> +	}
> +
> +	*cs = terminate ? bbe : 0;
> +
> +	munmap(ptr, mmap_len);
> +}
> +
> +static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
> +{
> +	__emit_bb_end(w, true, seqnos, seqno);
> +}
> +
> +static void unterminate_bb(struct w_step *w, bool seqnos)
> +{
> +	__emit_bb_end(w, false, seqnos, 0);
> +}
> +
> +static void
> +prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
> +{
> +	int max_ctx = -1;
> +	struct w_step *w;
> +	int i;
> +
> +	if (seqnos) {
> +		const unsigned int status_sz = sizeof(uint32_t);
> +
> +		for (i = 0; i < NUM_ENGINES; i++) {
> +			wrk->status_page_handle[i] = gem_create(fd, status_sz);

Need to set_cache_level(CACHED) for llc. You can use one page for all
engines. Just use a different cacheline for each, for safety.
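Something along these lines, perhaps (an untested sketch; the
status_page_offset[] field is made up for illustration, and I'm
assuming 64-byte cachelines):

	/* One cached page shared by all engines, a cacheline apiece. */
	wrk->status_page_handle = gem_create(fd, 4096);
	gem_set_caching(fd, wrk->status_page_handle, I915_CACHING_CACHED);

	wrk->status_page = gem_mmap__cpu(fd, wrk->status_page_handle,
					 0, 4096, PROT_READ);

	for (i = 0; i < NUM_ENGINES; i++)
		wrk->status_page_offset[i] = i * 64;

with the seqno for each engine then read from
wrk->status_page[wrk->status_page_offset[engine] / sizeof(uint32_t)]
instead of from a separate page per engine.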
> +			wrk->status_page[i] =
> +				gem_mmap__cpu(fd, wrk->status_page_handle[i],
> +					      0, status_sz, PROT_READ);
> +		}
> +	}
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		if ((int)w->context > max_ctx) {
> +			int delta = w->context + 1 - wrk->nr_ctxs;
> +
> +			wrk->nr_ctxs += delta;
> +			wrk->ctx_id = realloc(wrk->ctx_id,
> +					      wrk->nr_ctxs * sizeof(uint32_t));
> +			memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
> +			       delta * sizeof(uint32_t));
> +
> +			max_ctx = w->context;
> +		}
> +
> +		if (!wrk->ctx_id[w->context]) {
> +			struct drm_i915_gem_context_create arg = {};
> +
> +			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> +			igt_assert(arg.ctx_id);
> +
> +			wrk->ctx_id[w->context] = arg.ctx_id;
> +		}
> +	}
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		enum intel_engine_id engine = w->engine;
> +		unsigned int bb_i, j = 0;
> +
> +		if (w->type != BATCH)
> +			continue;
> +
> +		w->obj[j].handle = gem_create(fd, 4096);
> +		w->obj[j].flags = EXEC_OBJECT_WRITE;
> +		j++;
> +
> +		if (seqnos) {
> +			w->obj[j].handle = wrk->status_page_handle[engine];
> +			w->obj[j].flags = EXEC_OBJECT_WRITE;

The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.

> +			j++;
> +		}
> +
> +		bb_i = j++;
> +		w->duration.cur = w->duration.max;
> +		w->bb_sz = get_bb_sz(&w->duration);
> +		w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
> +		terminate_bb(w, seqnos, 0);
> +		if (seqnos) {
> +			w->reloc.presumed_offset = -1;
> +			w->reloc.target_handle = 1;
> +			w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> +			w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;

Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't
want to set write_domain here anyway.

> +		}
> +
> +		igt_assert(w->dependency <= 0);
> +		if (w->dependency) {
> +			int dep_idx = i + w->dependency;
> +
> +			igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> +			igt_assert(wrk->steps[dep_idx].type == BATCH);
> +
> +			w->obj[j].handle = w->obj[bb_i].handle;
> +			bb_i = j;
> +			w->obj[j - 1].handle =
> +				wrk->steps[dep_idx].obj[0].handle;
> +			j++;
> +		}
> +
> +		if (seqnos) {
> +			w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
> +			w->obj[bb_i].relocation_count = 1;
> +		}
> +
> +		w->eb.buffers_ptr = to_user_pointer(w->obj);
> +		w->eb.buffer_count = j;
> +		w->eb.rsvd1 = wrk->ctx_id[w->context];
> +
> +		if (swap_vcs && engine == VCS1)
> +			engine = VCS2;
> +		else if (swap_vcs && engine == VCS2)
> +			engine = VCS1;
> +		w->eb.flags = eb_engine_map[engine];
> +		w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +		if (!seqnos)
> +			w->eb.flags |= I915_EXEC_NO_RELOC;

Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good one to check (just to say don't do
that).

> +#ifdef DEBUG
> +		printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
> +		       i, w->eb.buffer_count, w->obj[0].handle,
> +		       w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
> +		       w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
> +		       w->context, wrk->ctx_id[w->context]);
> +#endif
> +	}
> +}
> +
> +static double elapsed(const struct timespec *start, const struct timespec *end)
> +{
> +	return (end->tv_sec - start->tv_sec) +
> +	       (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
> +static int elapsed_us(const struct timespec *start, const struct timespec *end)
> +{

return 1e6 * elapsed(); might as well use gcc for something!
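i.e.

	static int elapsed_us(const struct timespec *start,
			      const struct timespec *end)
	{
		return 1e6 * elapsed(start, end);
	}

and let the implicit conversion truncate back to int for you.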
> +	return (1e9 * (end->tv_sec - start->tv_sec) +
> +		(end->tv_nsec - start->tv_nsec)) / 1e3;
> +}
> +
> +static enum intel_engine_id
> +rr_balance(struct workload *wrk, struct w_step *w)
> +{
> +	unsigned int engine;
> +
> +	if (wrk->vcs_rr)
> +		engine = VCS2;
> +	else
> +		engine = VCS1;
> +
> +	wrk->vcs_rr ^= 1;
> +
> +	return engine;
> +}
> +
> +static enum intel_engine_id
> +qd_balance(struct workload *wrk, struct w_step *w)
> +{
> +	unsigned long qd[NUM_ENGINES];
> +	enum intel_engine_id engine = w->engine;
> +
> +	igt_assert(engine == VCS);
> +
> +	qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
> +	wrk->qd_sum[VCS1] += qd[VCS1];
> +
> +	qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
> +	wrk->qd_sum[VCS2] += qd[VCS2];
> +
> +	if (qd[VCS1] < qd[VCS2]) {
> +		engine = VCS1;
> +		wrk->vcs_rr = 0;
> +	} else if (qd[VCS2] < qd[VCS1]) {
> +		engine = VCS2;
> +		wrk->vcs_rr = 1;
> +	} else {
> +		unsigned int vcs = wrk->vcs_rr ^ 1;
> +
> +		wrk->vcs_rr = vcs;
> +
> +		if (vcs == 0)
> +			engine = VCS1;
> +		else
> +			engine = VCS2;
> +	}

Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!

> +//	printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
> +
> +	return engine;
> +}
> +
> +static void update_bb_seqno(struct w_step *w, uint32_t seqno)
> +{
> +	unsigned long mmap_start, mmap_offset, mmap_len;
> +	void *ptr;
> +
> +	mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
> +	mmap_offset = w->seqno_offset - mmap_start;
> +	mmap_len = sizeof(uint32_t) + mmap_offset;
> +
> +	gem_set_domain(fd, w->bb_handle,
> +		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> +	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +
> +	*(uint32_t *)((char *)ptr + mmap_offset) = seqno;

Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized to the gpu, so I wonder what this is for.

> +
> +	munmap(ptr, mmap_len);
> +}
> +
> +static void
> +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
> +	     enum intel_engine_id (*balance)(struct workload *wrk,
> +					     struct w_step *w), bool seqnos)
> +{
> +	struct timespec t_start, t_end;
> +	struct w_step *w;
> +	double t;
> +	int i, j;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> +	srand(t_start.tv_nsec);
> +
> +	for (j = 0; j < repeat; j++) {
> +		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +			enum intel_engine_id engine = w->engine;
> +			uint32_t seqno;
> +			bool seqno_updated = false;
> +			int do_sleep = 0;
> +
> +			if (i == 0)
> +				clock_gettime(CLOCK_MONOTONIC,
> +					      &wrk->repeat_start);
> +
> +			if (w->type == DELAY) {
> +				do_sleep = w->wait;
> +			} else if (w->type == PERIOD) {
> +				struct timespec now;
> +
> +				clock_gettime(CLOCK_MONOTONIC, &now);
> +				do_sleep = w->wait -
> +					   elapsed_us(&wrk->repeat_start, &now);
> +				if (do_sleep < 0) {
> +					if (!quiet) {
> +						printf("%u: Dropped period @ %u/%u (%dus late)!\n",
> +						       id, j, i, do_sleep);
> +						continue;
> +					}
> +				}
> +			} else if (w->type == SYNC) {
> +				unsigned int s_idx = i + w->wait;
> +
> +				igt_assert(i > 0 && i < wrk->nr_steps);
> +				igt_assert(wrk->steps[s_idx].type == BATCH);
> +				gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
> +				continue;
> +			}
> +
> +			if (do_sleep) {
> +				usleep(do_sleep);
> +				continue;
> +			}
> +
> +			wrk->nr_bb[engine]++;
> +
> +			if (engine == VCS && balance) {
> +				engine = balance(wrk, w);
> +				wrk->nr_bb[engine]++;
> +
> +				w->obj[1].handle = wrk->status_page_handle[engine];
> +
> +				w->eb.flags = eb_engine_map[engine];
> +				w->eb.flags |= I915_EXEC_HANDLE_LUT;
> +			}
> +
> +			seqno = ++wrk->seqno[engine];
> +
> +			if (w->duration.min != w->duration.max) {
> +				unsigned int cur = get_duration(&w->duration);
> +
> +				if (cur != w->duration.cur) {
> +					unterminate_bb(w, seqnos);

Ah, you said this was for adjusting the runlength of the batches. I
suggest using batch_start_offset to change the number of nops rather
than rewriting the batch.

I need to study this a bit more...
-Chris
-- 
Chris Wilson, Intel Open Source Technology Centre
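P.S. To sketch the batch_start_offset idea (untested; set_bb_duration()
and get_bb_sz_for() are made-up names, and it assumes the batch is a
block of MI_NOOPs with the seqno write + MI_BATCH_BUFFER_END at the
very end, so starting later simply executes fewer nops):

	static void set_bb_duration(struct w_step *w, unsigned int cur)
	{
		unsigned long len = get_bb_sz_for(cur);

		/* execbuf wants batch_start_offset 8-byte aligned */
		w->eb.batch_start_offset = (w->bb_sz - len) & ~7UL;
	}

No mmap-and-rewrite per batch; the buffer contents never change after
terminate_bb().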