From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

v3:
 * Multiple parallel different workloads (-w -w ...).
 * Multi-context workloads.
 * Variable (random) batch length.
 * Load balancing (round robin and queue depth estimation).
 * Workload delays and explicit sync steps.
 * Workload frequency (period) control.

TODO list:
 * Fence support.
 * Move majority of help text to README.
 * Better error handling.
 * Less 1980's workload parsing.
 * Proper workloads.
 * Explicit waits?
 * Threads?
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@xxxxxxxxx>
---
 benchmarks/Makefile.sources |    1 +
 benchmarks/gem_wsim.c       | 1177 +++++++++++++++++++++++++++++++++++++++++++
 benchmarks/wsim/workload1   |    7 +
 benchmarks/wsim/workload2   |    7 +
 benchmarks/wsim/workload3   |    7 +
 benchmarks/wsim/workload4   |    8 +
 benchmarks/wsim/workload5   |    8 +
 benchmarks/wsim/workload6   |    8 +
 8 files changed, 1223 insertions(+)
 create mode 100644 benchmarks/gem_wsim.c
 create mode 100644 benchmarks/wsim/workload1
 create mode 100644 benchmarks/wsim/workload2
 create mode 100644 benchmarks/wsim/workload3
 create mode 100644 benchmarks/wsim/workload4
 create mode 100644 benchmarks/wsim/workload5
 create mode 100644 benchmarks/wsim/workload6

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe36f2..3a941150abb3 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list = \
 	gem_prw \
 	gem_set_domain \
 	gem_syslatency \
+	gem_wsim \
 	kms_vblank \
 	prime_lookup \
 	vgem_mmap \
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 000000000000..38041da1f6e3
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,1177 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+#include "intel_chipset.h"
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+
+enum intel_engine_id {
+	RCS,
+	BCS,
+	VCS,
+	VCS1,
+	VCS2,
+	VECS,
+	NUM_ENGINES
+};
+
+/*
+ * Batch duration in microseconds. A random value from [min, max] is picked
+ * into cur before each submit when min != max.
+ */
+struct duration {
+	unsigned int min, max, cur;
+};
+
+enum w_type
+{
+	BATCH,
+	SYNC,
+	DELAY,
+	PERIOD
+};
+
+struct w_step
+{
+	/* Workload step metadata */
+	enum w_type type;
+	unsigned int context;
+	unsigned int engine;
+	struct duration duration;
+	int dependency;
+	int wait; /* BATCH: 0|1, DELAY/PERIOD: usecs, SYNC: relative step */
+
+	/* Implementation details */
+	struct drm_i915_gem_execbuffer2 eb;
+	struct drm_i915_gem_exec_object2 obj[4];
+	struct drm_i915_gem_relocation_entry reloc;
+	unsigned long bb_sz;
+	uint32_t bb_handle;
+	uint64_t seqno_offset;
+};
+
+struct workload
+{
+	unsigned int nr_steps;
+	struct w_step *steps;
+
+	struct timespec repeat_start;
+
+	unsigned int nr_ctxs;
+	uint32_t *ctx_id;
+
+	unsigned long seqno[NUM_ENGINES];
+	uint32_t status_page_handle[NUM_ENGINES];
+	uint32_t *status_page[NUM_ENGINES];
+	unsigned int vcs_rr;
+
+	unsigned long qd_sum[NUM_ENGINES];
+	unsigned long nr_bb[NUM_ENGINES];
+};
+
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+	[RCS] = I915_EXEC_RENDER,
+	[BCS] = I915_EXEC_BLT,
+	[VCS] = I915_EXEC_BSD,
+	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+	[VECS] = I915_EXEC_VEBOX
+};
+
+static const unsigned int nop_calibration_us = 1000;
+static unsigned long nop_calibration;
+
+static bool quiet;
+static int fd;
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+static const char *ring_str_map[NUM_ENGINES] = {
+	[RCS] = "RCS",
+	[BCS] = "BCS",
+	[VCS] = "VCS",
+	[VCS1] = "VCS1",
+	[VCS2] = "VCS2",
+	[VECS] = "VECS",
+};
+
+/* Reports a malformed workload step on stderr unless running quietly. */
+static void step_error(const char *what, unsigned int idx)
+{
+	if (!quiet)
+		fprintf(stderr, "Invalid %s at step %u!\n", what, idx);
+}
+
+/*
+ * Parses a base-10 integer which must span the whole of @str.
+ *
+ * Returns true and stores the value in @val on success, false when @str
+ * is NULL, empty, has trailing characters or does not fit an int.
+ */
+static bool parse_int(const char *str, int *val)
+{
+	char *end = NULL;
+	long tmp;
+
+	if (!str || !*str)
+		return false;
+
+	errno = 0;
+	tmp = strtol(str, &end, 10);
+	if (errno || end == str || *end || tmp < INT_MIN || tmp > INT_MAX)
+		return false;
+
+	*val = tmp;
+
+	return true;
+}
+
+/*
+ * Parses a batch duration, either a single value ("1000") or a range
+ * ("500-1500"), in microseconds.
+ *
+ * Zero durations and ranges with max < min are rejected since the batch
+ * buffer is sized for the maximum and must hold at least one command.
+ */
+static bool parse_duration(const char *str, struct duration *dur)
+{
+	unsigned long min, max;
+	char *end = NULL;
+
+	if (!str || *str < '0' || *str > '9')
+		return false;
+
+	errno = 0;
+	min = strtoul(str, &end, 10);
+	max = min;
+	if (*end == '-' && end[1] >= '0' && end[1] <= '9')
+		max = strtoul(end + 1, &end, 10);
+
+	if (errno || *end || !min || max < min || max > UINT_MAX)
+		return false;
+
+	dur->min = min;
+	dur->max = max;
+
+	return true;
+}
+
+/*
+ * Parses a single '.' separated step record in @token into @step.
+ *
+ * @token is modified in place. @idx is the index of the step within the
+ * workload and is only used for error reporting. Returns false, after
+ * reporting the problem, if the record is malformed.
+ */
+static bool parse_step(char *token, unsigned int idx, struct w_step *step)
+{
+	char *fields[5], *field, *fctx = NULL;
+	unsigned int nr = 0, i;
+	int tmp;
+
+	memset(step, 0, sizeof(*step));
+
+	for (field = strtok_r(token, ".", &fctx); field;
+	     field = strtok_r(NULL, ".", &fctx)) {
+		if (nr == ARRAY_SIZE(fields))
+			goto err_record;
+		fields[nr++] = field;
+	}
+
+	if (!nr)
+		goto err_record;
+
+	if (!strcasecmp(fields[0], "d") || !strcasecmp(fields[0], "p")) {
+		const bool delay = !strcasecmp(fields[0], "d");
+
+		if (nr != 2 || !parse_int(fields[1], &tmp) || tmp <= 0) {
+			step_error(delay ? "delay" : "period", idx);
+			return false;
+		}
+
+		step->type = delay ? DELAY : PERIOD;
+		step->wait = tmp;
+
+		return true;
+	}
+
+	if (!strcasecmp(fields[0], "s")) {
+		if (nr != 2 || !parse_int(fields[1], &tmp) || tmp >= 0) {
+			step_error("sync target", idx);
+			return false;
+		}
+
+		step->type = SYNC;
+		step->wait = tmp;
+
+		return true;
+	}
+
+	if (nr != ARRAY_SIZE(fields))
+		goto err_record;
+
+	if (!parse_int(fields[0], &tmp) || tmp < 0) {
+		step_error("ctx id", idx);
+		return false;
+	}
+	step->context = tmp;
+
+	for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
+		if (!strcasecmp(fields[1], ring_str_map[i]))
+			break;
+	}
+	if (i == ARRAY_SIZE(ring_str_map)) {
+		step_error("engine id", idx);
+		return false;
+	}
+	step->engine = i;
+
+	if (!parse_duration(fields[2], &step->duration)) {
+		step_error("duration", idx);
+		return false;
+	}
+
+	if (!parse_int(fields[3], &tmp) || tmp > 0) {
+		step_error("forward dependency", idx);
+		return false;
+	}
+	step->dependency = tmp;
+
+	if (!parse_int(fields[4], &tmp) || (tmp != 0 && tmp != 1)) {
+		step_error("wait boolean", idx);
+		return false;
+	}
+	step->wait = tmp;
+
+	step->type = BATCH;
+
+	return true;
+
+err_record:
+	step_error("record", idx);
+	return false;
+}
+
+/*
+ * Checks that the relative step reference @rel, made from the step at
+ * index @idx, points backwards at an already parsed batch step.
+ */
+static bool
+valid_target(const struct w_step *steps, unsigned int idx, int rel)
+{
+	if (rel >= 0 || -(long long)rel > (long long)idx)
+		return false;
+
+	return steps[(long long)idx + rel].type == BATCH;
+}
+
+/*
+ * Parses a ',' separated workload descriptor into a newly allocated
+ * workload holding only the step metadata.
+ *
+ * @_desc is left unmodified. Returns NULL, after reporting the problem,
+ * if the descriptor is empty or malformed. The result is to be released
+ * with fini_workload().
+ */
+static struct workload *parse_workload(char *_desc)
+{
+	struct workload *wrk;
+	struct w_step *steps = NULL;
+	unsigned int nr_steps = 0;
+	char *desc, *token, *tctx = NULL;
+
+	desc = strdup(_desc);
+	igt_assert(desc);
+
+	for (token = strtok_r(desc, ",", &tctx); token;
+	     token = strtok_r(NULL, ",", &tctx)) {
+		struct w_step step, *tmp;
+
+		if (!parse_step(token, nr_steps, &step))
+			goto err;
+
+		if (step.type == BATCH && step.dependency &&
+		    !valid_target(steps, nr_steps, step.dependency)) {
+			step_error("dependency target", nr_steps);
+			goto err;
+		}
+
+		if (step.type == SYNC &&
+		    !valid_target(steps, nr_steps, step.wait)) {
+			step_error("sync target", nr_steps);
+			goto err;
+		}
+
+		tmp = realloc(steps, sizeof(*steps) * (nr_steps + 1));
+		igt_assert(tmp);
+		steps = tmp;
+		steps[nr_steps++] = step;
+	}
+
+	if (!nr_steps) {
+		if (!quiet)
+			fprintf(stderr, "Empty workload!\n");
+		goto err;
+	}
+
+	wrk = calloc(1, sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = nr_steps;
+	wrk->steps = steps;
+
+	free(desc);
+
+	return wrk;
+
+err:
+	free(steps);
+	free(desc);
+
+	return NULL;
+}
+
+/*
+ * Returns a zero-initialised workload carrying a private copy of the
+ * step array of @_wrk.
+ */
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+	struct workload *wrk;
+
+	wrk = calloc(1, sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = _wrk->nr_steps;
+	wrk->steps = calloc(wrk->nr_steps, sizeof(*wrk->steps));
+	igt_assert(wrk->steps);
+
+	memcpy(wrk->steps, _wrk->steps, sizeof(*wrk->steps) * wrk->nr_steps);
+
+	return wrk;
+}
+
+#define rounddown(x, y) ((x) - ((x) % (y)))
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (4096)
+#endif
+
+/* Picks the duration of the next submission, random within [min, max]. */
+static unsigned int get_duration(struct duration *dur)
+{
+	if (dur->min == dur->max)
+		return dur->min;
+	else
+		return dur->min + rand() % (dur->max + 1 - dur->min);
+}
+
+/*
+ * Converts a duration in microseconds into the size in bytes of a batch
+ * of nops which takes that long to execute, using the nop calibration.
+ */
+static unsigned long __get_bb_sz(unsigned int duration)
+{
+	return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
+		     nop_calibration_us, sizeof(uint32_t));
+}
+
+static unsigned long get_bb_sz(struct duration *dur)
+{
+	return __get_bb_sz(dur->cur);
+}
+
+/*
+ * Writes the tail of the batch buffer of @w at the offset matching the
+ * current duration.
+ *
+ * With @terminate the tail is an optional seqno write to the status page
+ * (@seqnos) followed by the batch buffer end command, otherwise the same
+ * dwords are zeroed so the batch can be terminated elsewhere.
+ */
+static void
+__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
+{
+	const uint32_t bbe = 0xa << 23;
+	unsigned long bb_sz = get_bb_sz(&w->duration);
+	unsigned long mmap_start, cmd_offset, mmap_len;
+	uint32_t *ptr, *cs;
+
+	mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
+	/* Guard against batches too short to hold the tail commands. */
+	igt_assert(bb_sz >= mmap_len);
+	cmd_offset = bb_sz - mmap_len;
+	mmap_start = rounddown(cmd_offset, PAGE_SIZE);
+	mmap_len += cmd_offset - mmap_start;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+	cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
+
+	if (seqnos) {
+		const int gen = intel_gen(intel_get_drm_devid(fd));
+
+		igt_assert(gen >= 8);
+
+		/* Offsets of the address and value dwords of the store. */
+		w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
+		w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
+
+		*cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = seqno;
+	}
+
+	*cs = terminate ? bbe : 0;
+
+	munmap(ptr, mmap_len);
+}
+
+static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
+{
+	__emit_bb_end(w, true, seqnos, seqno);
+}
+
+static void unterminate_bb(struct w_step *w, bool seqnos)
+{
+	__emit_bb_end(w, false, seqnos, 0);
+}
+
+/*
+ * Creates the GEM contexts and buffer objects needed by @wrk and fills
+ * in the execbuf of every batch step.
+ *
+ * @swap_vcs exchanges VCS1 and VCS2, @seqnos adds a per-engine status
+ * page which each batch writes its sequence number to.
+ */
+static void
+prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
+{
+	int max_ctx = -1;
+	struct w_step *w;
+	unsigned int i;
+
+	if (seqnos) {
+		const unsigned int status_sz = sizeof(uint32_t);
+
+		for (i = 0; i < NUM_ENGINES; i++) {
+			wrk->status_page_handle[i] = gem_create(fd, status_sz);
+			wrk->status_page[i] =
+				gem_mmap__cpu(fd, wrk->status_page_handle[i],
+					      0, status_sz, PROT_READ);
+		}
+	}
+
+	/* One GEM context per context index referenced by a batch step. */
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		if (w->type != BATCH)
+			continue;
+
+		if ((int)w->context > max_ctx) {
+			const unsigned int nr_ctxs = w->context + 1;
+			uint32_t *ids;
+
+			ids = realloc(wrk->ctx_id, nr_ctxs * sizeof(*ids));
+			igt_assert(ids);
+			memset(&ids[wrk->nr_ctxs], 0,
+			       (nr_ctxs - wrk->nr_ctxs) * sizeof(*ids));
+
+			wrk->ctx_id = ids;
+			wrk->nr_ctxs = nr_ctxs;
+			max_ctx = w->context;
+		}
+
+		if (!wrk->ctx_id[w->context]) {
+			struct drm_i915_gem_context_create arg = {};
+
+			drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+			igt_assert(arg.ctx_id);
+
+			wrk->ctx_id[w->context] = arg.ctx_id;
+		}
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		enum intel_engine_id engine = w->engine;
+		unsigned int bb_i, j = 0;
+
+		if (w->type != BATCH)
+			continue;
+
+		/* Output buffer, which dependent batches list as an input. */
+		w->obj[j].handle = gem_create(fd, 4096);
+		w->obj[j].flags = EXEC_OBJECT_WRITE;
+		j++;
+
+		if (seqnos) {
+			w->obj[j].handle = wrk->status_page_handle[engine];
+			w->obj[j].flags = EXEC_OBJECT_WRITE;
+			j++;
+		}
+
+		/* Size the batch for the longest duration it can be given. */
+		bb_i = j++;
+		w->duration.cur = w->duration.max;
+		w->bb_sz = get_bb_sz(&w->duration);
+		w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
+		terminate_bb(w, seqnos, 0);
+		if (seqnos) {
+			w->reloc.presumed_offset = -1;
+			w->reloc.target_handle = 1;
+			w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+			w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;
+		}
+
+		igt_assert(w->dependency <= 0);
+		if (w->dependency) {
+			int dep_idx = (int)i + w->dependency;
+
+			igt_assert(dep_idx >= 0 && dep_idx < (int)wrk->nr_steps);
+			igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+			/* Keep the batch last, dependency goes before it. */
+			w->obj[j].handle = w->obj[bb_i].handle;
+			bb_i = j;
+			w->obj[j - 1].handle =
+				wrk->steps[dep_idx].obj[0].handle;
+			j++;
+		}
+
+		if (seqnos) {
+			w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
+			w->obj[bb_i].relocation_count = 1;
+		}
+
+		w->eb.buffers_ptr = to_user_pointer(w->obj);
+		w->eb.buffer_count = j;
+		w->eb.rsvd1 = wrk->ctx_id[w->context];
+
+		if (swap_vcs && engine == VCS1)
+			engine = VCS2;
+		else if (swap_vcs && engine == VCS2)
+			engine = VCS1;
+		w->eb.flags = eb_engine_map[engine];
+		w->eb.flags |= I915_EXEC_HANDLE_LUT;
+		if (!seqnos)
+			w->eb.flags |= I915_EXEC_NO_RELOC;
+#ifdef DEBUG
+		printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+		       i, w->eb.buffer_count, w->obj[0].handle,
+		       w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
+		       w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
+		       w->context, wrk->ctx_id[w->context]);
+#endif
+	}
+}
+
+/* Returns the time from @start to @end in seconds. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	return (end->tv_sec - start->tv_sec) +
+	       (end->tv_nsec - start->tv_nsec) / 1e9;
+}
+
+/* Returns the time from @start to @end in microseconds. */
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{
+	return (1e9 * (end->tv_sec - start->tv_sec) +
+	       (end->tv_nsec - start->tv_nsec)) / 1e3;
+}
+
+/* Round-robin balancer: alternates VCS batches between VCS1 and VCS2. */
+static enum intel_engine_id
+rr_balance(struct workload *wrk, struct w_step *w)
+{
+	unsigned int engine;
+
+	if (wrk->vcs_rr)
+		engine = VCS2;
+	else
+		engine = VCS1;
+
+	wrk->vcs_rr ^= 1;
+
+	return engine;
+}
+
+/*
+ * Queue-depth balancer: picks the VCS engine with fewer batches in
+ * flight, estimated as the difference between the last submitted seqno
+ * and the one last read from the engine's status page. Ties fall back
+ * to round-robin.
+ */
+static enum intel_engine_id
+qd_balance(struct workload *wrk, struct w_step *w)
+{
+	unsigned long qd[NUM_ENGINES];
+	enum intel_engine_id engine = w->engine;
+
+	igt_assert(engine == VCS);
+
+	qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
+	wrk->qd_sum[VCS1] += qd[VCS1];
+
+	qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
+	wrk->qd_sum[VCS2] += qd[VCS2];
+
+	if (qd[VCS1] < qd[VCS2]) {
+		engine = VCS1;
+		wrk->vcs_rr = 0;
+	} else if (qd[VCS2] < qd[VCS1]) {
+		engine = VCS2;
+		wrk->vcs_rr = 1;
+	} else {
+		unsigned int vcs = wrk->vcs_rr ^ 1;
+
+		wrk->vcs_rr = vcs;
+
+		if (vcs == 0)
+			engine = VCS1;
+		else
+			engine = VCS2;
+	}
+
+#ifdef DEBUG
+	printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n",
+	       qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
+#endif
+
+	return engine;
+}
+
+/* Patches @seqno into the store-dword value slot of the batch of @w. */
+static void update_bb_seqno(struct w_step *w, uint32_t seqno)
+{
+	unsigned long mmap_start, mmap_offset, mmap_len;
+	void *ptr;
+
+	mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
+	mmap_offset = w->seqno_offset - mmap_start;
+	mmap_len = sizeof(uint32_t) + mmap_offset;
+
+	gem_set_domain(fd, w->bb_handle,
+		       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
+
+	ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+
+	*(uint32_t *)((char *)ptr + mmap_offset) = seqno;
+
+	munmap(ptr, mmap_len);
+}
+
+/*
+ * Executes the steps of @wrk @repeat times and prints a summary.
+ *
+ * @id identifies the client in the output. @balance, when not NULL,
+ * selects the physical engine for every batch aimed at VCS and requires
+ * @seqnos.
+ */
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
+	     enum intel_engine_id (*balance)(struct workload *wrk,
+					     struct w_step *w), bool seqnos)
+{
+	struct timespec t_start, t_end;
+	struct w_step *w;
+	unsigned int i, j;
+	double t;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	srand(t_start.tv_nsec);
+
+	for (j = 0; j < repeat; j++) {
+		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+			enum intel_engine_id engine = w->engine;
+			bool seqno_updated = false;
+			uint32_t seqno;
+
+			if (i == 0)
+				clock_gettime(CLOCK_MONOTONIC,
+					      &wrk->repeat_start);
+
+			/* Non-batch steps never reach the submission below. */
+			if (w->type == DELAY) {
+				usleep(w->wait);
+				continue;
+			} else if (w->type == PERIOD) {
+				struct timespec now;
+				int do_sleep;
+
+				clock_gettime(CLOCK_MONOTONIC, &now);
+				do_sleep = w->wait -
+					   elapsed_us(&wrk->repeat_start, &now);
+				if (do_sleep > 0)
+					usleep(do_sleep);
+				else if (do_sleep < 0 && !quiet)
+					printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+					       id, j, i, -do_sleep);
+				continue;
+			} else if (w->type == SYNC) {
+				int s_idx = (int)i + w->wait;
+
+				igt_assert(s_idx >= 0 && s_idx < (int)i);
+				igt_assert(wrk->steps[s_idx].type == BATCH);
+				gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
+				continue;
+			}
+
+			wrk->nr_bb[engine]++;
+
+			if (engine == VCS && balance) {
+				engine = balance(wrk, w);
+				wrk->nr_bb[engine]++;
+
+				w->obj[1].handle = wrk->status_page_handle[engine];
+
+				w->eb.flags = eb_engine_map[engine];
+				w->eb.flags |= I915_EXEC_HANDLE_LUT;
+			}
+
+			seqno = ++wrk->seqno[engine];
+
+			if (w->duration.min != w->duration.max) {
+				unsigned int cur = get_duration(&w->duration);
+
+				if (cur != w->duration.cur) {
+					unterminate_bb(w, seqnos);
+					w->duration.cur = cur;
+					terminate_bb(w, seqnos, seqno);
+					seqno_updated = true;
+				}
+			}
+
+			if (seqnos && !seqno_updated)
+				update_bb_seqno(w, seqno);
+
+			gem_execbuf(fd, &w->eb);
+
+			if (w->wait)
+				gem_sync(fd, w->obj[0].handle);
+		}
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet && !balance)
+		printf("%u: %.3fs elapsed (%.3f workloads/s)\n", id, t, repeat / t);
+	if (!quiet && balance == rr_balance)
+		printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n",
+		       id, t, repeat / t,
+		       wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]);
+	if (!quiet && balance == qd_balance)
+		printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. Average queue depths %.3f, %.3f.\n",
+		       id, t, repeat / t,
+		       wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2],
+		       (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS],
+		       (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]);
+}
+
+/* Releases the memory of a parsed or cloned workload. */
+static void fini_workload(struct workload *wrk)
+{
+	free(wrk->ctx_id);
+	free(wrk->steps);
+	free(wrk);
+}
+
+/*
+ * Measures how many nops the GPU executes in nop_calibration_us.
+ *
+ * Keeps resizing and timing a batch of nops for at least five seconds
+ * and until two consecutive estimates differ by no more than
+ * @tolerance_pct percent.
+ */
+static unsigned long calibrate_nop(unsigned int tolerance_pct)
+{
+	const uint32_t bbe = 0xa << 23;
+	unsigned int loops = 17;
+	unsigned int usecs = nop_calibration_us;
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	long size, last_size;
+	struct timespec t_0, t_end;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+	size = 256 * 1024;
+	do {
+		struct timespec t_start;
+		unsigned int loop;
+
+		obj.handle = gem_create(fd, size);
+		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+		gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+
+		clock_gettime(CLOCK_MONOTONIC, &t_start);
+		for (loop = 0; loop < loops; loop++)
+			gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+		clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+		gem_close(fd, obj.handle);
+
+		last_size = size;
+		size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
+		size = ALIGN(size, sizeof(uint32_t));
+	} while (elapsed(&t_0, &t_end) < 5 ||
+		 labs(size - last_size) > (size * tolerance_pct / 100));
+
+	return size / sizeof(uint32_t);
+}
+
+/* Prints the command line and workload descriptor documentation. */
+static void print_help(void)
+{
+	puts(
+"Usage: gem_wsim [OPTIONS]\n"
+"\n"
+"Runs a simulated workload on the GPU.\n"
+"When run without arguments performs a GPU calibration, the result of which\n"
+"needs to be provided when running the simulation in subsequent invocations.\n"
+"\n"
+"Options:\n"
+"	-h		This text.\n"
+"	-q		Be quiet - do not output anything to stdout.\n"
+"	-n <n>		Nop calibration value.\n"
+"	-t <n>		Nop calibration tolerance percentage.\n"
+"			Use when there is a difficulty obtaining calibration\n"
+"			with the default settings.\n"
+"	-w <desc|path>	Filename or a workload descriptor.\n"
+"			Can be given multiple times.\n"
+"	-r <n>		How many times to emit the workload.\n"
+"	-c <n>		Fork n clients emitting the workload simultaneously.\n"
+"	-x		Swap VCS1 and VCS2 engines in every other client.\n"
+"	-s		Track batch sequence numbers.\n"
+"	-b <n>		Load balancing to use. (0: rr, 1: qd)\n"
+"\n"
+"Workload descriptor format:\n"
+"\n"
+"	ctx.engine.duration_us.dependency.wait,...\n"
+"	<uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...\n"
+"	d|p.<uint>,...\n"
+"	s.<int < 0>,...\n"
+"\n"
+"	For duration a range can be given from which a random value will be\n"
+"	picked before every submit. Since this and seqno management require\n"
+"	CPU access to objects, care needs to be taken in order to ensure the\n"
+"	submit queue is deep enough these operations do not affect the\n"
+"	execution speed unless that is desired.\n"
+"\n"
+"	Additional workload steps are also supported:\n"
+"	 * 'd' - adds a delay (in microseconds).\n"
+"	 * 'p' - adds a delay relative to the start of previous loop so that\n"
+"		 each loop starts execution with a given period.\n"
+"	 * 's' - synchronises the pipeline to a batch relative to the step.\n"
+"\n"
+"	Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS\n"
+"\n"
+"Example:\n"
+"	1.VCS1.3000.0.1\n"
+"	1.RCS.500-1000.-1.0\n"
+"	d.1000\n"
+"	1.RCS.3700.0.0\n"
+"	1.RCS.1000.-3.0\n"
+"	1.VCS2.2300.-2.0\n"
+"	1.RCS.4700.-1.0\n"
+"	1.VCS2.600.-1.1\n"
+"	p.16000\n"
+"\n"
+"The above workload described in human language works like this:\n"
+"A batch is sent to the VCS1 engine which will be executing for 3ms on the\n"
+"GPU and userspace will wait until it is finished before proceeding.\n"
+"Now three batches are sent to RCS with durations of 0.5-1ms (random), 3.7ms\n"
+"and 1ms respectively, with a 1ms delay after the first of them. The first\n"
+"batch has a data dependency on the preceding VCS1 batch, and the last of the\n"
+"group depends on the first from the group.\n"
+"Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms RCS\n"
+"batch, followed by a 4.7ms RCS batch with a data dependency on the 2.3ms\n"
+"VCS2 batch, and finally a 0.6ms VCS2 batch depending on the previous RCS one.\n"
+"The tool is then told to wait for the last one to complete before optionally\n"
+"starting the next iteration (-r), no sooner than 16ms after the start of the\n"
+"current one.\n"
+"\n"
+"When workload descriptors are provided on the command line, commas must be\n"
+"used instead of newlines.\n"
+	);
+}
+
+/*
+ * Resolves a -w argument to a workload descriptor.
+ *
+ * If @filename names a regular file its contents are returned in a newly
+ * allocated, NUL terminated buffer with newlines converted to commas and
+ * trailing separators removed, to be freed by the caller. Otherwise
+ * @filename itself is returned and treated as the descriptor.
+ */
+static char *load_workload_descriptor(char *filename)
+{
+	struct stat sbuf;
+	char *buf;
+	int infd, ret;
+	ssize_t len, i;
+
+	ret = stat(filename, &sbuf);
+	if (ret || !S_ISREG(sbuf.st_mode))
+		return filename;
+
+	igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
+	buf = malloc(sbuf.st_size + 1);
+	igt_assert(buf);
+
+	infd = open(filename, O_RDONLY);
+	igt_assert(infd >= 0);
+	len = read(infd, buf, sbuf.st_size);
+	igt_assert(len == sbuf.st_size);
+	close(infd);
+
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\n')
+			buf[i] = ',';
+	}
+
+	/* Terminate the string and drop any trailing separators. */
+	buf[len] = 0;
+	while (len > 0 && buf[len - 1] == ',')
+		buf[--len] = 0;
+
+	return buf;
+}
+
+/* Grows @w_args to @nr_args entries and stores @w_arg in the last one. */
+static char **
+add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
+{
+	w_args = realloc(w_args, sizeof(char *) * nr_args);
+	igt_assert(w_args);
+	w_args[nr_args - 1] = w_arg;
+
+	return w_args;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned int repeat = 1;
+	unsigned int clients = 1;
+	bool seqnos = false;
+	bool swap_vcs = false;
+	struct timespec t_start, t_end;
+	struct workload **w, **wrk = NULL;
+	unsigned int nr_w_args = 0;
+	char **w_args = NULL;
+	unsigned int tolerance_pct = 1;
+	enum intel_engine_id (*balance)(struct workload *, struct w_step *) = NULL;
+	unsigned int i;
+	double t;
+	int c;
+
+	while ((c = getopt(argc, argv, "c:n:r:qxw:t:sb:h")) != -1) {
+		switch (c) {
+		case 'w':
+			w_args = add_workload_arg(w_args, ++nr_w_args, optarg);
+			break;
+		case 'c':
+			clients = strtol(optarg, NULL, 0);
+			break;
+		case 't':
+			tolerance_pct = strtol(optarg, NULL, 0);
+			break;
+		case 'n':
+			nop_calibration = strtol(optarg, NULL, 0);
+			break;
+		case 'r':
+			repeat = strtol(optarg, NULL, 0);
+			break;
+		case 'q':
+			quiet = true;
+			break;
+		case 'x':
+			swap_vcs = true;
+			break;
+		case 's':
+			seqnos = true;
+			break;
+		case 'b':
+			switch (strtol(optarg, NULL, 0)) {
+			case 0:
+				balance = rr_balance;
+				break;
+			case 1:
+				balance = qd_balance;
+				break;
+			default:
+				if (!quiet)
+					fprintf(stderr,
+						"Unknown balancing mode '%s'!\n",
+						optarg);
+				return 1;
+			}
+			break;
+		case 'h':
+			print_help();
+			return 0;
+		default:
+			return 1;
+		}
+	}
+
+	/* Open the device after option parsing so -h works without one. */
+	fd = drm_open_driver(DRIVER_INTEL);
+
+	if (!nop_calibration) {
+		if (!quiet)
+			printf("Calibrating nop delay with %u%% tolerance...\n",
+			       tolerance_pct);
+		nop_calibration = calibrate_nop(tolerance_pct);
+		if (!quiet)
+			printf("Nop calibration for %uus delay is %lu.\n",
+			       nop_calibration_us, nop_calibration);
+
+		return 0;
+	}
+
+	if (!nr_w_args) {
+		if (!quiet)
+			fprintf(stderr, "No workload descriptor(s)!\n");
+		return 1;
+	}
+
+	if (!clients) {
+		if (!quiet)
+			fprintf(stderr, "Invalid number of clients!\n");
+		return 1;
+	}
+
+	if (nr_w_args > 1 && clients > 1) {
+		if (!quiet)
+			fprintf(stderr,
+				"Cloned clients cannot be combined with multiple workloads!\n");
+		return 1;
+	}
+
+	if (balance && !seqnos) {
+		if (!quiet)
+			fprintf(stderr, "Seqnos are required for load-balancing!\n");
+		return 1;
+	}
+
+	wrk = calloc(nr_w_args, sizeof(*wrk));
+	igt_assert(wrk);
+
+	for (i = 0; i < nr_w_args; i++) {
+		char *desc = load_workload_descriptor(w_args[i]);
+
+		wrk[i] = parse_workload(desc);
+		/* Descriptors loaded from a file live in their own buffer. */
+		if (desc != w_args[i])
+			free(desc);
+		if (!wrk[i]) {
+			if (!quiet)
+				fprintf(stderr,
+					"Failed to parse workload %u!\n", i);
+			return 1;
+		}
+	}
+
+	/* Each workload given runs in its own client, quiet or not. */
+	if (nr_w_args > 1)
+		clients = nr_w_args;
+
+	if (!quiet) {
+		printf("Using %lu nop calibration for %uus delay.\n",
+		       nop_calibration, nop_calibration_us);
+		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+		if (swap_vcs)
+			printf("Swapping VCS rings between clients.\n");
+	}
+
+	w = calloc(clients, sizeof(struct workload *));
+	igt_assert(w);
+
+	for (i = 0; i < clients; i++) {
+		w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]);
+		prepare_workload(w[i], swap_vcs && (i & 1), seqnos);
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	igt_fork(child, clients)
+		run_workload(child, w[child], repeat, balance, seqnos);
+
+	igt_waitchildren();
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet)
+		printf("%.3fs elapsed (%.3f workloads/s)\n",
+		       t, clients * repeat / t);
+
+	for (i = 0; i < clients; i++)
+		fini_workload(w[i]);
+	free(w);
+	for (i = 0; i < nr_w_args; i++)
+		fini_workload(wrk[i]);
+	free(wrk);
+	free(w_args);
+
+	return 0;
+}
diff --git a/benchmarks/wsim/workload1 b/benchmarks/wsim/workload1
new file mode 100644
index 000000000000..5f533d8e168b
--- /dev/null
+++ b/benchmarks/wsim/workload1
@@ -0,0 +1,7 @@
+1.VCS1.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS2.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS2.600.-1.1
diff --git a/benchmarks/wsim/workload2 b/benchmarks/wsim/workload2
new file mode 100644
index 000000000000..25a692032eae
--- /dev/null
+++ b/benchmarks/wsim/workload2
@@ -0,0 +1,7 @@
+1.VCS.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS.600.-1.1
diff --git a/benchmarks/wsim/workload3 b/benchmarks/wsim/workload3
new file mode 100644
index 000000000000..bc9f6df52775
--- /dev/null
+++ b/benchmarks/wsim/workload3
@@ -0,0 +1,7 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+0.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
diff --git a/benchmarks/wsim/workload4 b/benchmarks/wsim/workload4
new file mode 100644
index 000000000000..3e4720a6949c
--- /dev/null
+++ b/benchmarks/wsim/workload4
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+d.1000
+0.RCS.3700.0.0
+1.RCS.1000.-3.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
diff --git a/benchmarks/wsim/workload5 b/benchmarks/wsim/workload5
new file mode 100644
index 000000000000..65440a8264ef
--- /dev/null
+++ b/benchmarks/wsim/workload5
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+0.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
+p.16000
diff --git a/benchmarks/wsim/workload6 b/benchmarks/wsim/workload6
new file mode 100644
index 000000000000..d5b7141dfdd0
--- /dev/null
+++ b/benchmarks/wsim/workload6
@@ -0,0 +1,8 @@
+1.VCS.3000.0.0
+1.RCS.500-1500.-1.0
+s.-1
+0.RCS.3700.0.0
+1.RCS.1000.-3.0
+1.VCS.2300.-2.0
+2.RCS.4700.-1.0
+1.VCS.600.-1.0
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx