From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Tool which emits batch buffers to engines with configurable sequences, durations, contexts, dependencies and userspace waits. Unfinished but shows promise so sending out for early feedback. v2: * Load workload descriptors from files. (also -w) * Help text. * Calibration control if needed. (-t) * NORELOC | LUT to eb flags. * Added sample workload to wsim/workload1. TODO list: * Better error handling. * Multi-context support for individual clients. * Random/variable batch length. * Load balancing plug-in. * ... ? Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@xxxxxxxxx> gem_wsim updates Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- benchmarks/Makefile.sources | 1 + benchmarks/gem_wsim.c | 593 ++++++++++++++++++++++++++++++++++++++++++++ benchmarks/wsim/workload1 | 7 + 3 files changed, 601 insertions(+) create mode 100644 benchmarks/gem_wsim.c create mode 100644 benchmarks/wsim/workload1 diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources index 3af54ebe36f2..3a941150abb3 100644 --- a/benchmarks/Makefile.sources +++ b/benchmarks/Makefile.sources @@ -14,6 +14,7 @@ benchmarks_prog_list = \ gem_prw \ gem_set_domain \ gem_syslatency \ + gem_wsim \ kms_vblank \ prime_lookup \ vgem_mmap \ diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c new file mode 100644 index 000000000000..029967281251 --- /dev/null +++ b/benchmarks/gem_wsim.c @@ -0,0 +1,593 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <inttypes.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <time.h> +#include <assert.h> + +#include "drm.h" +#include "ioctl_wrappers.h" +#include "drmtest.h" +#include "intel_io.h" + +struct w_step +{ + /* Workload step metadata */ + unsigned int context; + unsigned int engine; + unsigned int duration; + int dependency; + int wait; + + /* Implementation details */ + struct drm_i915_gem_execbuffer2 eb; + struct drm_i915_gem_exec_object2 obj[3]; +}; + +struct workload +{ + unsigned int nr_steps; + struct w_step *steps; + + uint32_t ctx_id; +}; + +enum intel_engine_id { + RCS, + BCS, + balance_VCS, + VCS, + VCS1, + VCS2, + VECS, + NUM_ENGINES +}; + +static const unsigned int eb_engine_map[NUM_ENGINES] = { + [RCS] = I915_EXEC_RENDER, + [BCS] = I915_EXEC_BLT, + [balance_VCS] = I915_EXEC_BSD, + [VCS] = I915_EXEC_BSD, + [VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1, + [VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2, + [VECS] = I915_EXEC_VEBOX }; + +static const uint32_t bbe = 0xa << 23; +static const unsigned int nop_calibration_us = 1000; 
+static unsigned long nop_calibration; + +static bool quiet; +static int fd; + +/* + * Workload descriptor: + * + * ctx.engine.duration_us.dependency.wait,... + * <uint>.<str>.<uint>.<int <= 0>.<0|1>,... + * + * Engine ids: RCS, BCS, balance_VCS, VCS, VCS1, VCS2, VECS + * + * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1" + */ + +static struct workload *parse_workload(char *desc) +{ + struct workload *wrk; + unsigned int nr_steps = 0; + char *token, *tctx, *tstart = desc; + char *field, *fctx, *fstart; + struct w_step step, *steps = NULL; + unsigned int valid; + int tmp; + + while ((token = strtok_r(tstart, ",", &tctx)) != NULL) { + tstart = NULL; + fstart = token; + valid = 0; + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp != 1) { + if (!quiet) + fprintf(stderr, + "Invalid ctx id at step %u!\n", + nr_steps); + return NULL; + } + step.context = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + if (!strcasecmp(field, "RCS")) { + step.engine = RCS; + valid++; + } else if (!strcasecmp(field, "BCS")) { + step.engine = BCS; + valid++; + } else if (!strcasecmp(field, "balance_VCS")) { + step.engine = balance_VCS; + valid++; + } else if (!strcasecmp(field, "VCS")) { + step.engine = VCS; + valid++; + } else if (!strcasecmp(field, "VCS1")) { + step.engine = VCS1; + valid++; + } else if (!strcasecmp(field, "VCS2")) { + step.engine = VCS2; + valid++; + } else if (!strcasecmp(field, "VECS")) { + step.engine = VECS; + valid++; + } else { + if (!quiet) + fprintf(stderr, + "Invalid engine id at step %u!\n", + nr_steps); + return NULL; + } + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp <= 0) { + if (!quiet) + fprintf(stderr, + "Invalid duration at step %u!\n", + nr_steps); + return NULL; + } + step.duration = tmp; + + valid++; + } + + if ((field = 
strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp > 0) { + if (!quiet) + fprintf(stderr, + "Invalid forward dependency at step %u!\n", + nr_steps); + return NULL; + } + step.dependency = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp != 0 && tmp != 1) { + if (!quiet) + fprintf(stderr, + "Invalid wait boolean at step %u!\n", + nr_steps); + return NULL; + } + step.wait = tmp; + + valid++; + } + + if (valid != 5) { + if (!quiet) + fprintf(stderr, "Invalid record at step %u!\n", + nr_steps); + return NULL; + } + + nr_steps++; + steps = realloc(steps, sizeof(step) * nr_steps); + igt_assert(steps); + + memcpy(&steps[nr_steps - 1], &step, sizeof(step)); + } + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + + wrk->nr_steps = nr_steps; + wrk->steps = steps; + + return wrk; +} + +static struct workload * +clone_workload(struct workload *_wrk) +{ + struct workload *wrk; + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + + wrk->nr_steps = _wrk->nr_steps; + wrk->steps = malloc(sizeof(struct w_step) * wrk->nr_steps); + igt_assert(wrk->steps); + + memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps); + + return wrk; +} + +static void prepare_workload(struct workload *wrk, bool swap_vcs) +{ + struct drm_i915_gem_context_create arg = {}; + struct w_step *w; + int i; + + drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg); + wrk->ctx_id = arg.ctx_id; + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + memset(&w->eb, 0, sizeof(w->eb)); + memset(&w->obj, 0, sizeof(w->obj)); + } + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + unsigned long sz; + enum intel_engine_id engine = w->engine; + + sz = ALIGN(w->duration * nop_calibration * sizeof(uint32_t) / + nop_calibration_us, sizeof(uint32_t)); + + igt_assert(w->context == 1); /* TODO */ + + w->obj[0].handle = gem_create(fd, 4096); + w->obj[0].flags = 
EXEC_OBJECT_WRITE; + + w->obj[1].handle = gem_create(fd, sz); + gem_write(fd, w->obj[1].handle, sz - sizeof(bbe), &bbe, + sizeof(bbe)); + + w->eb.buffer_count = 2; + w->eb.buffers_ptr = to_user_pointer(w->obj); + if (swap_vcs && engine == VCS1) + engine = VCS2; + else if (swap_vcs && engine == VCS2) + engine = VCS1; + w->eb.flags = eb_engine_map[engine]; + w->eb.flags |= I915_EXEC_NO_RELOC; + w->eb.flags |= I915_EXEC_HANDLE_LUT; + w->eb.rsvd1 = wrk->ctx_id; + + igt_assert(w->dependency <= 0); + if (w->dependency) { + int dep_idx = i + w->dependency; + + igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps); + + w->obj[2].handle = w->obj[1].handle; + w->obj[1].handle = wrk->steps[dep_idx].obj[0].handle; + w->eb.buffer_count = 3; + } + +#ifdef DEBUG + printf("%u: %u:%x|%x|%x %10lu flags=%llx\n", + i, w->eb.buffer_count, + w->obj[0].handle, w->obj[1].handle, w->obj[2].handle, + sz, w->eb.flags); +#endif + } +} + +static double elapsed(const struct timespec *start, const struct timespec *end) +{ + return (end->tv_sec - start->tv_sec) + + (end->tv_nsec - start->tv_nsec) / 1e9; +} + +static void +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat) +{ + struct timespec t_start, t_end; + struct w_step *w; + double t; + int i, j; + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + for (j = 0; j < repeat; j++) { + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + gem_execbuf(fd, &w->eb); + if (w->wait) + gem_sync(fd, w->obj[0].handle); + } + } + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet) + printf("%u: %fs elapsed (%f workloads/s)\n", id, t, repeat / t); +} + +static void fini_workload(struct workload *wrk) +{ + free(wrk->steps); + free(wrk); +} + +static unsigned long calibrate_nop(unsigned int tolerance_pct) +{ + unsigned int loops = 17; + unsigned int usecs = nop_calibration_us; + struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_execbuffer2 eb = + { .buffer_count = 1, .buffers_ptr = 
(uintptr_t)&obj}; + long size, last_size; + struct timespec t_0, t_end; + + clock_gettime(CLOCK_MONOTONIC, &t_0); + + size = 256 * 1024; + do { + struct timespec t_start; + + obj.handle = gem_create(fd, size); + gem_write(fd, obj.handle, size - sizeof(bbe), &bbe, + sizeof(bbe)); + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + + clock_gettime(CLOCK_MONOTONIC, &t_start); + for (int loop = 0; loop < loops; loop++) + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + gem_close(fd, obj.handle); + + last_size = size; + size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs; + size = ALIGN(size, sizeof(uint32_t)); + } while (elapsed(&t_0, &t_end) < 5 || + abs(size - last_size) > (size * tolerance_pct / 100)); + + return size / sizeof(uint32_t); +} + +static void print_help(void) +{ + puts( +"Usage: gem_wsim [OPTIONS]\n" +"\n" +"Runs a simulated workload on the GPU.\n" +"When ran without arguments performs a GPU calibration result of which needs\n" +"to be provided when running the simulation in subsequent invocations.\n" +"\n" +"Options:\n" +" -h This text.\n" +" -q Be quiet - do not output anything to stdout.\n" +" -n <n> Nop calibration value.\n" +" -t <n> Nop calibration tolerance percentage.\n" +" Use when there is a difficuly obtaining calibration\n" +" with the default settings.\n" +" -w <desc|path> Filename or a workload descriptor.\n" +" -r <n> How many times to emit the workload.\n" +" -c <n> Fork n clients emitting the workload simultaneously.\n" +" -x Swap VCS1 and VCS2 engines in every other client.\n" +"\n" +"Workload descriptor format:\n" +"\n" +" ctx.engine.duration_us.dependency.wait,...\n" +" <uint>.<str>.<uint>.<int <= 0>.<0|1>,...\n" +"\n" +" Engine ids: RCS, BCS, balance_VCS, VCS, VCS1, VCS2, VECS\n" +"\n" +"Example:\n" +" 1.VCS1.3000.0.1\n" +" 1.RCS.1000.-1.0\n" +" 1.RCS.3700.0.0\n" +" 1.RCS.1000.-2.0\n" +" 1.VCS2.2300.-2.0\n" +" 1.RCS.4700.-1.0\n" +" 1.VCS2.600.-1.1\n" +"\n" +"The above 
workload described in human language works like this:\n" +"A batch is sent to the VCS1 engine which will be executing for 3ms on the\n" +"GPU and userspace will wait until it is finished before proceeding.\n" +"Now three batches are sent to RCS with durations of 1ms, 3.7ms and 1ms\n" +"respectively. The first batch has a data dependency on the preceding VCS1\n" +"batch, and the last of the group depends on the first from the group.\n" +"Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms RCS\n" +"batch, followed by a 4.7ms RCS batch with a data dependency on the 2.3ms\n" +"VCS2 batch, and finally a 0.6ms VCS2 batch depending on the previous RCS one.\n" +"The tool is then told to wait for the last one to complete before optionally\n" +"starting the next iteration (-r).\n" +"\n" +"When workload descriptors are provided on the command line, commas must be\n" +"used instead of newlines.\n" + ); +} + +static char *load_workload_descriptor(char *filename) +{ + struct stat sbuf; + char *buf; + int infd, ret, i; + ssize_t len; + + ret = stat(filename, &sbuf); + if (ret || !S_ISREG(sbuf.st_mode)) + return filename; + + igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. 
*/ + buf = malloc(sbuf.st_size); + igt_assert(buf); + + infd = open(filename, O_RDONLY); + igt_assert(infd >= 0); + len = read(infd, buf, sbuf.st_size); + igt_assert(len == sbuf.st_size); + close(infd); + + for (i = 0; i < len; i++) { + if (buf[i] == '\n') + buf[i] = ','; + } + + return buf; +} + +int main(int argc, char **argv) +{ + unsigned int repeat = 1; + unsigned int clients = 1; + bool swap_vcs = false; + struct timespec t_start, t_end; + struct workload **w, *wrk; + char *w_str = NULL; + unsigned int tolerance_pct = 1; + double t; + int i, c; + + fd = drm_open_driver(DRIVER_INTEL); + + while ((c = getopt(argc, argv, "c:n:r:qxw:t:h")) != -1) { + switch (c) { + case 'w': + w_str = optarg; + break; + case 'c': + clients = strtol(optarg, NULL, 0); + break; + case 't': + tolerance_pct = strtol(optarg, NULL, 0); + break; + case 'n': + nop_calibration = strtol(optarg, NULL, 0); + break; + case 'r': + repeat = strtol(optarg, NULL, 0); + break; + case 'q': + quiet = true; + break; + case 'x': + swap_vcs = true; + break; + case 'h': + print_help(); + return 0; + default: + return 1; + } + } + + if (!nop_calibration) { + if (!quiet) + printf("Calibrating nop delay with %u%% tolerance...\n", + tolerance_pct); + nop_calibration = calibrate_nop(tolerance_pct); + if (!quiet) + printf("Nop calibration for %uus delay is %lu.\n", + nop_calibration_us, nop_calibration); + + return 0; + } else { + if (!w_str) { + if (!quiet) + fprintf(stderr, + "Workload descriptor missing!\n"); + return 1; + } + + w_str = load_workload_descriptor(w_str); + if (!w_str) { + if (!quiet) + fprintf(stderr, + "Failed to load workload descriptor!\n"); + return 1; + } + + wrk = parse_workload(w_str); + if (!wrk) { + if (!quiet) + fprintf(stderr, "Failed to parse workload!\n"); + return 1; + } + } + + if (!quiet) { + printf("Using %lu nop calibration for %uus delay.\n", + nop_calibration, nop_calibration_us); + printf("%u client%s.\n", clients, clients > 1 ? 
"s" : ""); + if (swap_vcs) + printf("Swapping VCS rings between clients.\n"); + } + + w = malloc(sizeof(struct workload *) * clients); + igt_assert(w); + + for (i = 0; i < clients; i++) { + w[i] = clone_workload(wrk); + prepare_workload(w[i], swap_vcs && (i & 1)); + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + igt_fork(child, clients) + run_workload(child, w[child], repeat); + + igt_waitchildren(); + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet) + printf("%fs elapsed (%f workloads/s)\n", + t, clients * repeat / t); + + for (i = 0; i < clients; i++) + fini_workload(w[i]); + + free(w); + fini_workload(wrk); + + return 0; +} diff --git a/benchmarks/wsim/workload1 b/benchmarks/wsim/workload1 new file mode 100644 index 000000000000..5f533d8e168b --- /dev/null +++ b/benchmarks/wsim/workload1 @@ -0,0 +1,7 @@ +1.VCS1.3000.0.1 +1.RCS.1000.-1.0 +1.RCS.3700.0.0 +1.RCS.1000.-2.0 +1.VCS2.2300.-2.0 +1.RCS.4700.-1.0 +1.VCS2.600.-1.1 -- 2.9.3 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx