From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Tool which emits batch buffers to engines with configurable sequences, durations, contexts, dependencies and userspace waits. Unfinished but shows promise so sending out for early feedback. v2: * Load workload descriptors from files. (also -w) * Help text. * Calibration control if needed. (-t) * NORELOC | LUT to eb flags. * Added sample workload to wsim/workload1. v3: * Multiple parallel different workloads (-w -w ...). * Multi-context workloads. * Variable (random) batch length. * Load balancing (round robin and queue depth estimation). * Workloads delays and explicit sync steps. * Workload frequency (period) control. v4: * Fixed queue-depth estimation by creating separate batches per engine when qd load balancing is on. * Dropped separate -s cmd line option. It can turn itself on automatically when needed. * Keep a single status page and lie about the write hazard as suggested by Chris. * Use batch_start_offset for controlling the batch duration. (Chris) * Set status page object cache level. (Chris) * Moved workload description to a README. * Tidied example workloads. * Some other cleanups and refactorings. TODO list: * Fence support. * Better error handling. * Less 1980's workload parsing. * Proper workloads. * Threads? * ... ? Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@xxxxxxxxx> --- Comparing some test workloads under load balancing it seems that it is starting to work, but it still needs more thorough verification. For example, round- robin balancing: # benchmarks/gem_wsim -n 585341 \ -w benchmarks/wsim/vcs1.wsim \ -w benchmarks/wsim/vcs_balanced.wsim \ -r 100 -b 0 Using 585341 nop calibration for 1000us delay. 2 clients. 1: 3.008s elapsed (33.243 workloads/s). 2500 (1250 + 1250) total VCS batches. 0: 4.455s elapsed (22.449 workloads/s). 0 (2500 + 0) total VCS batches. 
4.455s elapsed (44.889 workloads/s) Versus the queue-depth estimation: # benchmarks/gem_wsim -n 585341 \ -w benchmarks/wsim/vcs1.wsim \ -w benchmarks/wsim/vcs_balanced.wsim \ -r 100 -b 1 Using 585341 nop calibration for 1000us delay. 2 clients. 1: 2.239s elapsed (44.659 workloads/s). 2500 (837 + 1663) total VCS batches. Average queue depths 27.575, 19.285. 0: 4.012s elapsed (24.928 workloads/s). 0 (2500 + 0) total VCS batches. Average queue depths -nan, -nan. 4.012s elapsed (49.845 workloads/s) In both cases we run two workloads, one which only submits to VCS1 and one which can be load-balanced. The latter gets a ~33% boost with queue-depth estimation, and the non-balancing workload ~10%. --- benchmarks/Makefile.sources | 1 + benchmarks/gem_wsim.c | 1014 ++++++++++++++++++++++++++ benchmarks/wsim/README | 54 ++ benchmarks/wsim/media_17i7.wsim | 7 + benchmarks/wsim/media_load_balance_17i7.wsim | 7 + benchmarks/wsim/vcs1.wsim | 25 + benchmarks/wsim/vcs_balanced.wsim | 25 + 7 files changed, 1133 insertions(+) create mode 100644 benchmarks/gem_wsim.c create mode 100644 benchmarks/wsim/README create mode 100644 benchmarks/wsim/media_17i7.wsim create mode 100644 benchmarks/wsim/media_load_balance_17i7.wsim create mode 100644 benchmarks/wsim/vcs1.wsim create mode 100644 benchmarks/wsim/vcs_balanced.wsim diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources index 3af54ebe36f2..3a941150abb3 100644 --- a/benchmarks/Makefile.sources +++ b/benchmarks/Makefile.sources @@ -14,6 +14,7 @@ benchmarks_prog_list = \ gem_prw \ gem_set_domain \ gem_syslatency \ + gem_wsim \ kms_vblank \ prime_lookup \ vgem_mmap \ diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c new file mode 100644 index 000000000000..adf2d6decf12 --- /dev/null +++ b/benchmarks/gem_wsim.c @@ -0,0 +1,1014 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the 
"Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <inttypes.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <time.h> +#include <assert.h> +#include <limits.h> + + +#include "intel_chipset.h" +#include "drm.h" +#include "ioctl_wrappers.h" +#include "drmtest.h" +#include "intel_io.h" + +enum intel_engine_id { + RCS, + BCS, + VCS, + VCS1, + VCS2, + VECS, + NUM_ENGINES +}; + +struct duration { + unsigned int min, max; +}; + +enum w_type +{ + BATCH, + SYNC, + DELAY, + PERIOD +}; + +struct w_step +{ + /* Workload step metadata */ + enum w_type type; + unsigned int context; + unsigned int engine; + struct duration duration; + int dependency; + int wait; + + /* Implementation details */ + unsigned int idx; + + struct w_step_eb { + struct drm_i915_gem_execbuffer2 eb; + struct drm_i915_gem_exec_object2 obj[4]; + struct 
drm_i915_gem_relocation_entry reloc; + unsigned long bb_sz; + uint32_t bb_handle; + uint32_t *mapped_batch, *mapped_seqno; + unsigned int mapped_len; + } b[2]; /* One for each VCS when load balancing */ +}; + +struct workload +{ + unsigned int nr_steps; + struct w_step *steps; + + struct timespec repeat_start; + + unsigned int nr_ctxs; + uint32_t *ctx_id; + + unsigned long seqno[NUM_ENGINES]; + uint32_t status_page_handle; + uint32_t *status_page; + unsigned int vcs_rr; + + unsigned long qd_sum[NUM_ENGINES]; + unsigned long nr_bb[NUM_ENGINES]; +}; + +static const unsigned int eb_engine_map[NUM_ENGINES] = { + [RCS] = I915_EXEC_RENDER, + [BCS] = I915_EXEC_BLT, + [VCS] = I915_EXEC_BSD, + [VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1, + [VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2, + [VECS] = I915_EXEC_VEBOX +}; + +static const unsigned int nop_calibration_us = 1000; +static unsigned long nop_calibration; + +static bool quiet; +static int fd; + +#define SWAPVCS (1<<0) +#define SEQNO (1<<1) +#define BALANCE (1<<2) + +/* + * Workload descriptor: + * + * ctx.engine.duration.dependency.wait,... + * <uint>.<str>.<uint>.<int <= 0>.<0|1>,... 
+ * + * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS + * + * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1" + */ + +static const char *ring_str_map[NUM_ENGINES] = { + [RCS] = "RCS", + [BCS] = "BCS", + [VCS] = "VCS", + [VCS1] = "VCS1", + [VCS2] = "VCS2", + [VECS] = "VECS", +}; + +static struct workload *parse_workload(char *_desc) +{ + struct workload *wrk; + unsigned int nr_steps = 0; + char *desc = strdup(_desc); + char *_token, *token, *tctx = NULL, *tstart = desc; + char *field, *fctx = NULL, *fstart; + struct w_step step, *steps = NULL; + unsigned int valid; + int tmp; + + while ((_token = strtok_r(tstart, ",", &tctx)) != NULL) { + tstart = NULL; + token = strdup(_token); + fstart = token; + valid = 0; + memset(&step, 0, sizeof(step)); + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + if (!strcasecmp(field, "d")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp <= 0) { + if (!quiet) + fprintf(stderr, + "Invalid delay at step %u!\n", + nr_steps); + return NULL; + } + + step.type = DELAY; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "p")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp <= 0) { + if (!quiet) + fprintf(stderr, + "Invalid period at step %u!\n", + nr_steps); + return NULL; + } + + step.type = PERIOD; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "s")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp >= 0) { + if (!quiet) + fprintf(stderr, + "Invalid sync target at step %u!\n", + nr_steps); + return NULL; + } + + step.type = SYNC; + step.wait = tmp; + goto add_step; + } + } + + tmp = atoi(field); + if (tmp < 0) { + if (!quiet) + fprintf(stderr, + "Invalid ctx id at step %u!\n", + nr_steps); + return NULL; + } + step.context = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, 
".", &fctx)) != NULL) { + unsigned int i, old_valid = valid; + + fstart = NULL; + + for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) { + if (!strcasecmp(field, ring_str_map[i])) { + step.engine = i; + valid++; + break; + } + } + + if (old_valid == valid) { + if (!quiet) + fprintf(stderr, + "Invalid engine id at step %u!\n", + nr_steps); + return NULL; + } + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + char *sep = NULL; + long int tmpl; + + fstart = NULL; + + tmpl = strtol(field, &sep, 10); + if (tmpl == LONG_MIN || tmpl == LONG_MAX) { + if (!quiet) + fprintf(stderr, + "Invalid duration at step %u!\n", + nr_steps); + return NULL; + } + step.duration.min = tmpl; + + if (sep && *sep == '-') { + tmpl = strtol(sep + 1, NULL, 10); + if (tmpl == LONG_MIN || tmpl == LONG_MAX) { + if (!quiet) + fprintf(stderr, + "Invalid duration range at step %u!\n", + nr_steps); + return NULL; + } + step.duration.max = tmpl; + } else { + step.duration.max = step.duration.min; + } + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp > 0) { + if (!quiet) + fprintf(stderr, + "Invalid forward dependency at step %u!\n", + nr_steps); + return NULL; + } + step.dependency = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp != 0 && tmp != 1) { + if (!quiet) + fprintf(stderr, + "Invalid wait boolean at step %u!\n", + nr_steps); + return NULL; + } + step.wait = tmp; + + valid++; + } + + if (valid != 5) { + if (!quiet) + fprintf(stderr, "Invalid record at step %u!\n", + nr_steps); + return NULL; + } + + step.type = BATCH; + +add_step: + step.idx = nr_steps++; + steps = realloc(steps, sizeof(step) * nr_steps); + igt_assert(steps); + + memcpy(&steps[nr_steps - 1], &step, sizeof(step)); + + free(token); + } + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + + wrk->nr_steps = nr_steps; + wrk->steps = steps; + + free(desc); + + return wrk; 
+} + +static struct workload * +clone_workload(struct workload *_wrk) +{ + struct workload *wrk; + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + memset(wrk, 0, sizeof(*wrk)); + + wrk->nr_steps = _wrk->nr_steps; + wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step)); + igt_assert(wrk->steps); + + memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps); + + return wrk; +} + +#define rounddown(x, y) (x - (x%y)) +#ifndef PAGE_SIZE +#define PAGE_SIZE (4096) +#endif + +static unsigned int get_duration(struct duration *dur) +{ + if (dur->min == dur->max) + return dur->min; + else + return dur->min + rand() % (dur->max + 1 - dur->min); +} + +static unsigned long get_bb_sz(unsigned int duration) +{ + return ALIGN(duration * nop_calibration * sizeof(uint32_t) / + nop_calibration_us, sizeof(uint32_t)); +} + +static void +terminate_bb(struct w_step *w, struct w_step_eb *b, enum intel_engine_id engine, + unsigned int flags) +{ + const uint32_t bbe = 0xa << 23; + unsigned long bb_sz = get_bb_sz(w->duration.max); + unsigned long mmap_start, cmd_offset, mmap_len; + uint32_t *ptr, *cs; + + mmap_len = 1; + if (flags & SEQNO) + mmap_len += 4; + mmap_len *= sizeof(uint32_t); + cmd_offset = bb_sz - mmap_len; + mmap_start = rounddown(cmd_offset, PAGE_SIZE); + mmap_len += cmd_offset - mmap_start; + + gem_set_domain(fd, b->bb_handle, + I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU); + + ptr = gem_mmap__wc(fd, b->bb_handle, mmap_start, mmap_len, PROT_WRITE); + cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start); + + if (flags & SEQNO) { + b->reloc.offset = bb_sz - 4 * sizeof(uint32_t); + b->reloc.delta = (engine - VCS1) * sizeof(uint32_t); + + *cs++ = MI_STORE_DWORD_IMM; + *cs++ = 0; + *cs++ = 0; + b->mapped_seqno = cs; + *cs++ = 0; + } + + *cs = bbe; + + b->mapped_batch = ptr; + b->mapped_len = mmap_len; +} + +static void +alloc_step_batch(struct workload *wrk, struct w_step *w, struct w_step_eb *b, + enum intel_engine_id engine, unsigned int flags) +{ + unsigned 
int bb_i, j = 0; + + /* obj[0] is this step's own write target; dependent steps reference it. */ + b->obj[j].handle = gem_create(fd, 4096); + b->obj[j].flags = EXEC_OBJECT_WRITE; + j++; + + if (flags & SEQNO) { + b->obj[j].handle = wrk->status_page_handle; + j++; + } + + bb_i = j++; + b->bb_sz = get_bb_sz(w->duration.max); + b->bb_handle = b->obj[bb_i].handle = gem_create(fd, b->bb_sz); + terminate_bb(w, b, engine, flags); + + igt_assert(w->dependency <= 0); + if (w->dependency) { + int dep_idx = w->idx + w->dependency; + + igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps); + igt_assert(wrk->steps[dep_idx].type == BATCH); + + /* + * Move the batch up one slot and put the dependency's obj[0] in + * the slot it vacated. obj[] is fixed size, so fail loudly rather + * than write past its end. + */ + igt_assert(j < ARRAY_SIZE(b->obj)); + b->obj[j].handle = b->obj[bb_i].handle; + bb_i = j; + b->obj[j - 1].handle = wrk->steps[dep_idx].b[0].obj[0].handle; + j++; + + if (wrk->steps[dep_idx].b[1].obj[0].handle) { + /* SEQNO plus a load-balanced dependency needs a fifth slot. */ + igt_assert(j < ARRAY_SIZE(b->obj)); + b->obj[j].handle = b->obj[bb_i].handle; + bb_i = j; + b->obj[j - 1].handle = + wrk->steps[dep_idx].b[1].obj[0].handle; + j++; + } + } + + if (flags & SEQNO) { + b->reloc.presumed_offset = -1; + b->reloc.target_handle = 1; + b->obj[bb_i].relocs_ptr = to_user_pointer(&b->reloc); + b->obj[bb_i].relocation_count = 1; + } + + b->eb.buffers_ptr = to_user_pointer(b->obj); + b->eb.buffer_count = j; + b->eb.rsvd1 = wrk->ctx_id[w->context]; + + if (flags & SWAPVCS && engine == VCS1) + engine = VCS2; + else if (flags & SWAPVCS && engine == VCS2) + engine = VCS1; + b->eb.flags = eb_engine_map[engine]; + b->eb.flags |= I915_EXEC_HANDLE_LUT; + if (!(flags & SEQNO)) + b->eb.flags |= I915_EXEC_NO_RELOC; +#ifdef DEBUG + printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n", + w->idx, b->eb.buffer_count, b->obj[0].handle, + b->obj[1].handle, b->obj[2].handle, b->obj[3].handle, + b->bb_sz, b->eb.flags, b->bb_handle, bb_i, + w->context, wrk->ctx_id[w->context]); +#endif +} + +/* + * Create the GEM state a workload needs before it can run: the shared status + * page (when SEQNO is set), one context per context id used by the steps, and + * the batch buffers for every BATCH step. + */ +static void +prepare_workload(struct workload *wrk, unsigned int flags) +{ + int max_ctx = -1; + struct w_step *w; + int i; + + if (flags & SEQNO) { + /* One seqno dword per VCS engine (reloc delta 0 and 4). */ + const unsigned int status_sz = 2 * sizeof(uint32_t); + uint32_t handle = gem_create(fd, status_sz); + + gem_set_caching(fd, handle, 
I915_CACHING_CACHED); + wrk->status_page_handle = handle; + wrk->status_page = gem_mmap__cpu(fd, handle, 0, status_sz, + PROT_READ); + } + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + if ((int)w->context > max_ctx) { + int delta = w->context + 1 - wrk->nr_ctxs; + + wrk->nr_ctxs += delta; + wrk->ctx_id = realloc(wrk->ctx_id, + wrk->nr_ctxs * sizeof(uint32_t)); + memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0, + delta * sizeof(uint32_t)); + + max_ctx = w->context; + } + + if (!wrk->ctx_id[w->context]) { + struct drm_i915_gem_context_create arg = {}; + + drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg); + igt_assert(arg.ctx_id); + + wrk->ctx_id[w->context] = arg.ctx_id; + } + } + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + unsigned int _flags = flags; + enum intel_engine_id engine = w->engine; + + if (w->type != BATCH) + continue; + + if (engine != VCS && engine != VCS1 && engine != VCS2) + _flags &= ~SEQNO; + + if (engine == VCS) + _flags &= ~SWAPVCS; + + if (engine == VCS && flags & BALANCE) { + alloc_step_batch(wrk, w, &w->b[0], VCS1, _flags); + alloc_step_batch(wrk, w, &w->b[1], VCS2, _flags); + } else { + alloc_step_batch(wrk, w, &w->b[0], engine, _flags); + } + } +} + +static double elapsed(const struct timespec *start, const struct timespec *end) +{ + return (end->tv_sec - start->tv_sec) + + (end->tv_nsec - start->tv_nsec) / 1e9; +} + +static int elapsed_us(const struct timespec *start, const struct timespec *end) +{ + return elapsed(start, end) * 1e6; +} + +static enum intel_engine_id get_vcs_engine(unsigned int n) +{ + const enum intel_engine_id vcs_engines[2] = { VCS1, VCS2 }; + + igt_assert(n < ARRAY_SIZE(vcs_engines)); + + return vcs_engines[n]; +} + + +static enum intel_engine_id +rr_balance(struct workload *wrk, struct w_step *w) +{ + unsigned int engine; + + engine = get_vcs_engine(wrk->vcs_rr); + wrk->vcs_rr ^= 1; + + return engine; +} + +static enum intel_engine_id +qd_balance(struct workload *wrk, struct w_step 
*w) +{ + enum intel_engine_id engine = w->engine; + long qd[NUM_ENGINES]; + unsigned int n; + + igt_assert(engine == VCS); + + qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[0]; + wrk->qd_sum[VCS1] += qd[VCS1]; + + qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[1]; + wrk->qd_sum[VCS2] += qd[VCS2]; + + if (qd[VCS1] < qd[VCS2]) + n = 0; + else if (qd[VCS2] < qd[VCS1]) + n = 1; + else + n = wrk->vcs_rr; + + engine = get_vcs_engine(n); + wrk->vcs_rr = n ^ 1; + +#ifdef DEBUG + printf("qd_balance: 1:%ld 2:%ld rr:%u = %u\t(%lu - %u) (%lu - %u)\n", + qd[VCS1], qd[VCS2], wrk->vcs_rr, engine, + wrk->seqno[VCS1], wrk->status_page[0], + wrk->seqno[VCS2], wrk->status_page[1]); +#endif + return engine; +} + +static void +update_bb_seqno(struct w_step_eb *b, enum intel_engine_id engine, + uint32_t seqno) +{ + *b->mapped_seqno = seqno; + b->reloc.delta = (engine - VCS1) * sizeof(uint32_t); +} + +/* + * Submit every step of @wrk @repeat times, honouring delay, period and sync + * steps, then wait for the last batch and print the per-client statistics. + * @balance, when set, picks the engine for steps targeting plain VCS. + */ +static void +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat, + enum intel_engine_id (*balance)(struct workload *wrk, + struct w_step *w), + unsigned int flags) +{ + struct timespec t_start, t_end; + struct w_step *w; + double t; + int i, j; + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + srand(t_start.tv_nsec); + + for (j = 0; j < repeat; j++) { + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + enum intel_engine_id engine = w->engine; + struct w_step_eb *b = &w->b[0]; + int do_sleep = 0; + + if (i == 0) + clock_gettime(CLOCK_MONOTONIC, + &wrk->repeat_start); + + if (w->type == DELAY) { + do_sleep = w->wait; + } else if (w->type == PERIOD) { + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + do_sleep = w->wait - + elapsed_us(&wrk->repeat_start, &now); + if (do_sleep < 0) { + if (!quiet) + printf("%u: Dropped period @ %u/%u (%dus late)!\n", + id, j, i, do_sleep); + /* Already late: skip the sleep whether or not it was reported. */ + continue; + } + } else if (w->type == SYNC) { + unsigned int s_idx = i + w->wait; + + /* s_idx wraps to a huge value when the target precedes step 0. */ + igt_assert(s_idx < wrk->nr_steps); + igt_assert(wrk->steps[s_idx].type == 
BATCH); + gem_sync(fd, wrk->steps[s_idx].b[0].obj[0].handle); + if (wrk->steps[s_idx].b[1].obj[0].handle) + gem_sync(fd, wrk->steps[s_idx].b[1].obj[0].handle); + continue; + } + + /* + * Only BATCH steps submit work. Delay and period steps sleep for a + * positive interval and never fall through to execbuf. + */ + if (w->type != BATCH) { + if (do_sleep > 0) + usleep(do_sleep); + continue; + } + + wrk->nr_bb[engine]++; + + if (engine == VCS && balance) { + engine = balance(wrk, w); + wrk->nr_bb[engine]++; + b = &w->b[engine - VCS1]; + + if (flags & SEQNO) + update_bb_seqno(b, engine, + ++wrk->seqno[engine]); + } + + if (w->duration.min != w->duration.max) { + unsigned int d = get_duration(&w->duration); + unsigned long offset; + + offset = ALIGN(b->bb_sz - get_bb_sz(d), + 2 * sizeof(uint32_t)); + b->eb.batch_start_offset = offset; + } + + gem_execbuf(fd, &b->eb); + + if (w->wait) + gem_sync(fd, b->obj[0].handle); + } + } + + /* + * Wait on the last BATCH step; trailing delay, period or sync steps own + * no objects, and an empty workload has nothing to wait for. + */ + for (i = wrk->nr_steps; i-- > 0; ) { + if (wrk->steps[i].type != BATCH) + continue; + + gem_sync(fd, wrk->steps[i].b[0].obj[0].handle); + if (wrk->steps[i].b[1].obj[0].handle) + gem_sync(fd, wrk->steps[i].b[1].obj[0].handle); + break; + } + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet && !balance) + printf("%u: %.3fs elapsed (%.3f workloads/s)\n", id, t, repeat / t); + if (!quiet && balance == rr_balance) + printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n", + id, t, repeat / t, + wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]); + if (!quiet && balance == qd_balance) + printf("%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. 
Average queue depths %.3f, %.3f.\n", + id, t, repeat / t, + wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2], + (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS], + (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]); +} + +static void fini_workload(struct workload *wrk) +{ + free(wrk->steps); + free(wrk); +} + +static unsigned long calibrate_nop(unsigned int tolerance_pct) +{ + const uint32_t bbe = 0xa << 23; + unsigned int loops = 17; + unsigned int usecs = nop_calibration_us; + struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_execbuffer2 eb = + { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj}; + long size, last_size; + struct timespec t_0, t_end; + + clock_gettime(CLOCK_MONOTONIC, &t_0); + + size = 256 * 1024; + do { + struct timespec t_start; + + obj.handle = gem_create(fd, size); + gem_write(fd, obj.handle, size - sizeof(bbe), &bbe, + sizeof(bbe)); + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + + clock_gettime(CLOCK_MONOTONIC, &t_start); + for (int loop = 0; loop < loops; loop++) + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + gem_close(fd, obj.handle); + + last_size = size; + size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs; + size = ALIGN(size, sizeof(uint32_t)); + } while (elapsed(&t_0, &t_end) < 5 || + abs(size - last_size) > (size * tolerance_pct / 100)); + + return size / sizeof(uint32_t); +} + +static void print_help(void) +{ + puts( +"Usage: gem_wsim [OPTIONS]\n" +"\n" +"Runs a simulated workload on the GPU.\n" +"When ran without arguments performs a GPU calibration result of which needs\n" +"to be provided when running the simulation in subsequent invocations.\n" +"\n" +"Options:\n" +" -h This text.\n" +" -q Be quiet - do not output anything to stdout.\n" +" -n <n> Nop calibration value.\n" +" -t <n> Nop calibration tolerance percentage.\n" +" Use when there is a difficuly obtaining calibration\n" +" with the default settings.\n" +" -w <desc|path> Filename or a workload 
descriptor.\n" +" Can be given multiple times.\n" +" -r <n> How many times to emit the workload.\n" +" -c <n> Fork n clients emitting the workload simultaneously.\n" +" -x Swap VCS1 and VCS2 engines in every other client.\n" +" -b <n> Load balancing to use. (0: rr, 1: qd)\n" + ); +} + +/* + * If @filename names a regular file, return a newly allocated NUL terminated + * copy of its contents with newlines turned into ',' and trailing separators + * stripped. Otherwise return @filename itself, to be used as an inline + * descriptor. + */ +static char *load_workload_descriptor(char *filename) +{ + struct stat sbuf; + char *buf; + int infd, ret, i; + ssize_t len; + + ret = stat(filename, &sbuf); + if (ret || !S_ISREG(sbuf.st_mode)) + return filename; + + igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */ + buf = malloc(sbuf.st_size + 1); /* Room for the terminator. */ + igt_assert(buf); + + infd = open(filename, O_RDONLY); + igt_assert(infd >= 0); + len = read(infd, buf, sbuf.st_size); + igt_assert(len == sbuf.st_size); + close(infd); + + for (i = 0; i < len; i++) { + if (buf[i] == '\n') + buf[i] = ','; + } + + /* + * Always terminate, including when the file lacks a final newline, and + * never index before the buffer for empty or all-newline files. + */ + buf[len] = 0; + while (len > 0 && buf[len - 1] == ',') + buf[--len] = 0; + + return buf; +} + +static char ** +add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg) +{ + w_args = realloc(w_args, sizeof(char *) * nr_args); + igt_assert(w_args); + w_args[nr_args - 1] = w_arg; + + return w_args; +} + +int main(int argc, char **argv) +{ + unsigned int repeat = 1; + unsigned int clients = 1; + unsigned int flags = 0; + struct timespec t_start, t_end; + struct workload **w, **wrk = NULL; + unsigned int nr_w_args = 0; + char **w_args = NULL; + unsigned int tolerance_pct = 1; + enum intel_engine_id (*balance)(struct workload *, struct w_step *) = NULL; + double t; + int i, c; + + fd = drm_open_driver(DRIVER_INTEL); + + while ((c = getopt(argc, argv, "c:n:r:qxw:t:b:h")) != -1) { + switch (c) { + case 'w': + w_args = add_workload_arg(w_args, ++nr_w_args, optarg); + break; + case 'c': + clients = strtol(optarg, NULL, 0); + break; + case 't': + tolerance_pct = strtol(optarg, NULL, 0); + break; + case 'n': + nop_calibration = strtol(optarg, NULL, 0); + break; + case 'r': + repeat = strtol(optarg, NULL, 0); + break; + case 'q': + quiet = true; + break; + case 'x': + 
flags |= SWAPVCS; + break; + case 'b': + switch (strtol(optarg, NULL, 0)) { + case 0: + balance = rr_balance; + flags |= BALANCE; + break; + case 1: + igt_assert(intel_gen(intel_get_drm_devid(fd)) >= + 8); + balance = qd_balance; + flags |= SEQNO | BALANCE; + break; + default: + if (!quiet) + fprintf(stderr, + "Unknown balancing mode '%s'!\n", + optarg); + return 1; + } + break; + case 'h': + print_help(); + return 0; + default: + return 1; + } + } + + if (!nop_calibration) { + if (!quiet) + printf("Calibrating nop delay with %u%% tolerance...\n", + tolerance_pct); + nop_calibration = calibrate_nop(tolerance_pct); + if (!quiet) + printf("Nop calibration for %uus delay is %lu.\n", + nop_calibration_us, nop_calibration); + + return 0; + } + + if (!nr_w_args) { + if (!quiet) + fprintf(stderr, "No workload descriptor(s)!\n"); + return 1; + } + + if (nr_w_args > 1 && clients > 1) { + if (!quiet) + fprintf(stderr, + "Cloned clients cannot be combined with multiple workloads!\n"); + return 1; + } + + wrk = calloc(nr_w_args, sizeof(*wrk)); + igt_assert(wrk); + + for (i = 0; i < nr_w_args; i++) { + w_args[i] = load_workload_descriptor(w_args[i]); + if (!w_args[i]) { + if (!quiet) + fprintf(stderr, + "Failed to load workload descriptor %u!\n", + i); + return 1; + } + + wrk[i] = parse_workload(w_args[i]); + if (!wrk[i]) { + if (!quiet) + fprintf(stderr, + "Failed to parse workload %u!\n", i); + return 1; + } + } + + /* One client per workload; this must not depend on -q. */ + if (nr_w_args > 1) + clients = nr_w_args; + + if (!quiet) { + printf("Using %lu nop calibration for %uus delay.\n", + nop_calibration, nop_calibration_us); + printf("%u client%s.\n", clients, clients > 1 ? "s" : ""); + if (flags & SWAPVCS) + printf("Swapping VCS rings between clients.\n"); + } + + w = calloc(clients, sizeof(struct workload *)); + igt_assert(w); + + for (i = 0; i < clients; i++) { + unsigned int flags_ = flags; + + w[i] = clone_workload(wrk[nr_w_args > 1 ? 
i : 0]); + + if (flags & SWAPVCS && i & 1) + flags_ &= ~SWAPVCS; + + prepare_workload(w[i], flags_); + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + igt_fork(child, clients) + run_workload(child, w[child], repeat, balance, flags); + + igt_waitchildren(); + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet) + printf("%.3fs elapsed (%.3f workloads/s)\n", + t, clients * repeat / t); + + for (i = 0; i < clients; i++) + fini_workload(w[i]); + free(w); + for (i = 0; i < nr_w_args; i++) + fini_workload(wrk[i]); + free(w_args); + + return 0; +} diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README new file mode 100644 index 000000000000..b55e620c61c2 --- /dev/null +++ b/benchmarks/wsim/README @@ -0,0 +1,54 @@ +Workload descriptor format +========================== + +ctx.engine.duration_us.dependency.wait,... +<uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,... +d|p|s.<int>,... + +For duration a range can be given from which a random value will be picked +before every submit. Since this and seqno management require CPU access to +objects, care needs to be taken to ensure the submit queue is deep +enough that these operations do not affect the execution speed, unless that is +desired. + +Additional workload steps are also supported: + + 'd' - Adds a delay (in microseconds). + 'p' - Adds a delay relative to the start of the previous loop so that each loop + starts execution with a given period. + 's' - Synchronises the pipeline to a batch at the given negative offset + relative to the step. + +Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS + +Example (leading spaces must not be present in the actual file): +---------------------------------------------------------------- + + 1.VCS1.3000.0.1 + 1.RCS.500-1000.-1.0 + 1.RCS.3700.0.0 + 1.RCS.1000.-2.0 + 1.VCS2.2300.-2.0 + 1.RCS.4700.-1.0 + 1.VCS2.600.-1.1 + p.16000 + +The above workload described in human language works like this: + + 1. 
A batch is sent to the VCS1 engine which will be executing for 3ms on the + GPU and userspace will wait until it is finished before proceeding. + 2-4. Now three batches are sent to RCS with durations of 0.5-1ms (random + duration range), 3.7ms and 1ms respectively. The first batch has a data + dependency on the preceding VCS1 batch, and the last of the group depends + on the first from the group. + 5. Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms + RCS batch. + 6. This is followed by a 4.7ms RCS batch with a data dependency on the 2.3ms + VCS2 batch. + 7. Then a 0.6ms VCS2 batch is sent depending on the previous RCS one. In the + same step the tool is told to wait until the batch completes before + proceeding. + 8. Finally the tool is told to wait long enough to ensure the next iteration + starts 16ms after the previous one has started. + +When workload descriptors are provided on the command line, commas must be used +instead of new lines. diff --git a/benchmarks/wsim/media_17i7.wsim b/benchmarks/wsim/media_17i7.wsim new file mode 100644 index 000000000000..5f533d8e168b --- /dev/null +++ b/benchmarks/wsim/media_17i7.wsim @@ -0,0 +1,7 @@ +1.VCS1.3000.0.1 +1.RCS.1000.-1.0 +1.RCS.3700.0.0 +1.RCS.1000.-2.0 +1.VCS2.2300.-2.0 +1.RCS.4700.-1.0 +1.VCS2.600.-1.1 diff --git a/benchmarks/wsim/media_load_balance_17i7.wsim b/benchmarks/wsim/media_load_balance_17i7.wsim new file mode 100644 index 000000000000..25a692032eae --- /dev/null +++ b/benchmarks/wsim/media_load_balance_17i7.wsim @@ -0,0 +1,7 @@ +1.VCS.3000.0.1 +1.RCS.1000.-1.0 +1.RCS.3700.0.0 +1.RCS.1000.-2.0 +1.VCS.2300.-2.0 +1.RCS.4700.-1.0 +1.VCS.600.-1.1 diff --git a/benchmarks/wsim/vcs1.wsim b/benchmarks/wsim/vcs1.wsim new file mode 100644 index 000000000000..e1986aadd65c --- /dev/null +++ b/benchmarks/wsim/vcs1.wsim @@ -0,0 +1,25 @@ +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 
+0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 diff --git a/benchmarks/wsim/vcs_balanced.wsim b/benchmarks/wsim/vcs_balanced.wsim new file mode 100644 index 000000000000..9a4b3d785db1 --- /dev/null +++ b/benchmarks/wsim/vcs_balanced.wsim @@ -0,0 +1,25 @@ +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 -- 2.9.3 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx