From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Tool which emits batch buffers to engines with configurable sequences, durations, contexts, dependencies and userspace waits. Unfinished but shows promise so sending out for early feedback. v2: * Load workload descriptors from files. (also -w) * Help text. * Calibration control if needed. (-t) * NORELOC | LUT to eb flags. * Added sample workload to wsim/workload1. v3: * Multiple parallel different workloads (-w -w ...). * Multi-context workloads. * Variable (random) batch length. * Load balancing (round robin and queue depth estimation). * Workload delays and explicit sync steps. * Workload frequency (period) control. v4: * Fixed queue-depth estimation by creating separate batches per engine when qd load balancing is on. * Dropped separate -s cmd line option. It can turn itself on automatically when needed. * Keep a single status page and lie about the write hazard as suggested by Chris. * Use batch_start_offset for controlling the batch duration. (Chris) * Set status page object cache level. (Chris) * Moved workload description to a README. * Tidied example workloads. * Some other cleanups and refactorings. v5: * Master and background workloads (-W / -w). * Single batch per step is enough even when balancing. (Chris) * Use hars_petruska_f54_1_random IGT functions and seed to zero at start. (Chris) * Use WC cache domain when WC mapping. (Chris) * Keep seqnos 64-bytes apart in the status page. (Chris) * Add workload throttling and queue-depth throttling commands. (Chris) TODO list: * Fence support. * Better error handling. * Less 1980's workload parsing. * Proper workloads. * Threads? * ... ? 
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@xxxxxxxxx> --- benchmarks/Makefile.sources | 1 + benchmarks/gem_wsim.c | 1189 ++++++++++++++++++++++++++ benchmarks/wsim/README | 56 ++ benchmarks/wsim/media_17i7.wsim | 7 + benchmarks/wsim/media_load_balance_17i7.wsim | 7 + benchmarks/wsim/vcs1.wsim | 26 + benchmarks/wsim/vcs_balanced.wsim | 26 + lib/igt_core.c | 26 + lib/igt_core.h | 1 + 9 files changed, 1339 insertions(+) create mode 100644 benchmarks/gem_wsim.c create mode 100644 benchmarks/wsim/README create mode 100644 benchmarks/wsim/media_17i7.wsim create mode 100644 benchmarks/wsim/media_load_balance_17i7.wsim create mode 100644 benchmarks/wsim/vcs1.wsim create mode 100644 benchmarks/wsim/vcs_balanced.wsim diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources index 3af54ebe36f2..3a941150abb3 100644 --- a/benchmarks/Makefile.sources +++ b/benchmarks/Makefile.sources @@ -14,6 +14,7 @@ benchmarks_prog_list = \ gem_prw \ gem_set_domain \ gem_syslatency \ + gem_wsim \ kms_vblank \ prime_lookup \ vgem_mmap \ diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c new file mode 100644 index 000000000000..3d6670fdb815 --- /dev/null +++ b/benchmarks/gem_wsim.c @@ -0,0 +1,1189 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <unistd.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <inttypes.h> +#include <errno.h> +#include <poll.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <time.h> +#include <assert.h> +#include <limits.h> + + +#include "intel_chipset.h" +#include "drm.h" +#include "ioctl_wrappers.h" +#include "drmtest.h" +#include "intel_io.h" +#include "igt_rand.h" + +enum intel_engine_id { + RCS, + BCS, + VCS, + VCS1, + VCS2, + VECS, + NUM_ENGINES +}; + +struct duration { + unsigned int min, max; +}; + +enum w_type +{ + BATCH, + SYNC, + DELAY, + PERIOD, + THROTTLE, + QD_THROTTLE +}; + +struct w_step +{ + /* Workload step metadata */ + enum w_type type; + unsigned int context; + unsigned int engine; + struct duration duration; + int dependency; + int wait; + + /* Implementation details */ + unsigned int idx; + + struct drm_i915_gem_execbuffer2 eb; + struct drm_i915_gem_exec_object2 obj[4]; + struct drm_i915_gem_relocation_entry reloc; + unsigned long bb_sz; + uint32_t bb_handle; + uint32_t *mapped_batch, *mapped_seqno; + unsigned int mapped_len; +}; + +struct workload +{ + unsigned int nr_steps; + struct w_step *steps; + + struct timespec repeat_start; + + int pipe[2]; + + unsigned int nr_ctxs; + uint32_t *ctx_id; + + unsigned long seqno[NUM_ENGINES]; + uint32_t status_page_handle; + uint32_t *status_page; + unsigned int vcs_rr; + 
+ unsigned long qd_sum[NUM_ENGINES]; + unsigned long nr_bb[NUM_ENGINES]; +}; + +static const unsigned int eb_engine_map[NUM_ENGINES] = { + [RCS] = I915_EXEC_RENDER, + [BCS] = I915_EXEC_BLT, + [VCS] = I915_EXEC_BSD, + [VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1, + [VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2, + [VECS] = I915_EXEC_VEBOX +}; + +static const unsigned int nop_calibration_us = 1000; +static unsigned long nop_calibration; + +static bool quiet; +static int fd; + +#define SWAPVCS (1<<0) +#define SEQNO (1<<1) +#define BALANCE (1<<2) + +#define VCS_SEQNO_IDX(vcs_instance) ((vcs_instance) * 16) + +/* + * Workload descriptor: + * + * ctx.engine.duration.dependency.wait,... + * <uint>.<str>.<uint>.<int <= 0>.<0|1>,... + * + * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS + * + * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1" + */ + +static const char *ring_str_map[NUM_ENGINES] = { + [RCS] = "RCS", + [BCS] = "BCS", + [VCS] = "VCS", + [VCS1] = "VCS1", + [VCS2] = "VCS2", + [VECS] = "VECS", +}; + +static struct workload *parse_workload(char *_desc) +{ + struct workload *wrk; + unsigned int nr_steps = 0; + char *desc = strdup(_desc); + char *_token, *token, *tctx = NULL, *tstart = desc; + char *field, *fctx = NULL, *fstart; + struct w_step step, *steps = NULL; + unsigned int valid; + int tmp; + + while ((_token = strtok_r(tstart, ",", &tctx)) != NULL) { + tstart = NULL; + token = strdup(_token); + fstart = token; + valid = 0; + memset(&step, 0, sizeof(step)); + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + if (!strcasecmp(field, "d")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp <= 0) { + if (!quiet) + fprintf(stderr, + "Invalid delay at step %u!\n", + nr_steps); + return NULL; + } + + step.type = DELAY; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "p")) { + if ((field = strtok_r(fstart, ".", &fctx)) 
!= + NULL) { + tmp = atoi(field); + if (tmp <= 0) { + if (!quiet) + fprintf(stderr, + "Invalid period at step %u!\n", + nr_steps); + return NULL; + } + + step.type = PERIOD; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "s")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp >= 0) { + if (!quiet) + fprintf(stderr, + "Invalid sync target at step %u!\n", + nr_steps); + return NULL; + } + + step.type = SYNC; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "t")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp < 0) { + if (!quiet) + fprintf(stderr, + "Invalid throttle at step %u!\n", + nr_steps); + return NULL; + } + + step.type = THROTTLE; + step.wait = tmp; + goto add_step; + } + } else if (!strcasecmp(field, "q")) { + if ((field = strtok_r(fstart, ".", &fctx)) != + NULL) { + tmp = atoi(field); + if (tmp < 0) { + if (!quiet) + fprintf(stderr, + "Invalid qd throttle at step %u!\n", + nr_steps); + return NULL; + } + + step.type = QD_THROTTLE; + step.wait = tmp; + goto add_step; + } + } + + tmp = atoi(field); + if (tmp < 0) { + if (!quiet) + fprintf(stderr, + "Invalid ctx id at step %u!\n", + nr_steps); + return NULL; + } + step.context = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + unsigned int i, old_valid = valid; + + fstart = NULL; + + for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) { + if (!strcasecmp(field, ring_str_map[i])) { + step.engine = i; + valid++; + break; + } + } + + if (old_valid == valid) { + if (!quiet) + fprintf(stderr, + "Invalid engine id at step %u!\n", + nr_steps); + return NULL; + } + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + char *sep = NULL; + long int tmpl; + + fstart = NULL; + + tmpl = strtol(field, &sep, 10); + if (tmpl == LONG_MIN || tmpl == LONG_MAX) { + if (!quiet) + fprintf(stderr, + "Invalid duration at step %u!\n", + nr_steps); + return NULL; + } + 
step.duration.min = tmpl; + + if (sep && *sep == '-') { + tmpl = strtol(sep + 1, NULL, 10); + if (tmpl == LONG_MIN || tmpl == LONG_MAX) { + if (!quiet) + fprintf(stderr, + "Invalid duration range at step %u!\n", + nr_steps); + return NULL; + } + step.duration.max = tmpl; + } else { + step.duration.max = step.duration.min; + } + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp > 0) { + if (!quiet) + fprintf(stderr, + "Invalid forward dependency at step %u!\n", + nr_steps); + return NULL; + } + step.dependency = tmp; + + valid++; + } + + if ((field = strtok_r(fstart, ".", &fctx)) != NULL) { + fstart = NULL; + + tmp = atoi(field); + if (tmp != 0 && tmp != 1) { + if (!quiet) + fprintf(stderr, + "Invalid wait boolean at step %u!\n", + nr_steps); + return NULL; + } + step.wait = tmp; + + valid++; + } + + if (valid != 5) { + if (!quiet) + fprintf(stderr, "Invalid record at step %u!\n", + nr_steps); + return NULL; + } + + step.type = BATCH; + +add_step: + step.idx = nr_steps++; + steps = realloc(steps, sizeof(step) * nr_steps); + igt_assert(steps); + + memcpy(&steps[nr_steps - 1], &step, sizeof(step)); + + free(token); + } + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + + wrk->nr_steps = nr_steps; + wrk->steps = steps; + + free(desc); + + return wrk; +} + +static struct workload * +clone_workload(struct workload *_wrk) +{ + struct workload *wrk; + + wrk = malloc(sizeof(*wrk)); + igt_assert(wrk); + memset(wrk, 0, sizeof(*wrk)); + + wrk->nr_steps = _wrk->nr_steps; + wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step)); + igt_assert(wrk->steps); + + memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps); + + return wrk; +} + +#define rounddown(x, y) (x - (x%y)) +#ifndef PAGE_SIZE +#define PAGE_SIZE (4096) +#endif + +static unsigned int get_duration(struct duration *dur) +{ + if (dur->min == dur->max) + return dur->min; + else + return dur->min + 
hars_petruska_f54_1_random_unsafe() % + (dur->max + 1 - dur->min); +} + +static unsigned long get_bb_sz(unsigned int duration) +{ + return ALIGN(duration * nop_calibration * sizeof(uint32_t) / + nop_calibration_us, sizeof(uint32_t)); +} + +static void +terminate_bb(struct w_step *w, unsigned int flags) +{ + const uint32_t bbe = 0xa << 23; + unsigned long mmap_start, cmd_offset, mmap_len; + uint32_t *ptr, *cs; + + mmap_len = 1; + if (flags & SEQNO) + mmap_len += 4; + mmap_len *= sizeof(uint32_t); + cmd_offset = w->bb_sz - mmap_len; + mmap_start = rounddown(cmd_offset, PAGE_SIZE); + mmap_len += cmd_offset - mmap_start; + + gem_set_domain(fd, w->bb_handle, + I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC); + + ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE); + cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start); + + if (flags & SEQNO) { + w->reloc.offset = w->bb_sz - 4 * sizeof(uint32_t); + + *cs++ = MI_STORE_DWORD_IMM; + *cs++ = 0; + *cs++ = 0; + w->mapped_seqno = cs; + *cs++ = 0; + } + + *cs = bbe; + + w->mapped_batch = ptr; + w->mapped_len = mmap_len; +} + +static void +eb_update_flags(struct w_step *w, enum intel_engine_id engine, + unsigned int flags) +{ + w->eb.flags = eb_engine_map[engine]; + w->eb.flags |= I915_EXEC_HANDLE_LUT; + if (!(flags & SEQNO)) + w->eb.flags |= I915_EXEC_NO_RELOC; +} + +static void +alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags) +{ + enum intel_engine_id engine = w->engine; + unsigned int bb_i, j = 0; + + w->obj[j].handle = gem_create(fd, 4096); + w->obj[j].flags = EXEC_OBJECT_WRITE; + j++; + + if (flags & SEQNO) { + w->obj[j].handle = wrk->status_page_handle; + j++; + } + + bb_i = j++; + w->bb_sz = get_bb_sz(w->duration.max); + w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz); + terminate_bb(w, flags); + + igt_assert(w->dependency <= 0); + if (w->dependency) { + int dep_idx = w->idx + w->dependency; + + igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps); + 
igt_assert(wrk->steps[dep_idx].type == BATCH); + + w->obj[j].handle = w->obj[bb_i].handle; + bb_i = j; + w->obj[j - 1].handle = wrk->steps[dep_idx].obj[0].handle; + j++; + } + + if (flags & SEQNO) { + w->reloc.presumed_offset = -1; + w->reloc.target_handle = 1; + w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc); + w->obj[bb_i].relocation_count = 1; + } + + w->eb.buffers_ptr = to_user_pointer(w->obj); + w->eb.buffer_count = j; + w->eb.rsvd1 = wrk->ctx_id[w->context]; + + if (flags & SWAPVCS && engine == VCS1) + engine = VCS2; + else if (flags & SWAPVCS && engine == VCS2) + engine = VCS1; + eb_update_flags(w, engine, flags); +#ifdef DEBUG + printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n", + w->idx, w->eb.buffer_count, w->obj[0].handle, + w->obj[1].handle, w->obj[2].handle, w->obj[3].handle, + w->bb_sz, w->eb.flags, w->bb_handle, bb_i, + w->context, wrk->ctx_id[w->context]); +#endif +} + +static void +prepare_workload(struct workload *wrk, unsigned int flags) +{ + int max_ctx = -1; + struct w_step *w; + int i; + + if (flags & SEQNO) { + const unsigned int status_sz = sizeof(uint32_t); + uint32_t handle = gem_create(fd, status_sz); + + gem_set_caching(fd, handle, I915_CACHING_CACHED); + wrk->status_page_handle = handle; + wrk->status_page = gem_mmap__cpu(fd, handle, 0, status_sz, + PROT_READ); + } + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + if ((int)w->context > max_ctx) { + int delta = w->context + 1 - wrk->nr_ctxs; + + wrk->nr_ctxs += delta; + wrk->ctx_id = realloc(wrk->ctx_id, + wrk->nr_ctxs * sizeof(uint32_t)); + memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0, + delta * sizeof(uint32_t)); + + max_ctx = w->context; + } + + if (!wrk->ctx_id[w->context]) { + struct drm_i915_gem_context_create arg = {}; + + drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg); + igt_assert(arg.ctx_id); + + wrk->ctx_id[w->context] = arg.ctx_id; + } + } + + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) { + unsigned int _flags = 
flags; + enum intel_engine_id engine = w->engine; + + if (w->type != BATCH) + continue; + + if (engine != VCS && engine != VCS1 && engine != VCS2) + _flags &= ~SEQNO; + + if (engine == VCS) + _flags &= ~SWAPVCS; + + alloc_step_batch(wrk, w, _flags); + } +} + +static double elapsed(const struct timespec *start, const struct timespec *end) +{ + return (end->tv_sec - start->tv_sec) + + (end->tv_nsec - start->tv_nsec) / 1e9; +} + +static int elapsed_us(const struct timespec *start, const struct timespec *end) +{ + return elapsed(start, end) * 1e6; +} + +static enum intel_engine_id get_vcs_engine(unsigned int n) +{ + const enum intel_engine_id vcs_engines[2] = { VCS1, VCS2 }; + + igt_assert(n < ARRAY_SIZE(vcs_engines)); + + return vcs_engines[n]; +} + +struct workload_balancer { + unsigned int (*get_qd)(const struct workload_balancer *balancer, + struct workload *wrk, + enum intel_engine_id engine); + enum intel_engine_id (*balance)(const struct workload_balancer *balancer, + struct workload *wrk, struct w_step *w); +}; + +static enum intel_engine_id +rr_balance(const struct workload_balancer *balancer, + struct workload *wrk, struct w_step *w) +{ + unsigned int engine; + + engine = get_vcs_engine(wrk->vcs_rr); + wrk->vcs_rr ^= 1; + + return engine; +} + +static const struct workload_balancer rr_balancer = { + .balance = rr_balance, +}; + +static unsigned int +get_qd_depth(const struct workload_balancer *balancer, + struct workload *wrk, enum intel_engine_id engine) +{ + return wrk->seqno[engine] - + wrk->status_page[VCS_SEQNO_IDX(engine - VCS1)]; +} + +static enum intel_engine_id +qd_balance(const struct workload_balancer *balancer, + struct workload *wrk, struct w_step *w) +{ + enum intel_engine_id engine; + long qd[NUM_ENGINES]; + unsigned int n; + + igt_assert(w->engine == VCS); + + qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1); + wrk->qd_sum[VCS1] += qd[VCS1]; + + qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2); + wrk->qd_sum[VCS2] += qd[VCS2]; + + if (qd[VCS1] 
< qd[VCS2]) + n = 0; + else if (qd[VCS2] < qd[VCS1]) + n = 1; + else + n = wrk->vcs_rr; + + engine = get_vcs_engine(n); + wrk->vcs_rr = n ^ 1; + +#ifdef DEBUG + printf("qd_balance: 1:%ld 2:%ld rr:%u = %u\t(%lu - %u) (%lu - %u)\n", + qd[VCS1], qd[VCS2], wrk->vcs_rr, engine, + wrk->seqno[VCS1], wrk->status_page[VCS_SEQNO_IDX(0)], + wrk->seqno[VCS2], wrk->status_page[VCS_SEQNO_IDX(1)]); +#endif + return engine; +} + +static const struct workload_balancer qd_balancer = { + .get_qd = get_qd_depth, + .balance = qd_balance, +}; + +static void +update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno) +{ + igt_assert(engine == VCS1 || engine == VCS2); + + gem_set_domain(fd, w->bb_handle, + I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC); + + *w->mapped_seqno = seqno; + w->reloc.presumed_offset = -1; + w->reloc.delta = VCS_SEQNO_IDX(engine - VCS1) * sizeof(uint32_t); +} + +static void w_sync_to(struct workload *wrk, struct w_step *w, int target) +{ + if (target < 0) + target = wrk->nr_steps + target; + + igt_assert(target < wrk->nr_steps); + + while (wrk->steps[target].type != BATCH) { + if (--target < 0) + target = wrk->nr_steps + target; + } + + igt_assert(target < wrk->nr_steps); + igt_assert(wrk->steps[target].type == BATCH); + + gem_sync(fd, wrk->steps[target].obj[0].handle); +} + +static void +run_workload(unsigned int id, struct workload *wrk, + bool background, int pipe_fd, + const struct workload_balancer *balancer, + unsigned int repeat, + unsigned int flags) +{ + struct timespec t_start, t_end; + struct w_step *w; + bool run = true; + int throttle = -1; + int qd_throttle = -1; + double t; + int i, j; + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + hars_petruska_f54_1_random_seed(0); + + for (j = 0; run && (background || j < repeat); j++) { + for (i = 0, w = wrk->steps; run && (i < wrk->nr_steps); + i++, w++) { + enum intel_engine_id engine = w->engine; + int do_sleep = 0; + + if (i == 0) + clock_gettime(CLOCK_MONOTONIC, + &wrk->repeat_start); + + 
if (w->type == DELAY) { + do_sleep = w->wait; + } else if (w->type == PERIOD) { + struct timespec now; + + clock_gettime(CLOCK_MONOTONIC, &now); + do_sleep = w->wait - + elapsed_us(&wrk->repeat_start, &now); + if (do_sleep < 0) { + if (!quiet) { + printf("%u: Dropped period @ %u/%u (%dus late)!\n", + id, j, i, do_sleep); + continue; + } + } + } else if (w->type == SYNC) { + unsigned int s_idx = i + w->wait; + + igt_assert(i > 0 && i < wrk->nr_steps); + igt_assert(wrk->steps[s_idx].type == BATCH); + gem_sync(fd, wrk->steps[s_idx].obj[0].handle); + continue; + } else if (w->type == THROTTLE) { + throttle = w->wait; + continue; + } else if (w->type == QD_THROTTLE) { + qd_throttle = w->wait; + continue; + } + + if (do_sleep) { + usleep(do_sleep); + continue; + } + + wrk->nr_bb[engine]++; + + if (engine == VCS && balancer) { + engine = balancer->balance(balancer, wrk, w); + wrk->nr_bb[engine]++; + + eb_update_flags(w, engine, flags); + + if (flags & SEQNO) + update_bb_seqno(w, engine, + ++wrk->seqno[engine]); + } + + if (w->duration.min != w->duration.max) { + unsigned int d = get_duration(&w->duration); + unsigned long offset; + + offset = ALIGN(w->bb_sz - get_bb_sz(d), + 2 * sizeof(uint32_t)); + w->eb.batch_start_offset = offset; + } + + /* If workload want qd throttling when qd is not + * available approximate with normal throttling. 
*/ + if (qd_throttle > 0 && throttle < 0 && + !(balancer && balancer->get_qd)) + throttle = qd_throttle; + + if (throttle > 0) + w_sync_to(wrk, w, i - throttle); + + if (qd_throttle > 0 && balancer && balancer->get_qd) { + unsigned int target; + + for (target = wrk->nr_steps - 1; target > 0; + target--) { + if (balancer->get_qd(balancer, wrk, + engine) < + qd_throttle) + break; + w_sync_to(wrk, w, i - target); + } + } + + gem_execbuf(fd, &w->eb); + + if (pipe_fd >= 0) { + struct pollfd fds; + + fds.fd = pipe_fd; + fds.events = POLLHUP; + if (poll(&fds, 1, 0)) { + run = false; + break; + } + } + + if (w->wait) + gem_sync(fd, w->obj[0].handle); + } + } + + if (run) + gem_sync(fd, wrk->steps[wrk->nr_steps - 1].obj[0].handle); + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet && !balancer) + printf("%c%u: %.3fs elapsed (%.3f workloads/s)\n", + background ? ' ' : '*', id, t, repeat / t); + else if (!quiet && !balancer->get_qd) + printf("%c%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n", + background ? ' ' : '*', id, t, repeat / t, + wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]); + else if (!quiet && balancer) + printf("%c%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. Average queue depths %.3f, %.3f.\n", + background ? 
' ' : '*', id, t, repeat / t, + wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2], + (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS], + (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]); +} + +static void fini_workload(struct workload *wrk) +{ + free(wrk->steps); + free(wrk); +} + +static unsigned long calibrate_nop(unsigned int tolerance_pct) +{ + const uint32_t bbe = 0xa << 23; + unsigned int loops = 17; + unsigned int usecs = nop_calibration_us; + struct drm_i915_gem_exec_object2 obj = {}; + struct drm_i915_gem_execbuffer2 eb = + { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj}; + long size, last_size; + struct timespec t_0, t_end; + + clock_gettime(CLOCK_MONOTONIC, &t_0); + + size = 256 * 1024; + do { + struct timespec t_start; + + obj.handle = gem_create(fd, size); + gem_write(fd, obj.handle, size - sizeof(bbe), &bbe, + sizeof(bbe)); + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + + clock_gettime(CLOCK_MONOTONIC, &t_start); + for (int loop = 0; loop < loops; loop++) + gem_execbuf(fd, &eb); + gem_sync(fd, obj.handle); + clock_gettime(CLOCK_MONOTONIC, &t_end); + + gem_close(fd, obj.handle); + + last_size = size; + size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs; + size = ALIGN(size, sizeof(uint32_t)); + } while (elapsed(&t_0, &t_end) < 5 || + abs(size - last_size) > (size * tolerance_pct / 100)); + + return size / sizeof(uint32_t); +} + +static void print_help(void) +{ + puts( +"Usage: gem_wsim [OPTIONS]\n" +"\n" +"Runs a simulated workload on the GPU.\n" +"When ran without arguments performs a GPU calibration result of which needs\n" +"to be provided when running the simulation in subsequent invocations.\n" +"\n" +"Options:\n" +" -h This text.\n" +" -q Be quiet - do not output anything to stdout.\n" +" -n <n> Nop calibration value.\n" +" -t <n> Nop calibration tolerance percentage.\n" +" Use when there is a difficulty obtaining calibration\n" +" with the default settings.\n" +" -w <desc|path> Filename or a workload descriptor.\n" +" Can be 
given multiple times.\n" +" -W <desc|path> Filename or a master workload descriptor.\n" +" Only one master workload can be optinally specified\n" +" in which case all other workloads become background\n" +" ones and run as long as the master.\n" +" -r <n> How many times to emit the workload.\n" +" -c <n> Fork N clients emitting the workload simultaneously.\n" +" -x Swap VCS1 and VCS2 engines in every other client.\n" +" -b <n> Load balancing to use. (0: rr, 1: qd)\n" + ); +} + +static char *load_workload_descriptor(char *filename) +{ + struct stat sbuf; + char *buf; + int infd, ret, i; + ssize_t len; + + ret = stat(filename, &sbuf); + if (ret || !S_ISREG(sbuf.st_mode)) + return filename; + + igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */ + buf = malloc(sbuf.st_size); + igt_assert(buf); + + infd = open(filename, O_RDONLY); + igt_assert(infd >= 0); + len = read(infd, buf, sbuf.st_size); + igt_assert(len == sbuf.st_size); + close(infd); + + for (i = 0; i < len; i++) { + if (buf[i] == '\n') + buf[i] = ','; + } + + len--; + while (buf[len] == ',') + buf[len--] = 0; + + return buf; +} + +static char ** +add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg) +{ + w_args = realloc(w_args, sizeof(char *) * nr_args); + igt_assert(w_args); + w_args[nr_args - 1] = w_arg; + + return w_args; +} + +int main(int argc, char **argv) +{ + unsigned int repeat = 1; + unsigned int clients = 1; + unsigned int flags = 0; + struct timespec t_start, t_end; + struct workload **w, **wrk = NULL; + unsigned int nr_w_args = 0; + int master_workload = -1; + char **w_args = NULL; + unsigned int tolerance_pct = 1; + const struct workload_balancer *balancer = NULL; + double t; + int i, c; + + fd = drm_open_driver(DRIVER_INTEL); + + while ((c = getopt(argc, argv, "qc:n:r:xw:W:t:b:h")) != -1) { + switch (c) { + case 'W': + if (master_workload >= 0) { + if (!quiet) + fprintf(stderr, + "Only one master workload can be given!\n"); + return 1; + } + master_workload = nr_w_args; + /* 
Fall through */ + case 'w': + w_args = add_workload_arg(w_args, ++nr_w_args, optarg); + break; + case 'c': + clients = strtol(optarg, NULL, 0); + break; + case 't': + tolerance_pct = strtol(optarg, NULL, 0); + break; + case 'n': + nop_calibration = strtol(optarg, NULL, 0); + break; + case 'r': + repeat = strtol(optarg, NULL, 0); + break; + case 'q': + quiet = true; + break; + case 'x': + flags |= SWAPVCS; + break; + case 'b': + switch (strtol(optarg, NULL, 0)) { + case 0: + balancer = &rr_balancer; + flags |= BALANCE; + break; + case 1: + igt_assert(intel_gen(intel_get_drm_devid(fd)) >= + 8); + balancer = &qd_balancer; + flags |= SEQNO | BALANCE; + break; + default: + if (!quiet) + fprintf(stderr, + "Unknown balancing mode '%s'!\n", + optarg); + return 1; + } + break; + case 'h': + print_help(); + return 0; + default: + return 1; + } + } + + if (!nop_calibration) { + if (!quiet) + printf("Calibrating nop delay with %u%% tolerance...\n", + tolerance_pct); + nop_calibration = calibrate_nop(tolerance_pct); + if (!quiet) + printf("Nop calibration for %uus delay is %lu.\n", + nop_calibration_us, nop_calibration); + + return 0; + } + + if (!nr_w_args) { + if (!quiet) + fprintf(stderr, "No workload descriptor(s)!\n"); + return 1; + } + + if (nr_w_args > 1 && clients > 1) { + if (!quiet) + fprintf(stderr, + "Cloned clients cannot be combined with multiple workloads!\n"); + return 1; + } + + wrk = calloc(nr_w_args, sizeof(*wrk)); + igt_assert(wrk); + + for (i = 0; i < nr_w_args; i++) { + w_args[i] = load_workload_descriptor(w_args[i]); + if (!w_args[i]) { + if (!quiet) + fprintf(stderr, + "Failed to load workload descriptor %u!\n", + i); + return 1; + } + + wrk[i] = parse_workload(w_args[i]); + if (!wrk[i]) { + if (!quiet) + fprintf(stderr, + "Failed to parse workload %u!\n", i); + return 1; + } + } + + if (!quiet) { + printf("Using %lu nop calibration for %uus delay.\n", + nop_calibration, nop_calibration_us); + if (nr_w_args > 1) + clients = nr_w_args; + printf("%u 
client%s.\n", clients, clients > 1 ? "s" : ""); + if (flags & SWAPVCS) + printf("Swapping VCS rings between clients.\n"); + } + + if (master_workload >= 0 && clients == 1) + master_workload = -1; + + w = calloc(clients, sizeof(struct workload *)); + igt_assert(w); + + for (i = 0; i < clients; i++) { + unsigned int flags_ = flags; + + w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]); + + if (master_workload >= 0) { + int ret = pipe(w[i]->pipe); + + igt_assert(ret == 0); + } + + if (flags & SWAPVCS && i & 1) + flags_ &= ~SWAPVCS; + + prepare_workload(w[i], flags_); + } + + clock_gettime(CLOCK_MONOTONIC, &t_start); + + igt_fork(child, clients) { + int pipe_fd = -1; + bool background = false; + + if (master_workload >= 0) { + close(w[child]->pipe[0]); + if (child != master_workload) { + pipe_fd = w[child]->pipe[1]; + background = true; + } else { + close(w[child]->pipe[1]); + } + } + + run_workload(child, w[child], background, pipe_fd, balancer, + repeat, flags); + } + + if (master_workload >= 0) { + int status = -1; + pid_t pid; + + for (i = 0; i < clients; i++) + close(w[i]->pipe[1]); + + pid = wait(&status); + if (pid >= 0) + igt_child_done(pid); + + for (i = 0; i < clients; i++) + close(w[i]->pipe[0]); + } + + igt_waitchildren(); + + clock_gettime(CLOCK_MONOTONIC, &t_end); + + t = elapsed(&t_start, &t_end); + if (!quiet) + printf("%.3fs elapsed (%.3f workloads/s)\n", + t, clients * repeat / t); + + for (i = 0; i < clients; i++) + fini_workload(w[i]); + free(w); + for (i = 0; i < nr_w_args; i++) + fini_workload(wrk[i]); + free(w_args); + + return 0; +} diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README new file mode 100644 index 000000000000..7aa0694aa834 --- /dev/null +++ b/benchmarks/wsim/README @@ -0,0 +1,56 @@ +Workload descriptor format +========================== + +ctx.engine.duration_us.dependency.wait,... +<uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,... +d|p|s.<uiny>,... 
+ +For duration a range can be given from which a random value will be picked +before every submit. Since this and seqno management requires CPU access to +objects, care needs to be taken in order to ensure the submit queue is deep +enough that these operations do not affect the execution speed unless that is +desired. + +Additional workload steps are also supported: + + 'd' - Adds a delay (in microseconds). + 'p' - Adds a delay relative to the start of the previous loop so that each loop + starts execution with a given period. + 's' - Synchronises the pipeline to a batch relative to the step. + 't' - Throttle every n batches + 'q' - Throttle to n max queue depth + +Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS + +Example (leading spaces must not be present in the actual file): +---------------------------------------------------------------- + + 1.VCS1.3000.0.1 + 1.RCS.500-1000.-1.0 + 1.RCS.3700.0.0 + 1.RCS.1000.-2.0 + 1.VCS2.2300.-2.0 + 1.RCS.4700.-1.0 + 1.VCS2.600.-1.1 + p.16000 + +The above workload described in human language works like this: + + 1. A batch is sent to the VCS1 engine which will be executing for 3ms on the + GPU and userspace will wait until it is finished before proceeding. + 2-4. Now three batches are sent to RCS with durations of 0.5-1ms (random + duration range), 3.7ms and 1ms respectively. The first batch has a data + dependency on the preceding VCS1 batch, and the last of the group depends + on the first from the group. + 5. Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms + RCS batch. + 6. This is followed by a 4.7ms RCS batch with a data dependency on the 2.3ms + VCS2 batch. + 7. Then a 0.6ms VCS2 batch is sent depending on the previous RCS one. In the + same step the tool is told to wait until the batch completes before + proceeding. + 8. Finally the tool is told to wait long enough to ensure the next iteration + starts 16ms after the previous one has started. 
+ +When workload descriptors are provided on the command line, commas must be used +instead of new lines. diff --git a/benchmarks/wsim/media_17i7.wsim b/benchmarks/wsim/media_17i7.wsim new file mode 100644 index 000000000000..5f533d8e168b --- /dev/null +++ b/benchmarks/wsim/media_17i7.wsim @@ -0,0 +1,7 @@ +1.VCS1.3000.0.1 +1.RCS.1000.-1.0 +1.RCS.3700.0.0 +1.RCS.1000.-2.0 +1.VCS2.2300.-2.0 +1.RCS.4700.-1.0 +1.VCS2.600.-1.1 diff --git a/benchmarks/wsim/media_load_balance_17i7.wsim b/benchmarks/wsim/media_load_balance_17i7.wsim new file mode 100644 index 000000000000..25a692032eae --- /dev/null +++ b/benchmarks/wsim/media_load_balance_17i7.wsim @@ -0,0 +1,7 @@ +1.VCS.3000.0.1 +1.RCS.1000.-1.0 +1.RCS.3700.0.0 +1.RCS.1000.-2.0 +1.VCS.2300.-2.0 +1.RCS.4700.-1.0 +1.VCS.600.-1.1 diff --git a/benchmarks/wsim/vcs1.wsim b/benchmarks/wsim/vcs1.wsim new file mode 100644 index 000000000000..9d3e682b5ce8 --- /dev/null +++ b/benchmarks/wsim/vcs1.wsim @@ -0,0 +1,26 @@ +t.5 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 +0.VCS1.500-2000.0.0 diff --git a/benchmarks/wsim/vcs_balanced.wsim b/benchmarks/wsim/vcs_balanced.wsim new file mode 100644 index 000000000000..e8958b8f7f43 --- /dev/null +++ b/benchmarks/wsim/vcs_balanced.wsim @@ -0,0 +1,26 @@ +q.5 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 
+0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 +0.VCS.500-2000.0.0 diff --git a/lib/igt_core.c b/lib/igt_core.c index 403b9423fa9f..9c3b37fe3d63 100644 --- a/lib/igt_core.c +++ b/lib/igt_core.c @@ -1558,6 +1558,32 @@ bool __igt_fork(void) } /** + * igt_child_done: + * + * Lets the IGT core know that one of the children has exited. + */ +void igt_child_done(pid_t pid) +{ + int i = 0; + int found = -1; + + igt_assert(num_test_children > 1); + + for (i = 0; i < num_test_children; i++) { + if (pid == test_children[i]) { + found = i; + break; + } + } + + igt_assert(found >= 0); + + num_test_children--; + for (i = found; i < num_test_children; i++) + test_children[i] = test_children[i + 1]; +} + +/** * igt_waitchildren: * * Wait for all children forked with igt_fork. diff --git a/lib/igt_core.h b/lib/igt_core.h index 51b98d82ef7f..4a125af1d6a5 100644 --- a/lib/igt_core.h +++ b/lib/igt_core.h @@ -688,6 +688,7 @@ bool __igt_fork(void); #define igt_fork(child, num_children) \ for (int child = 0; child < (num_children); child++) \ for (; __igt_fork(); exit(0)) +void igt_child_done(pid_t pid); void igt_waitchildren(void); void igt_waitchildren_timeout(int seconds, const char *reason); -- 2.9.3 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx