An important property for multi-client systems is that each client gets a 'fair' allotment of system time. (Where fairness is at the whim of the context properties, such as priorities.) This test forks N independent clients (albeit they happen to share a single vm), and does an equal amount of work in each client and asserts that they take an equal amount of time. Though we have never claimed to have a completely fair scheduler, that is what is expected. v2: igt_assert_f and more commentary; exclude vip from client stats, include range of frame intervals from each individual client v3: Write down what the test actually does! Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Cc: Ramalingam C <ramalingam.c@xxxxxxxxx> --- tests/i915/gem_exec_schedule.c | 960 +++++++++++++++++++++++++++++++++ 1 file changed, 960 insertions(+) diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c index f23d63ac3..3c950b06f 100644 --- a/tests/i915/gem_exec_schedule.c +++ b/tests/i915/gem_exec_schedule.c @@ -29,6 +29,7 @@ #include <sys/poll.h> #include <sys/ioctl.h> #include <sys/mman.h> +#include <sys/resource.h> #include <sys/syscall.h> #include <sched.h> #include <signal.h> @@ -2516,6 +2517,932 @@ static void measure_semaphore_power(int i915) rapl_close(&pkg); } +static int read_timestamp_frequency(int i915) +{ + int value = 0; + drm_i915_getparam_t gp = { + .value = &value, + .param = I915_PARAM_CS_TIMESTAMP_FREQUENCY, + }; + ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp); + return value; +} + +static uint64_t div64_u64_round_up(uint64_t x, uint64_t y) +{ + return (x + y - 1) / y; +} + +static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns) +{ + int f = read_timestamp_frequency(i915); + if (intel_gen(intel_get_drm_devid(i915)) == 11) + f = 12500000; /* icl!!! are you feeling alright?
CTX vs CS */ + return div64_u64_round_up(ns * f, NSEC_PER_SEC); +} + +static uint64_t ticks_to_ns(int i915, uint64_t ticks) +{ + return div64_u64_round_up(ticks * NSEC_PER_SEC, + read_timestamp_frequency(i915)); +} + +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags)) + +#define MI_MATH(x) MI_INSTR(0x1a, (x) - 1) +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2)) +/* Opcodes for MI_MATH_INSTR */ +#define MI_MATH_NOOP MI_MATH_INSTR(0x000, 0x0, 0x0) +#define MI_MATH_LOAD(op1, op2) MI_MATH_INSTR(0x080, op1, op2) +#define MI_MATH_LOADINV(op1, op2) MI_MATH_INSTR(0x480, op1, op2) +#define MI_MATH_LOAD0(op1) MI_MATH_INSTR(0x081, op1) +#define MI_MATH_LOAD1(op1) MI_MATH_INSTR(0x481, op1) +#define MI_MATH_ADD MI_MATH_INSTR(0x100, 0x0, 0x0) +#define MI_MATH_SUB MI_MATH_INSTR(0x101, 0x0, 0x0) +#define MI_MATH_AND MI_MATH_INSTR(0x102, 0x0, 0x0) +#define MI_MATH_OR MI_MATH_INSTR(0x103, 0x0, 0x0) +#define MI_MATH_XOR MI_MATH_INSTR(0x104, 0x0, 0x0) +#define MI_MATH_STORE(op1, op2) MI_MATH_INSTR(0x180, op1, op2) +#define MI_MATH_STOREINV(op1, op2) MI_MATH_INSTR(0x580, op1, op2) +/* Registers used as operands in MI_MATH_INSTR */ +#define MI_MATH_REG(x) (x) +#define MI_MATH_REG_SRCA 0x20 +#define MI_MATH_REG_SRCB 0x21 +#define MI_MATH_REG_ACCU 0x31 +#define MI_MATH_REG_ZF 0x32 +#define MI_MATH_REG_CF 0x33 + +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) + +static void delay(int i915, + const struct intel_execution_engine2 *e, + uint32_t handle, + uint64_t addr, + uint64_t ns) +{ + const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8; + const uint32_t base = gem_engine_mmio_base(i915, e->name); +#define CS_GPR(x) (base + 0x600 + 8 * (x)) +#define RUNTIME (base + 0x3a8) + enum { START_TS, NOW_TS }; + uint32_t *map, *cs, *jmp; + + igt_require(base); + + /* Loop until CTX_TIMESTAMP - initial > @ns */ + + cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE); + + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(START_TS) + 
4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG; + *cs++ = RUNTIME; + *cs++ = CS_GPR(START_TS); + + while (offset_in_page(cs) & 63) + *cs++ = 0; + jmp = cs; + + *cs++ = 0x5 << 23; /* MI_ARB_CHECK */ + + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(NOW_TS) + 4; + *cs++ = 0; + *cs++ = MI_LOAD_REGISTER_REG; + *cs++ = RUNTIME; + *cs++ = CS_GPR(NOW_TS); + + /* delta = now - start; inverted to match COND_BBE */ + *cs++ = MI_MATH(4); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS)); + *cs++ = MI_MATH_SUB; + *cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU); + + /* Save delta for reading by COND_BBE */ + *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */ + *cs++ = CS_GPR(NOW_TS); + *cs++ = addr + 4000; + *cs++ = addr >> 32; + + /* Delay between SRM and COND_BBE to post the writes */ + for (int n = 0; n < 8; n++) { + *cs++ = MI_STORE_DWORD_IMM; + if (use_64b) { + *cs++ = addr + 4064; + *cs++ = addr >> 32; + } else { + *cs++ = 0; + *cs++ = addr + 4064; + } + *cs++ = 0; + } + + /* Break if delta [time elapsed] > ns */ + *cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b); + *cs++ = ~ns_to_ctx_ticks(i915, ns); + *cs++ = addr + 4000; + *cs++ = addr >> 32; + + /* Otherwise back to recalculating delta */ + *cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b; + *cs++ = addr + offset_in_page(jmp); + *cs++ = addr >> 32; + + munmap(map, 4096); +} + +static struct drm_i915_gem_exec_object2 +delay_create(int i915, uint32_t ctx, + const struct intel_execution_engine2 *e, + uint64_t target_ns) +{ + struct drm_i915_gem_exec_object2 obj = { + .handle = batch_create(i915), + .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS, + }; + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = to_user_pointer(&obj), + .buffer_count = 1, + .rsvd1 = ctx, + .flags = e->flags, + }; + + obj.offset = obj.handle << 12; + gem_execbuf(i915, &execbuf); + gem_sync(i915, obj.handle); + + delay(i915, e, obj.handle, 
obj.offset, target_ns); + + obj.flags |= EXEC_OBJECT_PINNED; + return obj; +} + +static void tslog(int i915, + const struct intel_execution_engine2 *e, + uint32_t handle, + uint64_t addr) +{ + const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8; + const uint32_t base = gem_engine_mmio_base(i915, e->name); +#define CS_GPR(x) (base + 0x600 + 8 * (x)) +#define CS_TIMESTAMP (base + 0x358) + enum { INC, MASK, ADDR }; + uint32_t *timestamp_lo, *addr_lo; + uint32_t *map, *cs; + + igt_require(base); + + map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE); + cs = map + 512; + + /* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */ + *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */ + *cs++ = CS_TIMESTAMP; + timestamp_lo = cs; + *cs++ = addr; + *cs++ = addr >> 32; + + /* Load the address + inc & mask variables */ + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(ADDR); + addr_lo = cs; + *cs++ = addr; + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(ADDR) + 4; + *cs++ = addr >> 32; + + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(INC); + *cs++ = 4; + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(INC) + 4; + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(MASK); + *cs++ = 0xfffff7ff; + *cs++ = MI_LOAD_REGISTER_IMM; + *cs++ = CS_GPR(MASK) + 4; + *cs++ = 0xffffffff; + + /* Increment the [ring] address for saving CS_TIMESTAMP */ + *cs++ = MI_MATH(8); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR)); + *cs++ = MI_MATH_ADD; + *cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR)); + *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK)); + *cs++ = MI_MATH_AND; + *cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU); + + /* Rewrite the batch buffer for the next execution */ + *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */ + *cs++ = CS_GPR(ADDR); + *cs++ = addr + 
offset_in_page(timestamp_lo); + *cs++ = addr >> 32; + *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */ + *cs++ = CS_GPR(ADDR); + *cs++ = addr + offset_in_page(addr_lo); + *cs++ = addr >> 32; + + *cs++ = MI_BATCH_BUFFER_END; + + munmap(map, 4096); +} + +static struct drm_i915_gem_exec_object2 +tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e) +{ + struct drm_i915_gem_exec_object2 obj = { + .handle = batch_create(i915), + .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS, + }; + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = to_user_pointer(&obj), + .buffer_count = 1, + .rsvd1 = ctx, + .flags = e->flags, + }; + + obj.offset = obj.handle << 12; + gem_execbuf(i915, &execbuf); + gem_sync(i915, obj.handle); + + tslog(i915, e, obj.handle, obj.offset); + + obj.flags |= EXEC_OBJECT_PINNED; + return obj; +} + +static int cmp_u32(const void *A, const void *B) +{ + const uint32_t *a = A, *b = B; + + if (*a < *b) + return -1; + else if (*a > *b) + return 1; + else + return 0; +} + +static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e) +{ + const int gen = intel_gen(intel_get_drm_devid(i915)); + + if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO) + return false; /* looks fubar */ + + return true; +} + +static struct intel_execution_engine2 +pick_random_engine(int i915, const struct intel_execution_engine2 *not) +{ + const struct intel_execution_engine2 *e; + unsigned int count = 0; + + __for_each_physical_engine(i915, e) { + if (e->flags == not->flags) + continue; + if (!gem_class_has_mutable_submission(i915, e->class)) + continue; + count++; + } + if (!count) + return *not; + + count = rand() % count; + __for_each_physical_engine(i915, e) { + if (e->flags == not->flags) + continue; + if (!gem_class_has_mutable_submission(i915, e->class)) + continue; + if (!count--) + break; + } + + return *e; +} + +static void fair_child(int i915, uint32_t ctx, + const struct intel_execution_engine2 *e, + uint64_t frame_ns, + int 
timeline, + uint32_t common, + unsigned int flags, + unsigned long *ctl, + unsigned long *median, + unsigned long *iqr) +#define F_SYNC (1 << 0) +#define F_PACE (1 << 1) +#define F_FLOW (1 << 2) +#define F_HALF (1 << 3) +#define F_SOLO (1 << 4) +#define F_SPARE (1 << 5) +#define F_NEXT (1 << 6) +#define F_VIP (1 << 7) +#define F_RRUL (1 << 8) +#define F_SHARE (1 << 9) +#define F_PING (1 << 10) +#define F_THROTTLE (1 << 11) +#define F_ISOLATE (1 << 12) +{ + const int batches_per_frame = flags & F_SOLO ? 1 : 3; + struct drm_i915_gem_exec_object2 obj[4] = { + {}, + { + .handle = common ?: gem_create(i915, 4096), + }, + delay_create(i915, ctx, e, frame_ns / batches_per_frame), + delay_create(i915, ctx, e, frame_ns / batches_per_frame), + }; + struct intel_execution_engine2 ping = *e; + int p_fence = -1, n_fence = -1; + unsigned long count = 0; + int n; + + srandom(getpid()); + if (flags & F_PING) + ping = pick_random_engine(i915, e); + obj[0] = tslog_create(i915, ctx, &ping); + + while (!READ_ONCE(*ctl)) { + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = to_user_pointer(obj), + .buffer_count = 3, + .rsvd1 = ctx, + .rsvd2 = -1, + .flags = e->flags, + }; + + if (flags & F_FLOW) { + unsigned int seq; + + seq = count; + if (flags & F_NEXT) + seq++; + + execbuf.rsvd2 = + sw_sync_timeline_create_fence(timeline, seq); + execbuf.flags |= I915_EXEC_FENCE_IN; + } + + execbuf.flags |= I915_EXEC_FENCE_OUT; + gem_execbuf_wr(i915, &execbuf); + n_fence = execbuf.rsvd2 >> 32; + execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN); + for (n = 1; n < batches_per_frame; n++) + gem_execbuf(i915, &execbuf); + close(execbuf.rsvd2); + + execbuf.buffer_count = 1; + execbuf.batch_start_offset = 2048; + execbuf.flags = ping.flags | I915_EXEC_FENCE_IN; + execbuf.rsvd2 = n_fence; + gem_execbuf(i915, &execbuf); + + if (flags & F_PACE && p_fence != -1) { + struct pollfd pfd = { + .fd = p_fence, + .events = POLLIN, + }; + poll(&pfd, 1, -1); + } + close(p_fence); + + if (flags & 
F_SYNC) { + struct pollfd pfd = { + .fd = n_fence, + .events = POLLIN, + }; + poll(&pfd, 1, -1); + } + + if (flags & F_THROTTLE) + igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0); + + igt_swap(obj[2], obj[3]); + igt_swap(p_fence, n_fence); + count++; + } + close(p_fence); + + gem_close(i915, obj[3].handle); + gem_close(i915, obj[2].handle); + if (obj[1].handle != common) + gem_close(i915, obj[1].handle); + + gem_sync(i915, obj[0].handle); + if (median) { + uint32_t *map; + + /* + * We recorded the CS_TIMESTAMP of each frame, and if + * the GPU is being shared completely fairly, we expect + * each frame to be at the same interval from the last. + * + * Compute the interval between frames and report back + * both the median interval and the range for this client. + */ + + map = gem_mmap__device_coherent(i915, obj[0].handle, + 0, 4096, PROT_WRITE); + for (n = 1; n < min(count, 512); n++) { + igt_assert(map[n]); + map[n - 1] = map[n] - map[n - 1]; + } + qsort(map, --n, sizeof(*map), cmp_u32); + *iqr = ticks_to_ns(i915, map[(3 * n + 3) / 4] - map[n / 4]); + *median = ticks_to_ns(i915, map[n / 2]); + munmap(map, 4096); + } + gem_close(i915, obj[0].handle); +} + +static int cmp_ul(const void *A, const void *B) +{ + const unsigned long *a = A, *b = B; + + if (*a < *b) + return -1; + else if (*a > *b) + return 1; + else + return 0; +} + +static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b) +{ + uint64_t cpu_time = 0; + + cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC; + cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000; + + cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC; + cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000; + + return cpu_time; +} + +static void timeline_advance(int timeline, int delay_ns) +{ + struct timespec tv = { .tv_nsec = delay_ns }; + nanosleep(&tv, NULL); + sw_sync_timeline_inc(timeline, 1); +} + +static void fairness(int i915, + const struct 
intel_execution_engine2 *e, + int timeout, unsigned int flags) +{ + const int frame_ns = 16666 * 1000; + const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns; + unsigned long *result, *iqr; + uint32_t common = 0; + + igt_require(has_ctx_timestamp(i915, e)); + igt_require(gem_class_has_mutable_submission(i915, e->class)); + + if (flags & F_SHARE) + common = gem_create(i915, 4095); + + result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + igt_assert(result != MAP_FAILED); + iqr = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + igt_assert(iqr != MAP_FAILED); + + /* + * The combined workload always runs at a 60fps target (unless F_HALF!). + * This gives a frame interval of 16ms that is evenly split across + * all the clients, so simulating a system with a bunch of clients that + * are perfectly balanced and can sustain 60fps. Our job is to ensure + * that each client does run at a smooth 60fps. + * + * Each client runs a fixed length delay loop (as a single request, + * or split into 3) and then records the CS_TIMESTAMP after completing + * its delay. Given a fair allotment of GPU time to each client, + * that timestamp will [ideally] be at precise 16ms intervals. + * In practice, time is wasted on context switches, so as the number + * of clients increases, the proportion of time spent on context + * switches grows. As we get to 64 render clients, we will be spending + * as much time in context switches as executing the client workloads. + * + * Each client frame may be paced by some throttling technique found + * in the wild. i.e. each client may wait until a simulated vblank + * to indicate the start of a new frame, or it may wait until the + * completion of a previous frame. This causes submission from each + * client and across the system to be chunky and uneven.
+ * + * We look at the variation of frame intervals within each client, and + * the variation of the medians across the clients to see if the + * distribution (budget) of GPU time was fair enough. + * + * Alternative (and important) metrics will be more latency centric; + * looking at how well we can sustain meeting deadline given competition + * by clients for the GPU. + */ + + for (int n = 2; n <= 256; n <<= 1) { /* 32 == 500us per client */ + int timeline = sw_sync_timeline_create(); + int nfences = timeout * NSEC_PER_SEC / fence_ns + 1; + int nchild = n - 1; /* odd for easy medians */ + const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE)); + const int lo = nchild / 4; + const int hi = (3 * nchild + 3) / 4 - 1; + struct rusage old_usage, usage; + uint64_t cpu_time, d_time; + struct timespec tv; + struct igt_mean m; + + memset(result, 0, (nchild + 1) * sizeof(result[0])); + + if (flags & F_PING) { /* fill the others with light bg load */ + struct intel_execution_engine2 *ping; + + __for_each_physical_engine(i915, ping) { + if (ping->flags == e->flags) + continue; + + igt_fork(child, 1) { + uint32_t ctx = gem_context_clone_with_engines(i915, 0); + + fair_child(i915, ctx, ping, + child_ns / 8, + -1, common, + F_SOLO | F_PACE | F_SHARE, + &result[nchild], + NULL, NULL); + + gem_context_destroy(i915, ctx); + } + } + } + + getrusage(RUSAGE_CHILDREN, &old_usage); + igt_nsec_elapsed(memset(&tv, 0, sizeof(tv))); + igt_fork(child, nchild) { + uint32_t ctx; + + if (flags & F_ISOLATE) { + int clone, dmabuf = -1; + + if (common) + dmabuf = prime_handle_to_fd(i915, common); + + clone = gem_reopen_driver(i915); + gem_context_copy_engines(i915, 0, clone, 0); + i915 = clone; + + if (dmabuf != -1) + common = prime_fd_to_handle(i915, dmabuf); + } + + ctx = gem_context_clone_with_engines(i915, 0); + + if (flags & F_VIP && child == 0) { + gem_context_set_priority(i915, ctx, MAX_PRIO); + flags |= F_FLOW; + } + if (flags & F_RRUL && child == 0) + flags |= F_SOLO | F_FLOW | 
F_SYNC; + + fair_child(i915, ctx, e, child_ns, + timeline, common, flags, + &result[nchild], + &result[child], &iqr[child]); + + gem_context_destroy(i915, ctx); + } + + while (nfences--) + timeline_advance(timeline, fence_ns); + + result[nchild] = 1; + for (int child = 0; child < nchild; child++) { + while (!READ_ONCE(result[child])) + timeline_advance(timeline, fence_ns); + } + + igt_waitchildren(); + close(timeline); + + /* + * Are we running out of CPU time, and failing to submit frames? + * + * We try to rule out any undue impact on the GPU scheduling + * from the CPU scheduler by looking for core saturation. If + * we may be in a situation where the clients + kernel are + * taking a whole core (think lockdep), then it is increasingly + * likely that our measurements include delays from the CPU + * scheduler. Err on the side of caution. + */ + d_time = igt_nsec_elapsed(&tv); + getrusage(RUSAGE_CHILDREN, &usage); + cpu_time = d_cpu_time(&usage, &old_usage); + igt_debug("CPU usage: %.0f%%\n", 100. * cpu_time / d_time); + if (4 * cpu_time > 3 * d_time) { + if (nchild > 7) /* good enough to judge pass/fail */ + break; + + igt_skip_on_f(4 * cpu_time > 3 * d_time, + "%.0f%% CPU usage, presuming capacity exceeded\n", + 100. * cpu_time / d_time); + } + + /* With no contention, we should match our target frametime */ + if (nchild == 1) { + igt_assert(4 * result[0] > 3 * fence_ns && + 3 * result[0] < 4 * fence_ns); + continue; + } + + /* + * The VIP should always be able to hit the target frame rate; + * regardless of budget contention from lesser clients.
+ */ + if (flags & (F_VIP | F_RRUL)) { + igt_info("VIP interval %.2fms, range %.2fms\n", + 1e-6 * result[0], 1e-6 * iqr[0]); + igt_assert_f(4 * result[0] > 3 * fence_ns && + 3 * result[0] < 4 * fence_ns, + "VIP expects to run exactly when it wants, expects an interval of %.2fms, was %.2fms\n", + 1e-6 * fence_ns, 1e-6 * result[0]); + igt_assert_f(2 * iqr[0] < result[0], + "VIP frame IQR %.2fms exceeded median threshold %.2fms\n", + 1e-6 * iqr[0], + 1e-6 * result[0] / 2); + if (!--nchild) + continue; + + /* Exclude the VIP result from the plebeian statistics */ + memmove(result, result + 1, nchild * sizeof(*result)); + memmove(iqr, iqr + 1, nchild * sizeof(*iqr)); + } + + igt_mean_init(&m); + for (int child = 0; child < nchild; child++) + igt_mean_add(&m, result[child]); + + qsort(result, nchild, sizeof(*result), cmp_ul); + qsort(iqr, nchild, sizeof(*iqr), cmp_ul); + + /* + * The target interval for median/mean is 16ms (fence_ns). + * However, this work is evenly split across the clients so + * the range (and median) of client medians may be much less + * than 16ms [16/3N]. We present median of medians to try + * and avoid any instability while running in CI; at the cost + * of insensitivity! + */ + igt_info("%3d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f [%.1f, %.1f], mean: %.1f ± %.2f ms, cpu: %.0f%%\n", + nchild, + 1e-6 * result[0], 1e-6 * result[nchild - 1], + 1e-6 * result[lo], 1e-6 * result[hi], + 1e-6 * result[nchild / 2], + 1e-6 * iqr[lo], 1e-6 * iqr[hi], + 1e-6 * igt_mean_get(&m), + 1e-6 * sqrt(igt_mean_get_variance(&m)), + 100.
* cpu_time / d_time); + + igt_assert_f(iqr[nchild / 2] < 2 * result[nchild / 2], + "Child frame IQR %.2fms exceeded median threshold %.2fms\n", + 1e-6 * iqr[nchild / 2], + 1e-6 * result[nchild / 2] * 2); + + igt_assert_f(4 * igt_mean_get(&m) > 3 * result[nchild / 2] && + 3 * igt_mean_get(&m) < 4 * result[nchild / 2], + "Mean of client interval %.2fms differs from median %.2fms, distribution is skewed\n", + + 1e-6 * igt_mean_get(&m), 1e-6 * result[nchild / 2]); + + igt_assert_f(result[nchild / 2] > frame_ns / 2, + "Median client interval %.2fms did not match target interval %.2fms\n", + 1e-6 * result[nchild / 2], 1e-6 * frame_ns); + + + igt_assert_f(result[hi] - result[lo] < result[nchild / 2], + "Interquartile range of client intervals %.2fms is as large as the median threshold %.2fms, clients are not evenly distributed!\n", + 1e-6 * (result[hi] - result[lo]), + 1e-6 * result[nchild / 2]); + + /* May be slowed due to sheer volume of context switches */ + if (result[0] > 2 * fence_ns) + break; + } + + munmap(iqr, 4096); + munmap(result, 4096); + if (common) + gem_close(i915, common); +} + +static void test_fairness(int i915, int timeout) +{ + static const struct { + const char *name; + unsigned int flags; + } fair[] = { + /* + * none - maximal greed in each client + * + * Push as many frames from each client as fast as possible + */ + { "none", 0 }, + { "none-vip", F_VIP }, /* one vip client must meet deadlines */ + { "none-solo", F_SOLO }, /* 1 batch per frame per client */ + { "none-share", F_SHARE }, /* read from a common buffer */ + { "none-rrul", F_RRUL }, /* "realtime-response under load" */ + { "none-ping", F_PING }, /* measure inter-engine fairness */ + + /* + * throttle - original per client throttling + * + * Used for front buffering rendering where there is no + * external frame marker. Each client tries to only keep + * 20ms of work submitted, though that measurement is + * flawed...
+ * + * This is used by Xorg to try and maintain some semblance + * of input/output consistency when being fed a continuous + * stream of X11 draw requests straight into scanout, where + * the clients may submit the work faster than can be drawn. + * + * Throttling tracks requests per-file (and assumes that + * all requests are in submission order across the whole file), + * so we split each child to its own fd. + */ + { "throttle", F_THROTTLE | F_ISOLATE }, + { "throttle-vip", F_THROTTLE | F_ISOLATE | F_VIP }, + { "throttle-solo", F_THROTTLE | F_ISOLATE | F_SOLO }, + { "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE }, + { "throttle-rrul", F_THROTTLE | F_ISOLATE | F_RRUL }, + + /* + * pace - mesa "submit double buffering" + * + * Submit a frame, wait for previous frame to start. This + * prevents each client from getting too far ahead of its + * rendering, maintaining a consistent input/output latency. + */ + { "pace", F_PACE }, + { "pace-solo", F_PACE | F_SOLO }, + { "pace-share", F_PACE | F_SOLO | F_SHARE }, + { "pace-ping", F_PACE | F_SOLO | F_SHARE | F_PING}, + + /* sync - only submit a frame at a time */ + { "sync", F_SYNC }, + { "sync-vip", F_SYNC | F_VIP }, + { "sync-solo", F_SYNC | F_SOLO }, + + /* flow - synchronise execution against the clock (vblank) */ + { "flow", F_PACE | F_FLOW }, + { "flow-solo", F_PACE | F_FLOW | F_SOLO }, + { "flow-share", F_PACE | F_FLOW | F_SHARE }, + { "flow-ping", F_PACE | F_FLOW | F_SHARE | F_PING }, + + /* next - submit ahead of the clock (vblank double buffering) */ + { "next", F_PACE | F_FLOW | F_NEXT }, + { "next-solo", F_PACE | F_FLOW | F_NEXT | F_SOLO }, + { "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE }, + { "next-ping", F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING }, + + /* spare - underutilise by a single client timeslice */ + { "spare", F_PACE | F_FLOW | F_SPARE }, + { "spare-solo", F_PACE | F_FLOW | F_SPARE | F_SOLO }, + + /* half - run at half pace (submit 16ms of work every 32ms) */ + { "half", F_PACE |
F_FLOW | F_HALF }, + { "half-solo", F_PACE | F_FLOW | F_HALF | F_SOLO }, + + {} + }; + + igt_fixture { + igt_info("CS timestamp frequency: %d\n", + read_timestamp_frequency(i915)); + + igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8); + } + + for (typeof(*fair) *f = fair; f->name; f++) { + igt_subtest_with_dynamic_f("fair-%s", f->name) { + const struct intel_execution_engine2 *e; + + __for_each_physical_engine(i915, e) { + if (!gem_class_can_store_dword(i915, e->class)) + continue; + + igt_dynamic_f("%s", e->name) + fairness(i915, e, timeout, f->flags); + } + } + } +} + +static uint32_t read_ctx_timestamp(int i915, + uint32_t ctx, + const struct intel_execution_engine2 *e) +{ + const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8; + const uint32_t base = gem_engine_mmio_base(i915, e->name); + struct drm_i915_gem_relocation_entry reloc; + struct drm_i915_gem_exec_object2 obj = { + .handle = gem_create(i915, 4096), + .offset = 32 << 20, + .relocs_ptr = to_user_pointer(&reloc), + .relocation_count = 1, + }; + struct drm_i915_gem_execbuffer2 execbuf = { + .buffers_ptr = to_user_pointer(&obj), + .buffer_count = 1, + .flags = e->flags, + .rsvd1 = ctx, + }; +#define RUNTIME (base + 0x3a8) + uint32_t *map, *cs; + uint32_t ts; + + igt_require(base); + + cs = map = gem_mmap__device_coherent(i915, obj.handle, + 0, 4096, PROT_WRITE); + + *cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */ + *cs++ = RUNTIME; + memset(&reloc, 0, sizeof(reloc)); + reloc.target_handle = obj.handle; + reloc.presumed_offset = obj.offset; + reloc.offset = offset_in_page(cs); + reloc.delta = 4000; + *cs++ = obj.offset + 4000; + *cs++ = obj.offset >> 32; + + *cs++ = MI_BATCH_BUFFER_END; + + gem_execbuf(i915, &execbuf); + gem_sync(i915, obj.handle); + gem_close(i915, obj.handle); + + ts = map[1000]; + munmap(map, 4096); + + return ts; +} + +static void fairslice(int i915, + const struct intel_execution_engine2 *e, + unsigned long flags) +{ + igt_spin_t *spin = NULL; + uint32_t ctx[3]; + 
uint32_t ts[3]; + + for (int i = 0; i < ARRAY_SIZE(ctx); i++) { + ctx[i] = gem_context_clone_with_engines(i915, 0); + if (spin == NULL) { + spin = __igt_spin_new(i915, + .ctx = ctx[i], + .engine = e->flags, + .flags = flags); + } else { + struct drm_i915_gem_execbuffer2 eb = { + .buffer_count = 1, + .buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]), + .flags = e->flags, + .rsvd1 = ctx[i], + }; + gem_execbuf(i915, &eb); + } + } + + sleep(2); /* over the course of many timeslices */ + + igt_assert(gem_bo_busy(i915, spin->handle)); + igt_spin_end(spin); + for (int i = 0; i < ARRAY_SIZE(ctx); i++) + ts[i] = read_ctx_timestamp(i915, ctx[i], e); + + for (int i = 0; i < ARRAY_SIZE(ctx); i++) + gem_context_destroy(i915, ctx[i]); + igt_spin_free(i915, spin); + + qsort(ts, 3, sizeof(*ts), cmp_u32); + igt_info("%s: [%.1f, %.1f, %.1f] ms\n", e->name, + 1e-6 * ticks_to_ns(i915, ts[0]), + 1e-6 * ticks_to_ns(i915, ts[1]), + 1e-6 * ticks_to_ns(i915, ts[2])); + + igt_assert_f(ts[2], "CTX_TIMESTAMP not reported!\n"); + igt_assert_f((ts[2] - ts[0]) * 6 < ts[1], + "Range of timeslices greater than tolerable: %.2fms > %.2fms; unfair!\n", + 1e-6 * ticks_to_ns(i915, ts[2] - ts[0]), + 1e-6 * ticks_to_ns(i915, ts[1]) / 6); +} + #define test_each_engine(T, i915, e) \ igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \ igt_dynamic_f("%s", e->name) @@ -2582,6 +3509,35 @@ igt_main test_each_engine("lateslice", fd, e) lateslice(fd, e->flags); + igt_subtest_group { + igt_fixture { + igt_require(gem_scheduler_has_semaphores(fd)); + igt_require(gem_scheduler_has_preemption(fd)); + igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8); + } + + test_each_engine("fairslice", fd, e) + fairslice(fd, e, 0); + + test_each_engine("u-fairslice", fd, e) + fairslice(fd, e, IGT_SPIN_USERPTR); + + igt_subtest("fairslice-all") { + __for_each_physical_engine(fd, e) { + igt_fork(child, 1) + fairslice(fd, e, 0); + } + igt_waitchildren(); + } + igt_subtest("u-fairslice-all") { + 
__for_each_physical_engine(fd, e) { + igt_fork(child, 1) + fairslice(fd, e, IGT_SPIN_USERPTR); + } + igt_waitchildren(); + } + } + test_each_engine("submit-early-slice", fd, e) submit_slice(fd, e, EARLY_SUBMIT); test_each_engine("submit-golden-slice", fd, e) @@ -2610,6 +3566,10 @@ igt_main test_each_engine_store("promotion", fd, e) promotion(fd, e->flags); + igt_subtest_group { + test_fairness(fd, 2); + } + igt_subtest_group { igt_fixture { igt_require(gem_scheduler_has_preemption(fd)); -- 2.29.2 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx