On Mon, Dec 13, 2021 at 03:29:10PM -0800, John.C.Harrison@xxxxxxxxx wrote: > From: John Harrison <John.C.Harrison@xxxxxxxxx> > > Added a an extra step to the i915_hangman tests to check that the > system is still alive after the hang and recovery. This submits a > simple batch to each engine which does a write to memory and checks > that the write occurred. > > Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx> > --- > tests/i915/i915_hangman.c | 115 ++++++++++++++++++++++++++++++++++++++ > 1 file changed, 115 insertions(+) > > diff --git a/tests/i915/i915_hangman.c b/tests/i915/i915_hangman.c > index b77705206..20653b479 100644 > --- a/tests/i915/i915_hangman.c > +++ b/tests/i915/i915_hangman.c > @@ -47,8 +47,113 @@ > static int device = -1; > static int sysfs = -1; > > +#define OFFSET_ALIVE 10 > + > IGT_TEST_DESCRIPTION("Tests for hang detection and recovery"); > > +/* Requires master for STORE_DWORD on gen4/5 */ > +static void store(int fd, const struct intel_execution_engine2 *e, > + int fence, uint32_t target, unsigned offset_value) > +{ > + const int SCRATCH = 0; > + const int BATCH = 1; > + const int gen = intel_gen(intel_get_drm_devid(fd)); > + struct drm_i915_gem_exec_object2 obj[2]; > + struct drm_i915_gem_relocation_entry reloc; > + struct drm_i915_gem_execbuffer2 execbuf; > + uint32_t batch[16]; > + int i; > + > + memset(&execbuf, 0, sizeof(execbuf)); > + execbuf.buffers_ptr = to_user_pointer(obj); > + execbuf.buffer_count = ARRAY_SIZE(obj); > + execbuf.flags = e->flags; > + if (fence != -1) { > + execbuf.flags |= I915_EXEC_FENCE_IN; > + execbuf.rsvd2 = fence; > + } > + if (gen < 6) > + execbuf.flags |= I915_EXEC_SECURE; > + > + memset(obj, 0, sizeof(obj)); > + obj[SCRATCH].handle = target; > + > + obj[BATCH].handle = gem_create(fd, 4096); > + obj[BATCH].relocs_ptr = to_user_pointer(&reloc); > + obj[BATCH].relocation_count = 1; > + memset(&reloc, 0, sizeof(reloc)); > + > + i = 0; > + reloc.target_handle = obj[SCRATCH].handle; > + reloc.presumed_offset = -1; > + reloc.offset = sizeof(uint32_t) * (i + 1); > + reloc.delta = sizeof(uint32_t) * offset_value; > + reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION; > + reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION; > + batch[i] = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0); > + if (gen >= 8) { > + batch[++i] = reloc.delta; > + batch[++i] = 0; > + } else if (gen >= 4) { > + batch[++i] = 0; > + batch[++i] = reloc.delta; > + reloc.offset += sizeof(uint32_t); > + } else { > + batch[i]--; > + batch[++i] = reloc.delta; > + } > + batch[++i] = offset_value; > + batch[++i] = MI_BATCH_BUFFER_END; > + gem_write(fd, obj[BATCH].handle, 0, batch, sizeof(batch)); > + gem_execbuf(fd, &execbuf); > + gem_close(fd, obj[BATCH].handle); > +} > + > +static void check_alive(void) > +{ > + const struct intel_execution_engine2 *engine; > + const intel_ctx_t *ctx; > + uint32_t scratch, *out; > + int fd, i = 0; > + uint64_t ahnd; > + > + fd = drm_open_driver(DRIVER_INTEL); > + igt_require(gem_class_can_store_dword(fd, 0)); > + > + ctx = intel_ctx_create_all_physical(fd); > + ahnd = get_reloc_ahnd(fd, ctx->id); > + scratch = gem_create(fd, 4096); > + out = gem_mmap__wc(fd, scratch, 0, 4096, PROT_WRITE); > + gem_set_domain(fd, scratch, > + I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT); > + > + for_each_physical_engine(fd, engine) { > + igt_assert_eq_u32(out[i + OFFSET_ALIVE], 0); > + i++; > + } > + > + i = 0; > + for_each_ctx_engine(fd, ctx, engine) { > + if (!gem_class_can_store_dword(fd, engine->class)) > + continue; > + > + /* +OFFSET_ALIVE to ensure engine zero doesn't get a false negative */ > + store(fd, engine, -1, scratch, i + OFFSET_ALIVE); You need to pass ctx + ahnd to store() to add softpin path. Relocs won't work above Tigerlake. -- Zbigniew > + i++; > + } > + > + gem_set_domain(fd, scratch, I915_GEM_DOMAIN_GTT, 0); > + > + while (i--) > + igt_assert_eq_u32(out[i + OFFSET_ALIVE], i + OFFSET_ALIVE); > + > + munmap(out, 4096); > + gem_close(fd, scratch); > + put_ahnd(ahnd); > + intel_ctx_destroy(fd, ctx); > + close(fd); > +} > + > static bool has_error_state(int dir) > { > bool result; > @@ -230,6 +335,8 @@ static void test_error_state_capture(const intel_ctx_t *ctx, > check_error_state(e->name, offset, batch); > munmap(batch, 4096); > put_ahnd(ahnd); > + > + check_alive(); > } > > static void > @@ -288,6 +395,8 @@ test_engine_hang(const intel_ctx_t *ctx, > put_ahnd(ahndN); > } > put_ahnd(ahnd); > + > + check_alive(); > } > > static int hang_count; > @@ -320,6 +429,8 @@ static void test_hang_detector(const intel_ctx_t *ctx, > > /* Did it work? */ > igt_assert(hang_count == 1); > + > + check_alive(); > } > > /* This test covers the case where we end up in an uninitialised area of the > @@ -355,6 +466,8 @@ static void hangcheck_unterminated(const intel_ctx_t *ctx) > igt_force_gpu_reset(device); > igt_assert_f(0, "unterminated batch did not trigger a hang!"); > } > + > + check_alive(); > } > > static void do_tests(const char *name, const char *prefix, > @@ -432,6 +545,8 @@ igt_main > igt_assert(sysfs != -1); > > igt_require(has_error_state(sysfs)); > + > + gem_require_mmap_wc(device); > } > > igt_describe("Basic error capture"); > -- > 2.25.1 >