Re: [PATCH i-g-t 8/8] tests/i915/gem_exec_capture: Update to support GuC based resets

Matthew Brost <matthew.brost@xxxxxxxxx> · Thu, 28 Oct 2021 19:54:28 -0700



On Thu, Oct 21, 2021 at 04:40:44PM -0700, John.C.Harrison@xxxxxxxxx wrote:
> From: John Harrison <John.C.Harrison@xxxxxxxxx>
> 
> When GuC submission is enabled, GuC itself manages hang detection and
> recovery. Therefore, any test that relies on being able to trigger an
> engine reset in the driver will fail. Full GT resets can still be
> triggered by the driver. However, in that situation detecting the
> specific context that caused a hang is not possible as the driver has
> no information about what is actually running on the hardware at any
> given time. Plus of course, there was no context that caused the hang
> because the hang was triggered manually, so it's basically a bogus
> mechanism in the first place!
> 
> Update the capture test to cause a reset via a the hangcheck mechanism
> by submitting a hanging batch and waiting. That way it is guaranteed to
> be testing the correct reset code paths for the current platform,
> whether that is GuC enabled or not.
> 
> Signed-off-by: John Harrison <John.C.Harrison@xxxxxxxxx>

Reviewed-by: Matthew Brost <matthew.brost@xxxxxxxxx>

> ---
>  tests/i915/gem_exec_capture.c | 65 ++++++++++++++++++++++++++++-------
>  1 file changed, 53 insertions(+), 12 deletions(-)
> 
> diff --git a/tests/i915/gem_exec_capture.c b/tests/i915/gem_exec_capture.c
> index 8997125ee..dda6e6a8f 100644
> --- a/tests/i915/gem_exec_capture.c
> +++ b/tests/i915/gem_exec_capture.c
> @@ -23,6 +23,7 @@
>  
>  #include <sys/poll.h>
>  #include <zlib.h>
> +#include <sched.h>
>  
>  #include "i915/gem.h"
>  #include "i915/gem_create.h"
> @@ -31,6 +32,8 @@
>  #include "igt_rand.h"
>  #include "igt_sysfs.h"
>  
> +#define MAX_RESET_TIME	600
> +
>  IGT_TEST_DESCRIPTION("Check that we capture the user specified objects on a hang");
>  
>  struct offset {
> @@ -213,7 +216,29 @@ static void configure_hangs(int fd, const struct intel_execution_engine2 *e, int
>  	gem_engine_property_printf(fd, e->name, "heartbeat_interval_ms", "%d", 500);
>  
>  	/* Allow engine based resets and disable banning */
> -	igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE);
> +	igt_allow_hang(fd, ctxt_id, HANG_ALLOW_CAPTURE | HANG_WANT_ENGINE_RESET);
> +}
> +
> +static bool fence_busy(int fence)
> +{
> +	return poll(&(struct pollfd){fence, POLLIN}, 1, 0) == 0;
> +}
> +
> +static void wait_to_die(int fence_out)
> +{
> +	struct timeval before, after, delta;
> +
> +	/* Wait for a reset to occur */
> +	gettimeofday(&before, NULL);
> +	while (fence_busy(fence_out)) {
> +		gettimeofday(&after, NULL);
> +		timersub(&after, &before, &delta);
> +		igt_assert(delta.tv_sec < MAX_RESET_TIME);
> +		sched_yield();
> +	}
> +	gettimeofday(&after, NULL);
> +	timersub(&after, &before, &delta);
> +	igt_info("Target died after %ld.%06lds\n", delta.tv_sec, delta.tv_usec);
>  }
>  
>  static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
> @@ -230,7 +255,7 @@ static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	struct drm_i915_gem_execbuffer2 execbuf;
>  	uint32_t *batch, *seqno;
>  	struct offset offset;
> -	int i;
> +	int i, fence_out;
>  
>  	configure_hangs(fd, e, ctx->id);
>  
> @@ -315,18 +340,25 @@ static void __capture1(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	execbuf.flags = e->flags;
>  	if (gen > 3 && gen < 6)
>  		execbuf.flags |= I915_EXEC_SECURE;
> +	execbuf.flags |= I915_EXEC_FENCE_OUT;
>  	execbuf.rsvd1 = ctx->id;
> +	execbuf.rsvd2 = ~0UL;
>  
>  	igt_assert(!READ_ONCE(*seqno));
> -	gem_execbuf(fd, &execbuf);
> +	gem_execbuf_wr(fd, &execbuf);
> +
> +	fence_out = execbuf.rsvd2 >> 32;
> +	igt_assert(fence_out >= 0);
>  
>  	/* Wait for the request to start */
>  	while (READ_ONCE(*seqno) != 0xc0ffee)
>  		igt_assert(gem_bo_busy(fd, obj[SCRATCH].handle));
>  	munmap(seqno, 4096);
>  
> +	/* Wait for a reset to occur */
> +	wait_to_die(fence_out);
> +
>  	/* Check that only the buffer we marked is reported in the error */
> -	igt_force_gpu_reset(fd);
>  	memset(&offset, 0, sizeof(offset));
>  	offset.addr = obj[CAPTURE].offset;
>  	igt_assert_eq(check_error_state(dir, &offset, 1, target_size, false), 1);
> @@ -373,7 +405,8 @@ static int cmp(const void *A, const void *B)
>  static struct offset *
>  __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	   const struct intel_execution_engine2 *e,
> -	   unsigned int size, int count, unsigned int flags)
> +	   unsigned int size, int count,
> +	   unsigned int flags, int *_fence_out)
>  #define INCREMENTAL 0x1
>  #define ASYNC 0x2
>  {
> @@ -383,7 +416,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	struct drm_i915_gem_execbuffer2 execbuf;
>  	uint32_t *batch, *seqno;
>  	struct offset *offsets;
> -	int i;
> +	int i, fence_out;
>  
>  	configure_hangs(fd, e, ctx->id);
>  
> @@ -491,10 +524,17 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	execbuf.flags = e->flags;
>  	if (gen > 3 && gen < 6)
>  		execbuf.flags |= I915_EXEC_SECURE;
> +	execbuf.flags |= I915_EXEC_FENCE_OUT;
>  	execbuf.rsvd1 = ctx->id;
> +	execbuf.rsvd2 = ~0UL;
>  
>  	igt_assert(!READ_ONCE(*seqno));
> -	gem_execbuf(fd, &execbuf);
> +	gem_execbuf_wr(fd, &execbuf);
> +
> +	fence_out = execbuf.rsvd2 >> 32;
> +	igt_assert(fence_out >= 0);
> +	if (_fence_out)
> +		*_fence_out = fence_out;
>  
>  	/* Wait for the request to start */
>  	while (READ_ONCE(*seqno) != 0xc0ffee)
> @@ -502,7 +542,7 @@ __captureN(int fd, int dir, uint64_t ahnd, const intel_ctx_t *ctx,
>  	munmap(seqno, 4096);
>  
>  	if (!(flags & ASYNC)) {
> -		igt_force_gpu_reset(fd);
> +		wait_to_die(fence_out);
>  		gem_sync(fd, obj[count + 1].handle);
>  	}
>  
> @@ -549,7 +589,7 @@ static void many(int fd, int dir, uint64_t size, unsigned int flags)
>  	intel_require_memory(count, size, CHECK_RAM);
>  	ahnd = get_reloc_ahnd(fd, ctx->id);
>  
> -	offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags);
> +	offsets = __captureN(fd, dir, ahnd, ctx, e, size, count, flags, NULL);
>  
>  	blobs = check_error_state(dir, offsets, count, size, !!(flags & INCREMENTAL));
>  	igt_info("Captured %lu %"PRId64"-blobs out of a total of %lu\n",
> @@ -602,6 +642,7 @@ static void prioinv(int fd, int dir, const intel_ctx_t *ctx,
>  	igt_assert(pipe(link) == 0);
>  	igt_fork(child, 1) {
>  		const intel_ctx_t *ctx2;
> +		int fence_out;
>  		fd = gem_reopen_driver(fd);
>  		igt_debug("Submitting large capture [%ld x %dMiB objects]\n",
>  			  count, (int)(size >> 20));
> @@ -613,11 +654,11 @@ static void prioinv(int fd, int dir, const intel_ctx_t *ctx,
>  		/* Reopen the allocator in the new process. */
>  		ahnd = get_reloc_ahnd(fd, ctx2->id);
>  
> -		free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC));
> +		free(__captureN(fd, dir, ahnd, ctx2, e, size, count, ASYNC, &fence_out));
>  		put_ahnd(ahnd);
>  
>  		write(link[1], &fd, sizeof(fd)); /* wake the parent up */
> -		igt_force_gpu_reset(fd);
> +		wait_to_die(fence_out);
>  		write(link[1], &fd, sizeof(fd)); /* wake the parent up */
>  	}
>  	read(link[0], &dummy, sizeof(dummy));
> @@ -714,7 +755,7 @@ igt_main
>  		gem_require_mmap_wc(fd);
>  		igt_require(has_capture(fd));
>  		ctx = intel_ctx_create_all_physical(fd);
> -		igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE);
> +		igt_allow_hang(fd, ctx->id, HANG_ALLOW_CAPTURE | HANG_WANT_ENGINE_RESET);
>  
>  		dir = igt_sysfs_open(fd);
>  		igt_require(igt_sysfs_set(dir, "error", "Begone!"));
> -- 
> 2.25.1
>