[PATCH 3/3] iris: Handle GPU recovery

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> · Mon, 25 Mar 2019 10:59:00 +0000

We want to opt out of the automatic GPU recovery and replay performed by
the kernel of a guilty context after a GPU reset, as our incremental
batch construction very often implies that subsequent batches after a GPU
reset are incomplete and will trigger fresh GPU hangs. As we are aware
of how we need to reset the context state, but the kernel isn't, tell
the kernel to cancel the inflight rendering and immediately report the
GPU hang, whereupon we reconstruct a fresh context for the next batch.
---
 src/gallium/drivers/iris/iris_batch.c   | 92 ++++++++++++++++++-------
 src/gallium/drivers/iris/iris_batch.h   | 13 ++++
 src/gallium/drivers/iris/iris_bufmgr.c  | 25 +++++++
 src/gallium/drivers/iris/iris_context.c | 24 +++++--
 src/gallium/drivers/iris/iris_context.h | 12 ++--
 5 files changed, 125 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index 40bcd939795..2edca3d558f 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -163,6 +163,7 @@ iris_init_batch(struct iris_batch *batch,
                 struct iris_screen *screen,
                 struct iris_vtable *vtbl,
                 struct pipe_debug_callback *dbg,
+                iris_init_context_fn init_context,
                 struct iris_batch *all_batches,
                 uint32_t hw_id,
                 enum iris_batch_name name)
@@ -171,6 +172,7 @@ iris_init_batch(struct iris_batch *batch,
    batch->vtbl = vtbl;
    batch->dbg = dbg;
    batch->name = name;
+   batch->init_context = init_context;
 
    batch->hw_ctx_id = hw_id;
    batch->engine = name;
@@ -212,6 +214,8 @@ iris_init_batch(struct iris_batch *batch,
    }
 
    iris_batch_reset(batch);
+
+   batch->init_context(batch->screen, batch, batch->vtbl, batch->dbg);
 }
 
 static struct drm_i915_gem_exec_object2 *
@@ -443,6 +447,44 @@ iris_finish_batch(struct iris_batch *batch)
       batch->primary_batch_size = iris_batch_bytes_used(batch);
 }
 
+/* Swap the batch's lost hw context for a fresh clone and re-run init_context
+ * on every batch that used it.  Returns 0, or -EIO if the clone fails. */
+static int
+iris_recreate_context(struct iris_batch *batch)
+{
+   struct drm_i915_gem_context_create_ext_clone clone = {
+      .base = { .name = I915_CONTEXT_CREATE_EXT_CLONE },
+      .clone_id = batch->hw_ctx_id,
+      .flags = ~I915_CONTEXT_CLONE_UNKNOWN,
+   };
+   struct drm_i915_gem_context_create_ext arg = {
+      .flags = I915_CONTEXT_CREATE_FLAGS_USE_EXTENSIONS,
+      .extensions = (uintptr_t)&clone,
+   };
+   if (drm_ioctl(batch->screen->fd,
+                 DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT,
+                 &arg))
+      return -EIO;
+
+   /* The destroy ioctl takes this struct (id + zeroed pad), not a bare id. */
+   struct drm_i915_gem_context_destroy old = { .ctx_id = batch->hw_ctx_id };
+
+   batch->hw_ctx_id = arg.ctx_id;
+   batch->init_context(batch->screen, batch, batch->vtbl, batch->dbg);
+
+   /* Move every other batch that shared the old context onto the new one. */
+   for (int b = 0; b < ARRAY_SIZE(batch->other_batches); b++) {
+      struct iris_batch *other = batch->other_batches[b];
+      if (other->hw_ctx_id != old.ctx_id)
+         continue;
+      other->hw_ctx_id = arg.ctx_id;
+      other->init_context(other->screen, other, other->vtbl, other->dbg);
+   }
+
+   drm_ioctl(batch->screen->fd, DRM_IOCTL_I915_GEM_CONTEXT_DESTROY, &old);
+
+   return 0;
+}
+
 /**
  * Submit the batch to the GPU via execbuffer2.
  */
@@ -483,17 +525,11 @@ submit_batch(struct iris_batch *batch)
          (uintptr_t)util_dynarray_begin(&batch->exec_fences);
    }
 
-   int ret = drm_ioctl(batch->screen->fd,
-                       DRM_IOCTL_I915_GEM_EXECBUFFER2,
-                       &execbuf);
-   if (ret != 0) {
+   int ret = 0;
+   if (drm_ioctl(batch->screen->fd,
+                 DRM_IOCTL_I915_GEM_EXECBUFFER2,
+                 &execbuf))
       ret = -errno;
-      DBG("execbuf FAILED: errno = %d\n", -ret);
-      fprintf(stderr, "execbuf FAILED: errno = %d\n", -ret);
-      abort();
-   } else {
-      DBG("execbuf succeeded\n");
-   }
 
    for (int i = 0; i < batch->exec_count; i++) {
       struct iris_bo *bo = batch->exec_bos[i];
@@ -561,6 +597,25 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
 
    int ret = submit_batch(batch);
 
+   batch->exec_count = 0;
+   batch->aperture_space = 0;
+
+   struct iris_syncpt *syncpt =
+      ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
+   iris_syncpt_reference(screen, &batch->last_syncpt, syncpt);
+
+   util_dynarray_foreach(&batch->syncpts, struct iris_syncpt *, s)
+      iris_syncpt_reference(screen, s, NULL);
+   util_dynarray_clear(&batch->syncpts);
+
+   util_dynarray_clear(&batch->exec_fences);
+
+   /* Start a new batch buffer. */
+   iris_batch_reset(batch);
+
+   if (ret == -EIO)
+      ret = iris_recreate_context(batch);
+
    if (ret >= 0) {
       //if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
          //iris_check_for_reset(ice);
@@ -574,25 +629,10 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
       const bool color = INTEL_DEBUG & DEBUG_COLOR;
       fprintf(stderr, "%siris: Failed to submit batchbuffer: %-80s%s\n",
               color ? "\e[1;41m" : "", strerror(-ret), color ? "\e[0m" : "");
-      abort();
 #endif
+      abort();
    }
 
-   batch->exec_count = 0;
-   batch->aperture_space = 0;
-
-   struct iris_syncpt *syncpt =
-      ((struct iris_syncpt **) util_dynarray_begin(&batch->syncpts))[0];
-   iris_syncpt_reference(screen, &batch->last_syncpt, syncpt);
-
-   util_dynarray_foreach(&batch->syncpts, struct iris_syncpt *, s)
-      iris_syncpt_reference(screen, s, NULL);
-   util_dynarray_clear(&batch->syncpts);
-
-   util_dynarray_clear(&batch->exec_fences);
-
-   /* Start a new batch buffer. */
-   iris_batch_reset(batch);
 }
 
 /**
diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h
index db1a4cbbe11..da94749fe20 100644
--- a/src/gallium/drivers/iris/iris_batch.h
+++ b/src/gallium/drivers/iris/iris_batch.h
@@ -41,6 +41,16 @@
 /* Our target batch size - flush approximately at this point. */
 #define BATCH_SZ (20 * 1024)
 
+struct iris_screen;
+struct iris_batch;
+struct iris_vtable;
+struct pipe_debug_callback;
+/* Context-initialisation callback kept per batch (iris_batch.init_context). */
+typedef void (*iris_init_context_fn)(struct iris_screen *screen,
+                                     struct iris_batch *batch,
+                                     struct iris_vtable *vtbl,
+                                     struct pipe_debug_callback *dbg);
+
 enum iris_batch_name {
    IRIS_BATCH_RENDER,
    IRIS_BATCH_COMPUTE,
@@ -124,12 +134,15 @@ struct iris_batch {
 
    /** Have we emitted any draw calls to this batch? */
    bool contains_draw;
+
+   iris_init_context_fn init_context;
 };
 
 void iris_init_batch(struct iris_batch *batch,
                      struct iris_screen *screen,
                      struct iris_vtable *vtbl,
                      struct pipe_debug_callback *dbg,
+                     iris_init_context_fn init_context,
                      struct iris_batch *all_batches,
                      uint32_t hw_id,
                      enum iris_batch_name name);
diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c
index e0d167913d2..4c198063869 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.c
+++ b/src/gallium/drivers/iris/iris_bufmgr.c
@@ -1548,6 +1548,29 @@ init_cache_buckets(struct iris_bufmgr *bufmgr)
    }
 }
 
+static void
+init_context(struct iris_bufmgr *bufmgr, uint32_t ctx_id)
+{
+   /*
+    * Upon declaring a GPU hang, the kernel will zap the guilty context
+    * back to the default logical HW state and attempt to continue on to
+    * our next submitted batchbuffer. However, we only send incremental
+    * logical state (e.g. we only ever set up invariant register state
+    * once in brw_initial_gpu_upload()) and so attempting to replay the
+    * next batchbuffer without the correct logical state can be fatal.
+    * Here we tell the kernel not to attempt to recover our context but
+    * immediately (on the next batchbuffer submission) report that the
+    * context is lost, and we will do the recovery ourselves -- 2 lost
+    * batches instead of a continual stream until we are banned, or the
+    * machine is dead.
+    */
+   struct drm_i915_gem_context_param p = {
+      .ctx_id = ctx_id,
+      .param = I915_CONTEXT_PARAM_RECOVERABLE,
+   }; /* .value is left zero-initialised, i.e. recoverable = 0 */
+   drm_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
+}
+
 uint32_t
 iris_create_hw_context(struct iris_bufmgr *bufmgr)
 {
@@ -1558,6 +1581,8 @@ iris_create_hw_context(struct iris_bufmgr *bufmgr)
       return 0;
    }
 
+   init_context(bufmgr, create.ctx_id);
+
    return create.ctx_id;
 }
 
diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c
index aeb608c70bd..9e24733def7 100644
--- a/src/gallium/drivers/iris/iris_context.c
+++ b/src/gallium/drivers/iris/iris_context.c
@@ -162,11 +162,21 @@ static void create_hw_contexts(struct iris_screen *screen,
          .size = sizeof(engines),
       },
    };
-   struct drm_i915_gem_context_create_ext_setparam p_prio = {
+   struct drm_i915_gem_context_create_ext_setparam p_recover = {
       .base = {
          .name =I915_CONTEXT_CREATE_EXT_SETPARAM,
          .next_extension = (uintptr_t)&p_engines,
       },
+      .param = {
+         .param = I915_CONTEXT_PARAM_RECOVERABLE,
+         .value = 0,
+      },
+   };
+   struct drm_i915_gem_context_create_ext_setparam p_prio = {
+      .base = {
+         .name =I915_CONTEXT_CREATE_EXT_SETPARAM,
+         .next_extension = (uintptr_t)&p_recover,
+      },
       .param = {
          .param = I915_CONTEXT_PARAM_PRIORITY,
          .value = priority,
@@ -262,16 +272,16 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
    if (flags & PIPE_CONTEXT_LOW_PRIORITY)
       priority = GEN_CONTEXT_LOW_PRIORITY;
 
+   iris_init_context_fn initfn[] = {
+      [IRIS_BATCH_RENDER]  = ice->vtbl.init_render_context,
+      [IRIS_BATCH_COMPUTE] = ice->vtbl.init_compute_context,
+   };
    create_hw_contexts(screen, hw_id, priority);
    for (int i = 0; i < IRIS_BATCH_COUNT; i++) {
-      iris_init_batch(&ice->batches[i], screen, &ice->vtbl, &ice->dbg,
+      iris_init_batch(&ice->batches[i], screen,
+                      &ice->vtbl, &ice->dbg, initfn[i],
                       ice->batches, hw_id[i], (enum iris_batch_name) i);
    }
 
-   ice->vtbl.init_render_context(screen, &ice->batches[IRIS_BATCH_RENDER],
-                                 &ice->vtbl, &ice->dbg);
-   ice->vtbl.init_compute_context(screen, &ice->batches[IRIS_BATCH_COMPUTE],
-                                  &ice->vtbl, &ice->dbg);
-
    return ctx;
 }
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 494c931d0f0..d64d391e98a 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -346,14 +346,10 @@ struct iris_stream_output_target {
  */
 struct iris_vtable {
    void (*destroy_state)(struct iris_context *ice);
-   void (*init_render_context)(struct iris_screen *screen,
-                               struct iris_batch *batch,
-                               struct iris_vtable *vtbl,
-                               struct pipe_debug_callback *dbg);
-   void (*init_compute_context)(struct iris_screen *screen,
-                                struct iris_batch *batch,
-                                struct iris_vtable *vtbl,
-                                struct pipe_debug_callback *dbg);
+
+   iris_init_context_fn init_render_context;
+   iris_init_context_fn init_compute_context;
+
    void (*upload_render_state)(struct iris_context *ice,
                                struct iris_batch *batch,
                                const struct pipe_draw_info *draw);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx