Currently, we skip error capture upon forced preemption. We apply forced
preemption when there is a higher priority request that should be
running but is being blocked, and we skip inline error capture so that
the preemption request is not further delayed by a user controlled
capture -- extending the denial of service.
However, preemption reset is also used for heartbeats and regular GPU
hangs. By skipping the error capture, we remove the ability to debug GPU
hangs.
In order to capture the error without delaying the preemption request
further, we can do an out-of-line capture by removing the guilty request
from the execution queue and scheduling a work to dump that request.
When removing a request, we need to remove the entire context and all
descendants from the execution queue, so that they do not jump past.
Closes: https://gitlab.freedesktop.org/drm/intel/issues/738
Fixes: 3a7a92aba8fb ("drm/i915/execlists: Force preemption")
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
drivers/gpu/drm/i915/gt/intel_lrc.c | 120 +++++++++++++++++++++++++++-
1 file changed, 118 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 43c19dc9c0c7..a84477df32bd 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2392,7 +2392,6 @@ static void __execlists_hold(struct i915_request *rq)
} while(rq);
}
-__maybe_unused
static void execlists_hold(struct intel_engine_cs *engine,
struct i915_request *rq)
{
@@ -2472,7 +2471,6 @@ static void __execlists_unhold(struct i915_request *rq)
} while(rq);
}
-__maybe_unused
static void execlists_unhold(struct intel_engine_cs *engine,
struct i915_request *rq)
{
@@ -2492,6 +2490,121 @@ static void execlists_unhold(struct intel_engine_cs *engine,
spin_unlock_irq(&engine->active.lock);
}
+struct execlists_capture {
+ struct work_struct work;
+ struct i915_request *rq;
+ struct i915_gpu_coredump *error;
+};
+
+static void execlists_capture_work(struct work_struct *work)
+{
+ struct execlists_capture *cap = container_of(work, typeof(*cap), work);
+ const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+ struct intel_engine_cs *engine = cap->rq->engine;
+ struct intel_gt_coredump *gt = cap->error->gt;
+ struct intel_engine_capture_vma *vma;
+
+ /* Compress all the objects attached to the request, slow! */
+ vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
+ if (vma) {
+ struct i915_vma_compress *compress =
+ i915_vma_capture_prepare(gt);
+
+ intel_engine_coredump_add_vma(gt->engine, vma, compress);
+ i915_vma_capture_finish(gt, compress);
+ }
+
+ gt->simulated = gt->engine->simulated;
+ cap->error->simulated = gt->simulated;
+
+ /* Publish the error state, and announce it to the world */
+ i915_error_state_store(cap->error);
+ i915_gpu_coredump_put(cap->error);
+
+ /* Return this request and all that depend upon it for signaling */
+ execlists_unhold(engine, cap->rq);
+
+ kfree(cap);
+}
+
+static struct i915_gpu_coredump *capture_regs(struct intel_engine_cs *engine)
+{
+ const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+ struct i915_gpu_coredump *e;
+
+ e = i915_gpu_coredump_alloc(engine->i915, gfp);
+ if (!e)
+ return NULL;
+
+ e->gt = intel_gt_coredump_alloc(engine->gt, gfp);
+ if (!e->gt)
+ goto err;
+
+ e->gt->engine = intel_engine_coredump_alloc(engine, gfp);
+ if (!e->gt->engine)
+ goto err_gt;
+
+ return e;
+
+err_gt:
+ kfree(e->gt);
+err:
+ kfree(e);
+ return NULL;
+}
+
+static void execlists_capture(struct intel_engine_cs *engine)
+{
+ struct execlists_capture *cap;
+
+ if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
+ return;
+
+ cap = kmalloc(sizeof(*cap), GFP_ATOMIC);
+ if (!cap)
+ return;
+
+ cap->rq = execlists_active(&engine->execlists);
+ GEM_BUG_ON(!cap->rq);
+
+ cap->rq = active_request(cap->rq->context->timeline, cap->rq);