From: Chris Wilson <chris.p.wilson@xxxxxxxxxxxxxxx> The kernel context was poisoned on wakeup to simulate how the driver would cope with bad HW that caused corruption of any context that was still resident during power loss; see commit 1d0e2c9359fe ("drm/i915/gt: Always poison the kernel_context image before unparking"). However, clearing the entire context image also poisons the ppHWSP (per-process hardware status page), which may contain pertinent information (such as the breadcrumb and context switches) that we want to preserve. We could restrict the poisoning to exclude the ppHWSP, or, more simply, recognise that we have poisoned the HW enough and can leave the verification to after suspend/resume. References: 1d0e2c9359fe ("drm/i915/gt: Always poison the kernel_context image before unparking") Signed-off-by: Chris Wilson <chris.p.wilson@xxxxxxxxxxxxxxx> Signed-off-by: Krzysztof Niemiec <krzysztof.niemiec@xxxxxxxxx> --- .../gpu/drm/i915/gt/intel_engine_heartbeat.c | 4 ++++ drivers/gpu/drm/i915/gt/intel_engine_pm.c | 24 ------------------- drivers/gpu/drm/i915/gt/intel_lrc.c | 12 ++++++---- 3 files changed, 12 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 8d4bb95f8424..7d69bc496283 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -127,6 +127,10 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq) */ intel_guc_find_hung_context(engine); + /* If the heartbeat failed to resume after reset, declare an emergency. 
*/ + if (xchg(&rq->fence.error, -ENODEV) == -ENODEV) + intel_gt_set_wedged(engine->gt); + intel_gt_handle_error(engine->gt, engine->mask, I915_ERROR_CAPTURE, "stopped heartbeat on %s", diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index fb7bff27b45a..a5c42b784168 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -59,35 +59,11 @@ static int __engine_unpark(struct intel_wakeref *wf) { struct intel_engine_cs *engine = container_of(wf, typeof(*engine), wakeref); - struct intel_context *ce; ENGINE_TRACE(engine, "\n"); engine->wakeref_track = intel_gt_pm_get(engine->gt); - /* Discard stale context state from across idling */ - ce = engine->kernel_context; - if (ce) { - GEM_BUG_ON(test_bit(CONTEXT_VALID_BIT, &ce->flags)); - - /* Flush all pending HW writes before we touch the context */ - while (unlikely(intel_context_inflight(ce))) - intel_engine_flush_submission(engine); - - /* First poison the image to verify we never fully trust it */ - dbg_poison_ce(ce); - - /* Scrub the context image after our loss of control */ - ce->ops->reset(ce); - - CE_TRACE(ce, "reset { seqno:%x, *hwsp:%x, ring:%x }\n", - ce->timeline->seqno, - READ_ONCE(*ce->timeline->hwsp_seqno), - ce->ring->emit); - GEM_BUG_ON(ce->timeline->seqno != - READ_ONCE(*ce->timeline->hwsp_seqno)); - } - if (engine->unpark) engine->unpark(engine); diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 7bd5d2c29056..f742707b703e 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1017,15 +1017,19 @@ void lrc_init_state(struct intel_context *ce, set_redzone(state, engine); + /* Clear the ppHWSP (inc. 
per-context counters) */ + if (!test_bit(CONTEXT_VALID_BIT, &ce->flags)) + memset(state, 0, LRC_STATE_OFFSET); + if (ce->default_state) { - shmem_read(ce->default_state, 0, state, engine->context_size); + shmem_read(ce->default_state, /* exclude ppHWSP */ + LRC_STATE_OFFSET, + state + LRC_STATE_OFFSET, + engine->context_size - LRC_STATE_OFFSET); __set_bit(CONTEXT_VALID_BIT, &ce->flags); inhibit = false; } - /* Clear the ppHWSP (inc. per-context counters) */ - memset(state, 0, PAGE_SIZE); - /* Clear the indirect wa and storage */ if (ce->wa_bb_page) memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE); -- 2.43.0