Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Now that we have offline error capture and can reset an engine from > inside an atomic context while also preserving the GPU state for > post-mortem analysis, it is time to handle error interrupts thrown by > the command parser. You might want to add an advertisement here that this way the detection/recovery speed of cs errors is greatly improved. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 8 +- > drivers/gpu/drm/i915/gt/intel_engine_types.h | 10 ++ > drivers/gpu/drm/i915/gt/intel_gt.c | 5 + > drivers/gpu/drm/i915/gt/intel_gt_irq.c | 27 ++- > drivers/gpu/drm/i915/gt/intel_lrc.c | 57 +++++-- > drivers/gpu/drm/i915/gt/selftest_lrc.c | 166 +++++++++++++++++-- > drivers/gpu/drm/i915/i915_gpu_error.c | 2 + > drivers/gpu/drm/i915/i915_gpu_error.h | 1 + > drivers/gpu/drm/i915/i915_reg.h | 5 +- > 9 files changed, 246 insertions(+), 35 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 9b965d1f811d..39fe9a5b4820 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -1293,8 +1293,14 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, > } > > if (INTEL_GEN(dev_priv) >= 6) { > - drm_printf(m, "\tRING_IMR: %08x\n", > + drm_printf(m, "\tRING_IMR: 0x%08x\n", > ENGINE_READ(engine, RING_IMR)); > + drm_printf(m, "\tRING_ESR: 0x%08x\n", > + ENGINE_READ(engine, RING_ESR)); > + drm_printf(m, "\tRING_EMR: 0x%08x\n", > + ENGINE_READ(engine, RING_EMR)); > + drm_printf(m, "\tRING_EIR: 0x%08x\n", > + ENGINE_READ(engine, RING_EIR)); > } > > addr = intel_engine_get_active_head(engine); > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 92be41a6903c..abd1de3b83a8 100644 > --- 
a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -156,6 +156,16 @@ struct intel_engine_execlists { > */ > struct i915_priolist default_priolist; > > + /** > + * @error_interrupt: CS Master EIR > + * > + * The CS generates an interrupt when it detects an error. We capture > + * the first error interrupt, record the EIR and schedule the tasklet. > + * In the tasklet, we process the pending CS events to ensure we have > + * the guilty request, and then reset the engine. > + */ > + u32 error_interrupt; > + > /** > * @no_priolist: priority lists disabled > */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c > index 88b6c904607c..143268083135 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c > @@ -455,6 +455,11 @@ static int __engines_record_defaults(struct intel_gt *gt) > if (!rq) > continue; > > + if (rq->fence.error) { > + err = -EIO; > + goto out; > + } > + > GEM_BUG_ON(!test_bit(CONTEXT_ALLOC_BIT, &rq->context->flags)); > state = rq->context->state; > if (!state) > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c > index 7278b10e1a03..864efaf1eb37 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c > +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c > @@ -24,6 +24,21 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir) > { > bool tasklet = false; > > + if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) { > + u32 eir; > + > + eir = ENGINE_READ(engine, RING_EIR); > + ENGINE_TRACE(engine, "CS error: %x\n", eir); > + > + /* Disable the error interrupt until after the reset */ > + if (likely(eir)) { > + ENGINE_WRITE(engine, RING_EMR, ~0u); > + ENGINE_WRITE(engine, RING_EIR, eir); > + engine->execlists.error_interrupt = eir; WRITE_ONCE > + tasklet = true; > + } > + } > + > if (iir & GT_CONTEXT_SWITCH_INTERRUPT) > tasklet = true; > > @@ -210,7 +225,10 @@ void gen11_gt_irq_reset(struct intel_gt 
*gt) > > void gen11_gt_irq_postinstall(struct intel_gt *gt) > { > - const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT; > + const u32 irqs = > + GT_CS_MASTER_ERROR_INTERRUPT | > + GT_RENDER_USER_INTERRUPT | > + GT_CONTEXT_SWITCH_INTERRUPT; > struct intel_uncore *uncore = gt->uncore; > const u32 dmask = irqs << 16 | irqs; > const u32 smask = irqs << 16; > @@ -279,7 +297,7 @@ void gen6_gt_irq_handler(struct intel_gt *gt, u32 gt_iir) > > if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT | > GT_BSD_CS_ERROR_INTERRUPT | > - GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) > + GT_CS_MASTER_ERROR_INTERRUPT)) > DRM_DEBUG("Command parser error, gt_iir 0x%08x\n", gt_iir); > > if (gt_iir & GT_PARITY_ERROR(gt->i915)) > @@ -345,7 +363,10 @@ void gen8_gt_irq_reset(struct intel_gt *gt) > void gen8_gt_irq_postinstall(struct intel_gt *gt) > { > /* These are interrupts we'll toggle with the ring mask register */ > - const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT; > + const u32 irqs = > + GT_CS_MASTER_ERROR_INTERRUPT | > + GT_RENDER_USER_INTERRUPT | > + GT_CONTEXT_SWITCH_INTERRUPT; > const u32 gt_interrupts[] = { > irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT, > irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT, > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index cf6c43bd540a..1b6aab05fde1 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine) > if (!cap) > return true; > > + spin_lock_irq(&engine->active.lock); > cap->rq = execlists_active(&engine->execlists); > - GEM_BUG_ON(!cap->rq); > - > - rcu_read_lock(); > - cap->rq = active_request(cap->rq->context->timeline, cap->rq); > - cap->rq = i915_request_get_rcu(cap->rq); > - rcu_read_unlock(); > + if (cap->rq) { > + cap->rq = active_request(cap->rq->context->timeline, cap->rq); > + cap->rq = 
i915_request_get_rcu(cap->rq); > + } > + spin_unlock_irq(&engine->active.lock); > if (!cap->rq) > goto err_free; > > @@ -2658,27 +2658,25 @@ static bool execlists_capture(struct intel_engine_cs *engine) > return false; > } > > -static noinline void preempt_reset(struct intel_engine_cs *engine) > +static void execlists_reset(struct intel_engine_cs *engine, const char *msg) > { > const unsigned int bit = I915_RESET_ENGINE + engine->id; > unsigned long *lock = &engine->gt->reset.flags; > > - if (i915_modparams.reset < 3) > + if (!intel_has_reset_engine(engine->gt)) > return; > > if (test_and_set_bit(bit, lock)) > return; > > + ENGINE_TRACE(engine, "reset for %s\n", msg); > + > /* Mark this tasklet as disabled to avoid waiting for it to complete */ > tasklet_disable_nosync(&engine->execlists.tasklet); > > - ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", > - READ_ONCE(engine->props.preempt_timeout_ms), > - jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); > - > ring_set_paused(engine, 1); /* Freeze the current request in place */ > if (execlists_capture(engine)) > - intel_engine_reset(engine, "preemption time out"); > + intel_engine_reset(engine, msg); > else > ring_set_paused(engine, 0); > > @@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data) > bool timeout = preempt_timeout(engine); > > process_csb(engine); > + > + if (unlikely(engine->execlists.error_interrupt)) { READ_ONCE > + engine->execlists.error_interrupt = 0; The race looks benign as this is only for us. 
> + if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */ > + execlists_reset(engine, "CS error"); > + } > + > if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { > unsigned long flags; > > @@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data) > spin_unlock_irqrestore(&engine->active.lock, flags); > > /* Recheck after serialising with direct-submission */ > - if (timeout && preempt_timeout(engine)) > - preempt_reset(engine); > + if (unlikely(timeout && preempt_timeout(engine))) > + execlists_reset(engine, "preemption time out"); > } > } > > @@ -3343,6 +3348,25 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) > return ret; > } > > +static void enable_error_interrupt(struct intel_engine_cs *engine) > +{ > + u32 status; > + > + engine->execlists.error_interrupt = 0; > + ENGINE_WRITE(engine, RING_EMR, ~0u); > + ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ > + > + status = ENGINE_READ(engine, RING_ESR); > + if (unlikely(status)) { > + dev_err(engine->i915->drm.dev, > + "engine '%s' resumed still in error: %08x\n", > + engine->name, status); > + __intel_gt_reset(engine->gt, engine->mask); > + } > + > + ENGINE_WRITE(engine, RING_EMR, ~REG_BIT(0)); Please do define this bit. 
> +} > + > static void enable_execlists(struct intel_engine_cs *engine) > { > u32 mode; > @@ -3364,6 +3388,8 @@ static void enable_execlists(struct intel_engine_cs *engine) > i915_ggtt_offset(engine->status_page.vma)); > ENGINE_POSTING_READ(engine, RING_HWS_PGA); > > + enable_error_interrupt(engine); > + > engine->context_tag = 0; > } > > @@ -4290,6 +4316,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) > > engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; > engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; > + engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; > } > > static void rcs_submission_override(struct intel_engine_cs *engine) > diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c > index 65718ca2326e..84bac953d1b7 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c > +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c > @@ -68,6 +68,21 @@ static void engine_heartbeat_enable(struct intel_engine_cs *engine, > engine->props.heartbeat_interval_ms = saved; > } > > +static int wait_for_submit(struct intel_engine_cs *engine, > + struct i915_request *rq, > + unsigned long timeout) > +{ > + timeout += jiffies; > + do { > + cond_resched(); > + intel_engine_flush_submission(engine); > + if (i915_request_is_active(rq)) If this would not be just moving, I would ask what if the request whizzed by to inactivity between our sample points. > + return 0; > + } while (time_before(jiffies, timeout)); > + > + return -ETIME; > +} > + > static int live_sanitycheck(void *arg) > { > struct intel_gt *gt = arg; > @@ -386,6 +401,141 @@ static int live_hold_reset(void *arg) > return err; > } > > +static const char *error_repr(int err) > +{ > + return err ? 
"bad" : "good"; > +} > + > +static int live_error_interrupt(void *arg) > +{ > + static const struct error_phase { > + enum { GOOD = 0, BAD = -EIO } error[2]; > + } phases[] = { > + { { BAD, GOOD } }, > + { { BAD, BAD } }, > + { { BAD, GOOD } }, > + { { GOOD, GOOD } }, /* sentinel */ > + }; > + struct intel_gt *gt = arg; > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + /* > + * We hook up the CS_MASTER_ERROR_INTERRUPT to have forewarning > + * of invalid commands in user batches that will cause a GPU hang. > + * This is a faster mechanism than using hangcheck/heartbeats, but > + * only detects problems the HW knows about -- it will not warn when > + * we kill the HW! > + * > + * To verify our detection and reset, we throw some invalid commands > + * at the HW and wait for the interrupt. > + */ > + > + if (!intel_has_reset_engine(gt)) > + return 0; > + > + for_each_engine(engine, gt, id) { > + const struct error_phase *p; > + unsigned long heartbeat; > + > + engine_heartbeat_disable(engine, &heartbeat); > + > + for (p = phases; p->error[0] != GOOD; p++) { > + struct i915_request *client[ARRAY_SIZE(phases->error)]; > + int err = 0, i; > + u32 *cs; > + > + memset(client, 0, sizeof(*client)); > + for (i = 0; i < ARRAY_SIZE(client); i++) { > + struct intel_context *ce; > + struct i915_request *rq; > + > + ce = intel_context_create(engine); > + if (IS_ERR(ce)) { > + err = PTR_ERR(ce); > + goto out; > + } > + > + rq = intel_context_create_request(ce); > + intel_context_put(ce); > + if (IS_ERR(rq)) { > + err = PTR_ERR(rq); > + goto out; > + } > + > + if (rq->engine->emit_init_breadcrumb) { > + err = rq->engine->emit_init_breadcrumb(rq); > + if (err) { > + i915_request_add(rq); > + goto out; > + } > + } > + > + cs = intel_ring_begin(rq, 2); > + if (IS_ERR(cs)) { > + i915_request_add(rq); > + err = PTR_ERR(cs); > + goto out; > + } > + > + if (p->error[i]) { Was going to nag about enums but on second read, it reads just fine. 
Defining the REG(0) and some WRITE_ONCE/READ_ONCE salt added on top, Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > + *cs++ = 0xdeadbeef; > + *cs++ = 0xdeadbeef; > + } else { > + *cs++ = MI_NOOP; > + *cs++ = MI_NOOP; > + } > + > + client[i] = i915_request_get(rq); > + i915_request_add(rq); > + } > + > + err = wait_for_submit(engine, client[0], HZ / 2); > + if (err) { > + pr_err("%s: first request did not start within time!\n", > + engine->name); > + err = -ETIME; > + goto out; > + } > + > + for (i = 0; i < ARRAY_SIZE(client); i++) { > + if (i915_request_wait(client[i], 0, HZ / 5) < 0) { > + pr_err("%s: %s request still executing!\n", > + engine->name, > + error_repr(p->error[i])); > + err = -ETIME; > + goto out; > + } > + > + if (client[i]->fence.error != p->error[i]) { > + pr_err("%s: %s request completed with wrong error code: %d\n", > + engine->name, > + error_repr(p->error[i]), > + client[i]->fence.error); > + err = -EINVAL; > + goto out; > + } > + } > + > +out: > + for (i = 0; i < ARRAY_SIZE(client); i++) > + if (client[i]) > + i915_request_put(client[i]); > + if (err) { > + pr_err("%s: failed at phase[%zd] { %d, %d }\n", > + engine->name, p - phases, > + p->error[0], p->error[1]); > + intel_gt_set_wedged(gt); > + return err; > + } > + } > + > + engine_heartbeat_enable(engine, heartbeat); > + } > + > + return 0; > +} > + > static int > emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx) > { > @@ -628,21 +778,6 @@ static struct i915_request *nop_request(struct intel_engine_cs *engine) > return rq; > } > > -static int wait_for_submit(struct intel_engine_cs *engine, > - struct i915_request *rq, > - unsigned long timeout) > -{ > - timeout += jiffies; > - do { > - cond_resched(); > - intel_engine_flush_submission(engine); > - if (i915_request_is_active(rq)) > - return 0; > - } while (time_before(jiffies, timeout)); > - > - return -ETIME; > -} > - > static long timeslice_threshold(const struct intel_engine_cs *engine) > { > 
return 2 * msecs_to_jiffies_timeout(timeslice(engine)) + 1; > @@ -3572,6 +3707,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) > SUBTEST(live_unlite_switch), > SUBTEST(live_unlite_preempt), > SUBTEST(live_hold_reset), > + SUBTEST(live_error_interrupt), > SUBTEST(live_timeslice_preempt), > SUBTEST(live_timeslice_queue), > SUBTEST(live_busywait_preempt), > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 0f67bef83106..dcab4723b17d 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -515,6 +515,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, > (u32)(ee->acthd>>32), (u32)ee->acthd); > err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir); > err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr); > + err_printf(m, " ESR: 0x%08x\n", ee->esr); > > error_print_instdone(m, ee); > > @@ -1102,6 +1103,7 @@ static void engine_record_registers(struct intel_engine_coredump *ee) > } > > if (INTEL_GEN(i915) >= 4) { > + ee->esr = ENGINE_READ(engine, RING_ESR); > ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); > ee->ipeir = ENGINE_READ(engine, RING_IPEIR); > ee->ipehr = ENGINE_READ(engine, RING_IPEHR); > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h > index e4a6afed3bbf..b35bc9edd733 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.h > +++ b/drivers/gpu/drm/i915/i915_gpu_error.h > @@ -75,6 +75,7 @@ struct intel_engine_coredump { > u32 hws; > u32 ipeir; > u32 ipehr; > + u32 esr; > u32 bbstate; > u32 instpm; > u32 instps; > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index b93c4c18f05c..4c72b8ac0f2e 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -2639,6 +2639,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) > #define GEN11_MCR_SUBSLICE_MASK GEN11_MCR_SUBSLICE(0x7) > #define RING_IPEIR(base) _MMIO((base) + 0x64) > #define 
RING_IPEHR(base) _MMIO((base) + 0x68) > +#define RING_EIR(base) _MMIO((base) + 0xb0) > +#define RING_EMR(base) _MMIO((base) + 0xb4) > +#define RING_ESR(base) _MMIO((base) + 0xb8) > /* > * On GEN4, only the render ring INSTDONE exists and has a different > * layout than the GEN7+ version. > @@ -3088,7 +3091,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) > #define GT_CONTEXT_SWITCH_INTERRUPT (1 << 8) > #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT (1 << 5) /* !snb */ > #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT (1 << 4) > -#define GT_RENDER_CS_MASTER_ERROR_INTERRUPT (1 << 3) > +#define GT_CS_MASTER_ERROR_INTERRUPT REG_BIT(3) > #define GT_RENDER_SYNC_STATUS_INTERRUPT (1 << 2) > #define GT_RENDER_DEBUG_INTERRUPT (1 << 1) > #define GT_RENDER_USER_INTERRUPT (1 << 0) > -- > 2.25.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx