In order to simplify hangcheck state keeping, split the per-engine
hangcheck loop into three phases: state load, action, state save.
Add a few more hangcheck actions to distinguish between seqno, head
and subunit movements. This helps to gather all the hangcheck actions
under a single switch umbrella.

Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_gpu_error.c   |   8 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 241 ++++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   4 +-
 3 files changed, 146 insertions(+), 107 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
                 return "idle";
         case HANGCHECK_WAIT:
                 return "wait";
-        case HANGCHECK_ACTIVE:
-                return "active";
+        case HANGCHECK_ACTIVE_SEQNO:
+                return "active seqno";
+        case HANGCHECK_ACTIVE_HEAD:
+                return "active head";
+        case HANGCHECK_ACTIVE_SUBUNITS:
+                return "active subunits";
         case HANGCHECK_KICK:
                 return "kick";
         case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
                 memset(&engine->hangcheck.instdone, 0,
                        sizeof(engine->hangcheck.instdone));
 
-                return HANGCHECK_ACTIVE;
+                return HANGCHECK_ACTIVE_HEAD;
         }
 
         if (!subunits_stuck(engine))
-                return HANGCHECK_ACTIVE;
+                return HANGCHECK_ACTIVE_SUBUNITS;
 
         return HANGCHECK_HUNG;
 }
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
         return HANGCHECK_HUNG;
 }
 
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+                                  struct intel_engine_hangcheck *hc)
+{
+        /* We don't strictly need an irq-barrier here, as we are not
+         * serving an interrupt request, be paranoid in case the
+         * barrier has side-effects (such as preventing a broken
+         * cacheline snoop) and so be sure that we can see the seqno
+         * advance. If the seqno should stick, due to a stale
+         * cacheline, we would erroneously declare the GPU hung.
+         */
+        if (engine->irq_seqno_barrier)
+                engine->irq_seqno_barrier(engine);
+
+        hc->acthd = intel_engine_get_active_head(engine);
+        hc->seqno = intel_engine_get_seqno(engine);
+        hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+                                   const struct intel_engine_hangcheck *hc)
+{
+        engine->hangcheck.acthd = hc->acthd;
+        engine->hangcheck.seqno = hc->seqno;
+        engine->hangcheck.score = hc->score;
+        engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+                     const struct intel_engine_hangcheck *hc)
+{
+        if (engine->hangcheck.seqno != hc->seqno)
+                return HANGCHECK_ACTIVE_SEQNO;
+
+        if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+                return HANGCHECK_IDLE;
+
+        return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+                                        struct intel_engine_hangcheck *hc)
+{
+        hc->action = hangcheck_get_action(engine, hc);
+
+        switch (hc->action) {
+        case HANGCHECK_IDLE:
+        case HANGCHECK_WAIT:
+                break;
+
+        case HANGCHECK_ACTIVE_HEAD:
+        case HANGCHECK_ACTIVE_SUBUNITS:
+                /* We always increment the hangcheck score
+                 * if the engine is busy and still processing
+                 * the same request, so that no single request
+                 * can run indefinitely (such as a chain of
+                 * batches). The only time we do not increment
+                 * the hangcheck score on this ring, if this
+                 * engine is in a legitimate wait for another
+                 * engine. In that case the waiting engine is a
+                 * victim and we want to be sure we catch the
+                 * right culprit. Then every time we do kick
+                 * the ring, add a small increment to the
+                 * score so that we can catch a batch that is
+                 * being repeatedly kicked and so responsible
+                 * for stalling the machine.
+                 */
+                hc->score += 1;
+                break;
+
+        case HANGCHECK_KICK:
+                hc->score += 5;
+                break;
+
+        case HANGCHECK_HUNG:
+                hc->score += 20;
+                break;
+
+        case HANGCHECK_ACTIVE_SEQNO:
+                /* Gradually reduce the count so that we catch DoS
+                 * attempts across multiple batches.
+                 */
+                if (hc->score > 0)
+                        hc->score -= 15;
+                if (hc->score < 0)
+                        hc->score = 0;
+
+                /* Clear head and subunit states on seqno movement */
+                hc->acthd = 0;
+
+                memset(&engine->hangcheck.instdone, 0,
+                       sizeof(engine->hangcheck.instdone));
+                break;
+
+        default:
+                MISSING_CASE(hc->action);
+        }
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+                                   unsigned int hung,
+                                   unsigned int stuck)
+{
+        struct intel_engine_cs *engine;
+        char msg[80];
+        unsigned int tmp;
+        int len;
+
+        /* If some rings hung but others were still busy, only
+         * blame the hanging rings in the synopsis.
+         */
+        if (stuck != hung)
+                hung &= ~stuck;
+        len = scnprintf(msg, sizeof(msg),
+                        "%s on ", stuck == hung ? "No progress" : "Hang");
+        for_each_engine_masked(engine, i915, hung, tmp)
+                len += scnprintf(msg + len, sizeof(msg) - len,
+                                 "%s, ", engine->name);
+        msg[len-2] = '\0';
+
+        return i915_handle_error(i915, hung, msg);
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
         enum intel_engine_id id;
         unsigned int hung = 0, stuck = 0;
         int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
 
         if (!i915.enable_hangcheck)
                 return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
         intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
         for_each_engine(engine, dev_priv, id) {
-                bool busy = intel_engine_has_waiter(engine);
-                u64 acthd;
-                u32 seqno;
-                u32 submit;
+                struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+                const bool busy = intel_engine_has_waiter(engine);
 
                 semaphore_clear_deadlocks(dev_priv);
 
-                /* We don't strictly need an irq-barrier here, as we are not
-                 * serving an interrupt request, be paranoid in case the
-                 * barrier has side-effects (such as preventing a broken
-                 * cacheline snoop) and so be sure that we can see the seqno
-                 * advance. If the seqno should stick, due to a stale
-                 * cacheline, we would erroneously declare the GPU hung.
-                 */
-                if (engine->irq_seqno_barrier)
-                        engine->irq_seqno_barrier(engine);
-
-                acthd = intel_engine_get_active_head(engine);
-                seqno = intel_engine_get_seqno(engine);
-                submit = intel_engine_last_submit(engine);
-
-                if (engine->hangcheck.seqno == seqno) {
-                        if (i915_seqno_passed(seqno, submit)) {
-                                engine->hangcheck.action = HANGCHECK_IDLE;
-                        } else {
-                                /* We always increment the hangcheck score
-                                 * if the engine is busy and still processing
-                                 * the same request, so that no single request
-                                 * can run indefinitely (such as a chain of
-                                 * batches). The only time we do not increment
-                                 * the hangcheck score on this ring, if this
-                                 * engine is in a legitimate wait for another
-                                 * engine. In that case the waiting engine is a
-                                 * victim and we want to be sure we catch the
-                                 * right culprit. Then every time we do kick
-                                 * the ring, add a small increment to the
-                                 * score so that we can catch a batch that is
-                                 * being repeatedly kicked and so responsible
-                                 * for stalling the machine.
-                                 */
-                                engine->hangcheck.action =
-                                        engine_stuck(engine, acthd);
-
-                                switch (engine->hangcheck.action) {
-                                case HANGCHECK_IDLE:
-                                case HANGCHECK_WAIT:
-                                        break;
-                                case HANGCHECK_ACTIVE:
-                                        engine->hangcheck.score += BUSY;
-                                        break;
-                                case HANGCHECK_KICK:
-                                        engine->hangcheck.score += KICK;
-                                        break;
-                                case HANGCHECK_HUNG:
-                                        engine->hangcheck.score += HUNG;
-                                        break;
-                                }
-                        }
-
-                        if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
-                                hung |= intel_engine_flag(engine);
-                                if (engine->hangcheck.action != HANGCHECK_HUNG)
-                                        stuck |= intel_engine_flag(engine);
-                        }
-                } else {
-                        engine->hangcheck.action = HANGCHECK_ACTIVE;
-
-                        /* Gradually reduce the count so that we catch DoS
-                         * attempts across multiple batches.
-                         */
-                        if (engine->hangcheck.score > 0)
-                                engine->hangcheck.score -= ACTIVE_DECAY;
-                        if (engine->hangcheck.score < 0)
-                                engine->hangcheck.score = 0;
-
-                        /* Clear head and subunit states on seqno movement */
-                        acthd = 0;
-
-                        memset(&engine->hangcheck.instdone, 0,
-                               sizeof(engine->hangcheck.instdone));
+                hangcheck_load_sample(engine, hc);
+                hangcheck_accumulate_sample(engine, hc);
+                hangcheck_store_sample(engine, hc);
+
+                if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+                        hung |= intel_engine_flag(engine);
+                        if (hc->action != HANGCHECK_HUNG)
+                                stuck |= intel_engine_flag(engine);
                 }
 
-                engine->hangcheck.seqno = seqno;
-                engine->hangcheck.acthd = acthd;
                 busy_count += busy;
         }
 
-        if (hung) {
-                char msg[80];
-                unsigned int tmp;
-                int len;
-
-                /* If some rings hung but others were still busy, only
-                 * blame the hanging rings in the synopsis.
-                 */
-                if (stuck != hung)
-                        hung &= ~stuck;
-                len = scnprintf(msg, sizeof(msg),
-                                "%s on ", stuck == hung ? "No progress" : "Hang");
-                for_each_engine_masked(engine, dev_priv, hung, tmp)
-                        len += scnprintf(msg + len, sizeof(msg) - len,
-                                         "%s, ", engine->name);
-                msg[len-2] = '\0';
-
-                return i915_handle_error(dev_priv, hung, msg);
-        }
+        if (hung)
+                hangcheck_declare_hang(dev_priv, hung, stuck);
 
         /* Reset timer in case GPU hangs without another request being added */
         if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
 enum intel_engine_hangcheck_action {
         HANGCHECK_IDLE = 0,
         HANGCHECK_WAIT,
-        HANGCHECK_ACTIVE,
+        HANGCHECK_ACTIVE_SEQNO,
+        HANGCHECK_ACTIVE_HEAD,
+        HANGCHECK_ACTIVE_SUBUNITS,
         HANGCHECK_KICK,
         HANGCHECK_HUNG,
 };
-- 
2.7.4

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx