Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > Quoting Mika Kuoppala (2018-08-10 15:00:36) >> If engine reports that it is not ready for reset, we >> give up. Evidence shows that forcing a per engine reset >> on an engine which is not reporting to be ready for reset, >> can bring it back into a working order. There is risk that >> we corrupt the context image currently executing on that >> engine. But that is a risk worth taking as if we unblock >> the engine, we prevent a whole device wedging in a case >> of full gpu reset. >> >> Reset individual engine even if it reports that it is not >> prepared for reset, but only if we aim for full gpu reset >> and not on first reset attempt. >> >> v2: force reset only on later attempts, readability (Chris) >> >> Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> >> Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> >> --- >> drivers/gpu/drm/i915/intel_uncore.c | 49 +++++++++++++++++++++++------ >> 1 file changed, 39 insertions(+), 10 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c >> index 027d14574bfa..d24026839b17 100644 >> --- a/drivers/gpu/drm/i915/intel_uncore.c >> +++ b/drivers/gpu/drm/i915/intel_uncore.c >> @@ -2099,9 +2099,6 @@ static int gen8_reset_engine_start(struct intel_engine_cs *engine) >> RESET_CTL_READY_TO_RESET, >> 700, 0, >> NULL); >> - if (ret) >> - DRM_ERROR("%s: reset request timeout\n", engine->name); >> - >> return ret; >> } >> >> @@ -2113,6 +2110,42 @@ static void gen8_reset_engine_cancel(struct intel_engine_cs *engine) >> _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); >> } >> >> +static int reset_engines(struct drm_i915_private *i915, >> + unsigned int engine_mask, >> + unsigned int retry) >> +{ >> + if (INTEL_GEN(i915) >= 11) >> + return gen11_reset_engines(i915, engine_mask); >> + else >> + return gen6_reset_engines(i915, engine_mask, retry); >> +} >> + >> +static int gen8_prepare_engine_for_reset(struct intel_engine_cs *engine, >> + unsigned int retry) >> +{ >> + const bool force_reset_on_non_ready = retry >= 1; >> + int ret; >> + >> + ret = gen8_reset_engine_start(engine); >> + >> + if (ret && force_reset_on_non_ready) { >> + /* >> + * Try to unblock a single non-ready engine by risking >> + * context corruption. >> + */ >> + ret = reset_engines(engine->i915, >> + intel_engine_flag(engine), >> + retry); >> + if (!ret) >> + ret = gen8_reset_engine_start(engine); >> + >> + DRM_ERROR("%s: reset request timeout, forcing reset (%d)\n", >> + engine->name, ret); > > This looks dubious now ;) > > After the force you then do a reset in the caller. Twice the reset for > twice the unpreparedness. It is intentional. First we make the engine unstuck and then do a full cycle with ready to reset involved. I don't know if it really matters tho. It could be that the engine is already in pristine condition after first. I considered the engine reset here to only be the hammer, and then the reset afterwards to be the validation. -Mika > >> + } >> + >> + return ret; >> +} >> + >> static int gen8_reset_engines(struct drm_i915_private *dev_priv, >> unsigned int engine_mask, >> unsigned int retry) >> @@ -2122,16 +2155,12 @@ static int gen8_reset_engines(struct drm_i915_private *dev_priv, >> int ret; >> >> for_each_engine_masked(engine, dev_priv, engine_mask, tmp) { >> - if (gen8_reset_engine_start(engine)) { >> - ret = -EIO; >> + ret = gen8_prepare_engine_for_reset(engine, retry); >> + if (ret) > > if (ret && !force_reset_unread) > >> goto not_ready; >> - } >> } >> >> - if (INTEL_GEN(dev_priv) >= 11) >> - ret = gen11_reset_engines(dev_priv, engine_mask); >> - else >> - ret = gen6_reset_engines(dev_priv, engine_mask, retry); >> + ret = reset_engines(dev_priv, engine_mask, retry); >> >> not_ready: >> for_each_engine_masked(engine, dev_priv, engine_mask, tmp) >> -- >> 2.17.1 >> _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx