Hi Mika / Daniel, below is the basic code path of a reset which has been changed by my patch: i915_reset() { .... i915_gem_reset() -> This used to call i915_gem_context_reset(), which has now been removed. ..... i915_gem_init_hw() ..... i915_gem_context_enable() -> This used to return during reset. Now it doesn't ..... for each ring, i915_switch_context(default) do_switch(); ..... ..... } " I am with Daniel on this one. I don't understand how can we throw everything in here away." Did you maybe mean Ben? Daniel, I thought you were happy with the implementation, and V2 fixed your last concern, could you please comment. " We need to force hw to switch to a working context, after reset, so that our internal state tracking matches." I believe this patch does that using i915_switch_context, rather than the hacky i915_gem_context_reset() Alistair. > -----Original Message----- > From: Mika Kuoppala [mailto:mika.kuoppala@xxxxxxxxxxxxxxx] > Sent: Wednesday, August 06, 2014 5:25 PM > To: Mcaulay, Alistair; intel-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: Re: [PATCH v2] drm/i915: Rework GPU reset sequence to > match driver load & thaw > > > Hi, > > alistair.mcaulay@xxxxxxxxx writes: > > > From: "McAulay, Alistair" <alistair.mcaulay@xxxxxxxxx> > > > > This patch is to address Daniels concerns over different code during reset: > > > > http://lists.freedesktop.org/archives/intel-gfx/2014-June/047758.html > > > > "The reason for aiming as hard as possible to use the exact same code > > for driver load, gpu reset and runtime pm/system resume is that we've > > simply seen too many bugs due to slight variations and unintended > omissions." > > > > Tested using igt drv_hangman. > > > > V2: Cleaner way of preventing check_wedge returning -EAGAIN > > > > Signed-off-by: McAulay, Alistair <alistair.mcaulay@xxxxxxxxx> > > --- > > drivers/gpu/drm/i915/i915_drv.c | 6 +++ > > drivers/gpu/drm/i915/i915_drv.h | 3 ++ > > drivers/gpu/drm/i915/i915_gem.c | 6 +-- > > drivers/gpu/drm/i915/i915_gem_context.c | 42 +-------------------- > > drivers/gpu/drm/i915/i915_gem_gtt.c | 67 +++++---------------------------- > > drivers/gpu/drm/i915/i915_gem_gtt.h | 3 +- > > 6 files changed, 23 insertions(+), 104 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_drv.c > > b/drivers/gpu/drm/i915/i915_drv.c index 5e4fefd..3bfafe6 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.c > > +++ b/drivers/gpu/drm/i915/i915_drv.c > > @@ -806,7 +806,13 @@ int i915_reset(struct drm_device *dev) > > !dev_priv->ums.mm_suspended) { > > dev_priv->ums.mm_suspended = 0; > > > > + /* Used to prevent gem_check_wedged returning -EAGAIN > during gpu reset */ > > + dev_priv->gpu_error.reload_in_reset = true; > > + > > ret = i915_gem_init_hw(dev); > > + > > + dev_priv->gpu_error.reload_in_reset = false; > > + > > mutex_unlock(&dev->struct_mutex); > > if (ret) { > > DRM_ERROR("Failed hw init on reset %d\n", ret); diff > --git > > a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > > index 991b663..116daff 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.h > > +++ b/drivers/gpu/drm/i915/i915_drv.h > > @@ -1217,6 +1217,9 @@ struct i915_gpu_error { > > > > /* For missed irq/seqno simulation. */ > > unsigned int test_irq_rings; > > + > > + /* Used to prevent gem_check_wedged returning -EAGAIN during > gpu reset */ > > + bool reload_in_reset; > > }; > > > > enum modeset_restore { > > diff --git a/drivers/gpu/drm/i915/i915_gem.c > > b/drivers/gpu/drm/i915/i915_gem.c index ef047bc..14e1770 100644 > > --- a/drivers/gpu/drm/i915/i915_gem.c > > +++ b/drivers/gpu/drm/i915/i915_gem.c > > @@ -1085,7 +1085,9 @@ i915_gem_check_wedge(struct i915_gpu_error > *error, > > if (i915_terminally_wedged(error)) > > return -EIO; > > > > - return -EAGAIN; > > + /* Check if GPU Reset is in progress */ > > + if (!error->reload_in_reset) > > + return -EAGAIN; > > } > > > > return 0; > > @@ -2590,8 +2592,6 @@ void i915_gem_reset(struct drm_device *dev) > > for_each_ring(ring, dev_priv, i) > > i915_gem_reset_ring_cleanup(dev_priv, ring); > > > > - i915_gem_context_reset(dev); > > - > > i915_gem_restore_fences(dev); > > } > > > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c > > b/drivers/gpu/drm/i915/i915_gem_context.c > > index de72a28..d96219f 100644 > > --- a/drivers/gpu/drm/i915/i915_gem_context.c > > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > > @@ -372,42 +372,6 @@ err_destroy: > > return ERR_PTR(ret); > > } > > > > -void i915_gem_context_reset(struct drm_device *dev) -{ > > - struct drm_i915_private *dev_priv = dev->dev_private; > > - int i; > > - > > - /* Prevent the hardware from restoring the last context (which > hung) on > > - * the next switch */ > > - for (i = 0; i < I915_NUM_RINGS; i++) { > > - struct intel_engine_cs *ring = &dev_priv->ring[i]; > > - struct intel_context *dctx = ring->default_context; > > - struct intel_context *lctx = ring->last_context; > > - > > - /* Do a fake switch to the default context */ > > - if (lctx == dctx) > > - continue; > > - > > - if (!lctx) > > - continue; > > - > > - if (dctx->legacy_hw_ctx.rcs_state && i == RCS) { > > - WARN_ON(i915_gem_obj_ggtt_pin(dctx- > >legacy_hw_ctx.rcs_state, > > - > get_context_alignment(dev), 0)); > > - /* Fake a finish/inactive */ > > - dctx->legacy_hw_ctx.rcs_state->base.write_domain > = 0; > > - dctx->legacy_hw_ctx.rcs_state->active = 0; > > - } > > - > > - if (lctx->legacy_hw_ctx.rcs_state && i == RCS) > > - i915_gem_object_ggtt_unpin(lctx- > >legacy_hw_ctx.rcs_state); > > - > > - i915_gem_context_unreference(lctx); > > - i915_gem_context_reference(dctx); > > - ring->last_context = dctx; > > - } > > -} > > - > > I am with Daniel on this one. I don't understand how can we throw > everything in here away. > > We need to force hw to switch to a working context, after reset, so that our > internal state tracking matches. > > Further, if we aim to more unification I think we should make it so that the > initial render state will get run, also after reset. > > If we cleanup the last context for each ring set default context carefully, > i915_gem_context_enable() will then switch to default contexts and reinit > them using the initial render state. Something like this: > > void i915_gem_context_reset(struct drm_device *dev) { > struct drm_i915_private *dev_priv = dev->dev_private; > int i; > > for (i = 0; i < I915_NUM_RINGS; i++) { > struct intel_engine_cs *ring = &dev_priv->ring[i]; > struct intel_context *lctx = ring->last_context; > struct intel_context *dctx = ring->default_context; > > if (lctx) { > if (lctx->legacy_hw_ctx.rcs_state && i == RCS) > i915_gem_object_ggtt_unpin(lctx- > >legacy_hw_ctx.rcs_state); > > i915_gem_context_unreference(lctx); > ring->last_context = NULL; > } > > if (dctx->legacy_hw_ctx.rcs_state && i == RCS) { > dctx->legacy_hw_ctx.rcs_state->base.write_domain > = 0; > dctx->legacy_hw_ctx.rcs_state->active = 0; > dctx->legacy_hw_ctx.initialized = false; > } > } > } > > The state would be closer of what we get after module reload. > > -Mika > > > int i915_gem_context_init(struct drm_device *dev) { > > struct drm_i915_private *dev_priv = dev->dev_private; @@ -498,10 > > +462,6 @@ int i915_gem_context_enable(struct drm_i915_private > *dev_priv) > > ppgtt->enable(ppgtt); > > } > > > > - /* FIXME: We should make this work, even in reset */ > > - if (i915_reset_in_progress(&dev_priv->gpu_error)) > > - return 0; > > - > > BUG_ON(!dev_priv->ring[RCS].default_context); > > > > for_each_ring(ring, dev_priv, i) { > > @@ -645,7 +605,7 @@ static int do_switch(struct intel_engine_cs *ring, > > from = ring->last_context; > > > > if (USES_FULL_PPGTT(ring->dev)) { > > - ret = ppgtt->switch_mm(ppgtt, ring, false); > > + ret = ppgtt->switch_mm(ppgtt, ring); > > if (ret) > > goto unpin_out; > > } > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c > > b/drivers/gpu/drm/i915/i915_gem_gtt.c > > index 5188936..450c8a9 100644 > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c > > @@ -216,19 +216,12 @@ static gen6_gtt_pte_t > iris_pte_encode(dma_addr_t > > addr, > > > > /* Broadwell Page Directory Pointer Descriptors */ static int > > gen8_write_pdp(struct intel_engine_cs *ring, unsigned entry, > > - uint64_t val, bool synchronous) > > + uint64_t val) > > { > > - struct drm_i915_private *dev_priv = ring->dev->dev_private; > > int ret; > > > > BUG_ON(entry >= 4); > > > > - if (synchronous) { > > - I915_WRITE(GEN8_RING_PDP_UDW(ring, entry), val >> 32); > > - I915_WRITE(GEN8_RING_PDP_LDW(ring, entry), (u32)val); > > - return 0; > > - } > > - > > ret = intel_ring_begin(ring, 6); > > if (ret) > > return ret; > > @@ -245,8 +238,7 @@ static int gen8_write_pdp(struct intel_engine_cs > > *ring, unsigned entry, } > > > > static int gen8_mm_switch(struct i915_hw_ppgtt *ppgtt, > > - struct intel_engine_cs *ring, > > - bool synchronous) > > + struct intel_engine_cs *ring) > > { > > int i, ret; > > > > @@ -255,7 +247,7 @@ static int gen8_mm_switch(struct i915_hw_ppgtt > > *ppgtt, > > > > for (i = used_pd - 1; i >= 0; i--) { > > dma_addr_t addr = ppgtt->pd_dma_addr[i]; > > - ret = gen8_write_pdp(ring, i, addr, synchronous); > > + ret = gen8_write_pdp(ring, i, addr); > > if (ret) > > return ret; > > } > > @@ -724,29 +716,10 @@ static uint32_t get_pd_offset(struct > > i915_hw_ppgtt *ppgtt) } > > > > static int hsw_mm_switch(struct i915_hw_ppgtt *ppgtt, > > - struct intel_engine_cs *ring, > > - bool synchronous) > > + struct intel_engine_cs *ring) > > { > > - struct drm_device *dev = ppgtt->base.dev; > > - struct drm_i915_private *dev_priv = dev->dev_private; > > int ret; > > > > - /* If we're in reset, we can assume the GPU is sufficiently idle to > > - * manually frob these bits. Ideally we could use the ring functions, > > - * except our error handling makes it quite difficult (can't use > > - * intel_ring_begin, ring->flush, or intel_ring_advance) > > - * > > - * FIXME: We should try not to special case reset > > - */ > > - if (synchronous || > > - i915_reset_in_progress(&dev_priv->gpu_error)) { > > - WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt); > > - I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G); > > - I915_WRITE(RING_PP_DIR_BASE(ring), > get_pd_offset(ppgtt)); > > - POSTING_READ(RING_PP_DIR_BASE(ring)); > > - return 0; > > - } > > - > > /* NB: TLBs must be flushed and invalidated before a switch */ > > ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, > I915_GEM_GPU_DOMAINS); > > if (ret) > > @@ -768,29 +741,10 @@ static int hsw_mm_switch(struct i915_hw_ppgtt > > *ppgtt, } > > > > static int gen7_mm_switch(struct i915_hw_ppgtt *ppgtt, > > - struct intel_engine_cs *ring, > > - bool synchronous) > > + struct intel_engine_cs *ring) > > { > > - struct drm_device *dev = ppgtt->base.dev; > > - struct drm_i915_private *dev_priv = dev->dev_private; > > int ret; > > > > - /* If we're in reset, we can assume the GPU is sufficiently idle to > > - * manually frob these bits. Ideally we could use the ring functions, > > - * except our error handling makes it quite difficult (can't use > > - * intel_ring_begin, ring->flush, or intel_ring_advance) > > - * > > - * FIXME: We should try not to special case reset > > - */ > > - if (synchronous || > > - i915_reset_in_progress(&dev_priv->gpu_error)) { > > - WARN_ON(ppgtt != dev_priv->mm.aliasing_ppgtt); > > - I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G); > > - I915_WRITE(RING_PP_DIR_BASE(ring), > get_pd_offset(ppgtt)); > > - POSTING_READ(RING_PP_DIR_BASE(ring)); > > - return 0; > > - } > > - > > /* NB: TLBs must be flushed and invalidated before a switch */ > > ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, > I915_GEM_GPU_DOMAINS); > > if (ret) > > @@ -819,14 +773,11 @@ static int gen7_mm_switch(struct i915_hw_ppgtt > > *ppgtt, } > > > > static int gen6_mm_switch(struct i915_hw_ppgtt *ppgtt, > > - struct intel_engine_cs *ring, > > - bool synchronous) > > + struct intel_engine_cs *ring) > > { > > struct drm_device *dev = ppgtt->base.dev; > > struct drm_i915_private *dev_priv = dev->dev_private; > > > > - if (!synchronous) > > - return 0; > > > > I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G); > > I915_WRITE(RING_PP_DIR_BASE(ring), get_pd_offset(ppgtt)); @@ - > 852,7 > > +803,7 @@ static int gen8_ppgtt_enable(struct i915_hw_ppgtt *ppgtt) > > if (USES_FULL_PPGTT(dev)) > > continue; > > > > - ret = ppgtt->switch_mm(ppgtt, ring, true); > > + ret = ppgtt->switch_mm(ppgtt, ring); > > if (ret) > > goto err_out; > > } > > @@ -897,7 +848,7 @@ static int gen7_ppgtt_enable(struct i915_hw_ppgtt > *ppgtt) > > if (USES_FULL_PPGTT(dev)) > > continue; > > > > - ret = ppgtt->switch_mm(ppgtt, ring, true); > > + ret = ppgtt->switch_mm(ppgtt, ring); > > if (ret) > > return ret; > > } > > @@ -926,7 +877,7 @@ static int gen6_ppgtt_enable(struct i915_hw_ppgtt > *ppgtt) > > I915_WRITE(GFX_MODE, > _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE)); > > > > for_each_ring(ring, dev_priv, i) { > > - int ret = ppgtt->switch_mm(ppgtt, ring, true); > > + int ret = ppgtt->switch_mm(ppgtt, ring); > > if (ret) > > return ret; > > } > > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h > > b/drivers/gpu/drm/i915/i915_gem_gtt.h > > index 8d6f7c1..bf1e4fc 100644 > > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h > > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h > > @@ -262,8 +262,7 @@ struct i915_hw_ppgtt { > > > > int (*enable)(struct i915_hw_ppgtt *ppgtt); > > int (*switch_mm)(struct i915_hw_ppgtt *ppgtt, > > - struct intel_engine_cs *ring, > > - bool synchronous); > > + struct intel_engine_cs *ring); > > void (*debug_dump)(struct i915_hw_ppgtt *ppgtt, struct seq_file > *m); > > }; > > > > -- > > 2.0.0 > > > > _______________________________________________ > > Intel-gfx mailing list > > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx