On Fri, Aug 22, 2014 at 08:39:11PM +0100, Arun Siluvery wrote: > For BDW workarounds are currently initialized in init_clock_gating() but > they are lost during reset, suspend/resume etc; this patch moves the WAs > that are part of register state context to render ring init fn otherwise > default context ends up with incorrect values as they don't get initialized > until init_clock_gating fn. > > v2: Add workarounds to golden render state > This method has its own issues, first of all this is different for > each gen and it is generated using a tool so adding new workaround > and mainitaining them across gens is not a straightforward process. > > v3: Use LRIs to emit these workarounds (Ville) > Instead of modifying the golden render state the same LRIs are > emitted from within the driver. > > For: VIZ-4092 > Signed-off-by: Arun Siluvery <arun.siluvery@xxxxxxxxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_gem_context.c | 6 +++ > drivers/gpu/drm/i915/intel_pm.c | 48 ---------------------- > drivers/gpu/drm/i915/intel_ringbuffer.c | 70 +++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/intel_ringbuffer.h | 7 ++++ > 4 files changed, 83 insertions(+), 48 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c > index 9683e62..2debce4 100644 > --- a/drivers/gpu/drm/i915/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/i915_gem_context.c > @@ -631,20 +631,26 @@ static int do_switch(struct intel_engine_cs *ring, > } > > uninitialized = !to->legacy_hw_ctx.initialized && from == NULL; > to->legacy_hw_ctx.initialized = true; > > done: > i915_gem_context_reference(to); > ring->last_context = to; > > if (uninitialized) { > + if (IS_BROADWELL(ring->dev)) { > + ret = bdw_init_workarounds(ring); > + if (ret) > + DRM_ERROR("init workarounds: %d\n", ret); > + } > + > ret = i915_gem_render_state_init(ring); > if (ret) > DRM_ERROR("init render state: %d\n", ret); > } > > return 0; > > unpin_out: > if (ring->id == RCS) > i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state); > diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c > index c8f744c..668acd9 100644 > --- a/drivers/gpu/drm/i915/intel_pm.c > +++ b/drivers/gpu/drm/i915/intel_pm.c > @@ -5507,101 +5507,53 @@ static void gen8_init_clock_gating(struct drm_device *dev) > struct drm_i915_private *dev_priv = dev->dev_private; > enum pipe pipe; > > I915_WRITE(WM3_LP_ILK, 0); > I915_WRITE(WM2_LP_ILK, 0); > I915_WRITE(WM1_LP_ILK, 0); > > /* FIXME(BDW): Check all the w/a, some might only apply to > * pre-production hw. */ > > - /* WaDisablePartialInstShootdown:bdw */ > - I915_WRITE(GEN8_ROW_CHICKEN, > - _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE)); > - > - /* WaDisableThreadStallDopClockGating:bdw */ > - /* FIXME: Unclear whether we really need this on production bdw. */ > - I915_WRITE(GEN8_ROW_CHICKEN, > - _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); > > - /* > - * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for > - * pre-production hardware > - */ > - I915_WRITE(HALF_SLICE_CHICKEN3, > - _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS)); > - I915_WRITE(HALF_SLICE_CHICKEN3, > - _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS)); > I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE)); > > I915_WRITE(_3D_CHICKEN3, > _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2))); > > - I915_WRITE(COMMON_SLICE_CHICKEN2, > - _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE)); > - > - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, > - _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE)); > - > - /* WaDisableDopClockGating:bdw May not be needed for production */ > - I915_WRITE(GEN7_ROW_CHICKEN2, > - _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); > > /* WaSwitchSolVfFArbitrationPriority:bdw */ > I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL); > > /* WaPsrDPAMaskVBlankInSRD:bdw */ > I915_WRITE(CHICKEN_PAR1_1, > I915_READ(CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD); > > /* WaPsrDPRSUnmaskVBlankInSRD:bdw */ > for_each_pipe(pipe) { > I915_WRITE(CHICKEN_PIPESL_1(pipe), > I915_READ(CHICKEN_PIPESL_1(pipe)) | > BDW_DPRS_MASK_VBLANK_SRD); > } > > - /* Use Force Non-Coherent whenever executing a 3D context. This is a > - * workaround for for a possible hang in the unlikely event a TLB > - * invalidation occurs during a PSD flush. > - */ > - I915_WRITE(HDC_CHICKEN0, > - I915_READ(HDC_CHICKEN0) | > - _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT)); > - > /* WaVSRefCountFullforceMissDisable:bdw */ > /* WaDSRefCountFullforceMissDisable:bdw */ > I915_WRITE(GEN7_FF_THREAD_MODE, > I915_READ(GEN7_FF_THREAD_MODE) & > ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME)); > > - /* > - * BSpec recommends 8x4 when MSAA is used, > - * however in practice 16x4 seems fastest. > - * > - * Note that PS/WM thread counts depend on the WIZ hashing > - * disable bit, which we don't touch here, but it's good > - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). > - */ > - I915_WRITE(GEN7_GT_MODE, > - GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4); > - > I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL, > _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE)); > > /* WaDisableSDEUnitClockGating:bdw */ > I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) | > GEN8_SDEUNIT_CLOCK_GATE_DISABLE); > - > - /* Wa4x4STCOptimizationDisable:bdw */ > - I915_WRITE(CACHE_MODE_1, > - _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE)); > } > > static void haswell_init_clock_gating(struct drm_device *dev) > { > struct drm_i915_private *dev_priv = dev->dev_private; > > ilk_init_lp_watermarks(dev); > > /* L3 caching of data atomics doesn't work -- disable it. */ > I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index 13543f8..9e24073 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -643,20 +643,90 @@ intel_init_pipe_control(struct intel_engine_cs *ring) > return 0; > > err_unpin: > i915_gem_object_ggtt_unpin(ring->scratch.obj); > err_unref: > drm_gem_object_unreference(&ring->scratch.obj->base); > err: > return ret; > } > > +int bdw_init_workarounds(struct intel_engine_cs *ring) > +{ > + int ret; > + > + /* > + * workarounds applied in this fn are part of register state context, > + * they need to be re-initialized followed by gpu reset, suspend/resume, > + * module reload. > + */ > + > + /* > + * update the number of dwords required based on the > + * actual number of workarounds applied > + */ > + ret = intel_ring_begin(ring, 24); > + if (ret) > + return ret; > + > + /* WaDisablePartialInstShootdown:bdw */ > + /* WaDisableThreadStallDopClockGating:bdw */ > + /* FIXME: Unclear whether we really need this on production bdw. */ > + INTEL_RING_EMIT_WA(ring, GEN8_ROW_CHICKEN, > + _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE > + | STALL_DOP_GATING_DISABLE)); > + > + /* WaDisableDopClockGating:bdw May not be needed for production */ > + INTEL_RING_EMIT_WA(ring, GEN7_ROW_CHICKEN2, > + _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); > + > + /* > + * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for > + * pre-production hardware > + */ > + INTEL_RING_EMIT_WA(ring, HALF_SLICE_CHICKEN3, > + _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS > + | GEN8_SAMPLER_POWER_BYPASS_DIS)); > + > + INTEL_RING_EMIT_WA(ring, GEN7_HALF_SLICE_CHICKEN1, > + _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE)); > + > + INTEL_RING_EMIT_WA(ring, COMMON_SLICE_CHICKEN2, > + _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE)); > + > + /* Use Force Non-Coherent whenever executing a 3D context. This is a > + * workaround for for a possible hang in the unlikely event a TLB > + * invalidation occurs during a PSD flush. > + */ > + INTEL_RING_EMIT_WA(ring, HDC_CHICKEN0, > + _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT)); > + > + /* Wa4x4STCOptimizationDisable:bdw */ > + INTEL_RING_EMIT_WA(ring, CACHE_MODE_1, > + _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE)); > + > + /* > + * BSpec recommends 8x4 when MSAA is used, > + * however in practice 16x4 seems fastest. > + * > + * Note that PS/WM thread counts depend on the WIZ hashing > + * disable bit, which we don't touch here, but it's good > + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). > + */ > + INTEL_RING_EMIT_WA(ring, GEN7_GT_MODE, > + GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4); > + > + intel_ring_advance(ring); > + > + return 0; > +} > + > static int init_render_ring(struct intel_engine_cs *ring) > { > struct drm_device *dev = ring->dev; > struct drm_i915_private *dev_priv = dev->dev_private; > int ret = init_ring_common(ring); > if (ret) > return ret; > > /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */ > if (INTEL_INFO(dev)->gen >= 4 && INTEL_INFO(dev)->gen < 7) > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 9cbf7b0..77fe667 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -66,20 +66,26 @@ struct intel_hw_status_page { > break; \ > } \ > ring->semaphore.signal_ggtt[RCS] = GEN8_SIGNAL_OFFSET(ring, RCS); \ > ring->semaphore.signal_ggtt[VCS] = GEN8_SIGNAL_OFFSET(ring, VCS); \ > ring->semaphore.signal_ggtt[BCS] = GEN8_SIGNAL_OFFSET(ring, BCS); \ > ring->semaphore.signal_ggtt[VECS] = GEN8_SIGNAL_OFFSET(ring, VECS); \ > ring->semaphore.signal_ggtt[VCS2] = GEN8_SIGNAL_OFFSET(ring, VCS2); \ > ring->semaphore.signal_ggtt[ring->id] = MI_SEMAPHORE_SYNC_INVALID; \ > } while(0) > > +#define INTEL_RING_EMIT_WA(_ring, _wa_reg, _wa_val) ({ \ > + intel_ring_emit(_ring, MI_LOAD_REGISTER_IMM(1)); \ > + intel_ring_emit(_ring, _wa_reg); \ > + intel_ring_emit(_ring, _wa_val); \ > + }) do {} while (0) would suffice since the macro doesn't return anything. Not sure why it's a macro actually. A static function in intel_ringbuffer.c would do just as well. If you want to keep it a macro, the argument evaluations should have () around them. I was also going to say that you could just call it ring_emit_lri() or something, but I guess you plan to add the debugfs w/a stuff there which makes the current name more appropriate. I'll let others bikeshed the debugfs stuff since I have no real opinion on it. Apart from that the patch seems good to me. > + > enum intel_ring_hangcheck_action { > HANGCHECK_IDLE = 0, > HANGCHECK_WAIT, > HANGCHECK_ACTIVE, > HANGCHECK_ACTIVE_LOOP, > HANGCHECK_KICK, > HANGCHECK_HUNG, > }; > > #define HANGCHECK_SCORE_RING_HUNG 31 > @@ -411,20 +417,21 @@ int intel_ring_flush_all_caches(struct intel_engine_cs *ring); > int intel_ring_invalidate_all_caches(struct intel_engine_cs *ring); > > void intel_fini_pipe_control(struct intel_engine_cs *ring); > int intel_init_pipe_control(struct intel_engine_cs *ring); > > int intel_init_render_ring_buffer(struct drm_device *dev); > int intel_init_bsd_ring_buffer(struct drm_device *dev); > int intel_init_bsd2_ring_buffer(struct drm_device *dev); > int intel_init_blt_ring_buffer(struct drm_device *dev); > int intel_init_vebox_ring_buffer(struct drm_device *dev); > +int bdw_init_workarounds(struct intel_engine_cs *ring); > > u64 intel_ring_get_active_head(struct intel_engine_cs *ring); > void intel_ring_setup_status_page(struct intel_engine_cs *ring); > > static inline u32 intel_ring_get_tail(struct intel_ringbuffer *ringbuf) > { > return ringbuf->tail; > } > > static inline u32 intel_ring_get_seqno(struct intel_engine_cs *ring) > -- > 2.0.4 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > http://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Ville Syrjälä Intel OTC _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx