On Tue, Aug 28, 2018 at 06:04:29PM +0100, Chris Wilson wrote: > During stress testing of full-ppgtt (on Baytrail at least), we found > that the invalidation around a context/mm switch was insufficient (writes > would go astray). Adding a second MI_FLUSH_DW barrier prevents this, but > it is unclear as to whether this is merely a delaying tactic or if it is > truly serialising with the TLB invalidation. Either way, it is > empirically required. > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107715 > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > Cc: Matthew Auld <matthew.william.auld@xxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/intel_ringbuffer.c | 101 +++++++++++------------- > 1 file changed, 47 insertions(+), 54 deletions(-) > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > index d40f55a8dc34..952b6269bab0 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > @@ -1944,40 +1944,62 @@ static void gen6_bsd_submit_request(struct i915_request *request) > intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); > } > > -static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) > +static int emit_mi_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) > { > u32 cmd, *cs; > > - cs = intel_ring_begin(rq, 4); > - if (IS_ERR(cs)) > - return PTR_ERR(cs); > + do { > + cs = intel_ring_begin(rq, 4); > + if (IS_ERR(cs)) > + return PTR_ERR(cs); > > - cmd = MI_FLUSH_DW; > + cmd = MI_FLUSH_DW; > > - /* We always require a command barrier so that subsequent > - * commands, such as breadcrumb interrupts, are strictly ordered > - * wrt the contents of the write cache being flushed to memory > - * (and thus being coherent from the CPU). > - */ > - cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; > + /* > + * We always require a command barrier so that subsequent > + * commands, such as breadcrumb interrupts, are strictly ordered > + * wrt the contents of the write cache being flushed to memory > + * (and thus being coherent from the CPU). > + */ > + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; > > - /* > - * Bspec vol 1c.5 - video engine command streamer: > - * "If ENABLED, all TLBs will be invalidated once the flush > - * operation is complete. This bit is only valid when the > - * Post-Sync Operation field is a value of 1h or 3h." > - */ > - if (mode & EMIT_INVALIDATE) > - cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD; > + /* > + * Bspec vol 1c.3 - blitter engine command streamer: > + * "If ENABLED, all TLBs will be invalidated once the flush > + * operation is complete. This bit is only valid when the > + * Post-Sync Operation field is a value of 1h or 3h." > + */ > + if (mode & EMIT_INVALIDATE) > + cmd |= invflags; > + *cs++ = cmd; > + *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; > + *cs++ = 0; > + *cs++ = MI_NOOP; > + intel_ring_advance(rq, cs); > + > + /* > + * Not only do we need a full barrier (post-sync write) after > + * invalidating the TLBs, but we need to wait a little bit > + * longer. Whether this is merely delaying us, or the > + * subsequent flush is a key part of serialising with the > + * post-sync op, this extra pass appears vital before a > + * mm switch! > + */ > + if (!(mode & EMIT_INVALIDATE)) > + break; > + > + mode &= ~EMIT_INVALIDATE; > + } while (1); I find the loop thingy somewhat hard to read. I'd probably have written it as something like { if (mode & EMIT_INVALIDATE) mi_flush(INVALIDATE_TLB); mi_flush(0); } Either way it seems to do what it says so Reviewed-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > > - *cs++ = cmd; > - *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; > - *cs++ = 0; > - *cs++ = MI_NOOP; > - intel_ring_advance(rq, cs); > return 0; > } > > +static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) > +{ > + return emit_mi_flush_dw(rq, mode, > + MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); > +} > + > static int > hsw_emit_bb_start(struct i915_request *rq, > u64 offset, u32 len, > @@ -2022,36 +2044,7 @@ gen6_emit_bb_start(struct i915_request *rq, > > static int gen6_ring_flush(struct i915_request *rq, u32 mode) > { > - u32 cmd, *cs; > - > - cs = intel_ring_begin(rq, 4); > - if (IS_ERR(cs)) > - return PTR_ERR(cs); > - > - cmd = MI_FLUSH_DW; > - > - /* We always require a command barrier so that subsequent > - * commands, such as breadcrumb interrupts, are strictly ordered > - * wrt the contents of the write cache being flushed to memory > - * (and thus being coherent from the CPU). > - */ > - cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; > - > - /* > - * Bspec vol 1c.3 - blitter engine command streamer: > - * "If ENABLED, all TLBs will be invalidated once the flush > - * operation is complete. This bit is only valid when the > - * Post-Sync Operation field is a value of 1h or 3h." > - */ > - if (mode & EMIT_INVALIDATE) > - cmd |= MI_INVALIDATE_TLB; > - *cs++ = cmd; > - *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; > - *cs++ = 0; > - *cs++ = MI_NOOP; > - intel_ring_advance(rq, cs); > - > - return 0; > + return emit_mi_flush_dw(rq, mode, MI_INVALIDATE_TLB); > } > > static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, > -- > 2.18.0 > > _______________________________________________ > Intel-gfx mailing list > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/intel-gfx -- Ville Syrjälä Intel _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx