During stress testing of full-ppgtt (on Baytrail at least), we found that the invalidation around a context/mm switch was insufficient (writes would go astray). Adding a second MI_FLUSH_DW barrier prevents this, but it is unclear as to whether this is merely a delaying tactic or if it is truly serialising with the TLB invalidation. Either way, it is empirically required. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107715 Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx> Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> Cc: Matthew Auld <matthew.william.auld@xxxxxxxxx> Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- drivers/gpu/drm/i915/intel_ringbuffer.c | 101 +++++++++++------------- 1 file changed, 47 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index d40f55a8dc34..952b6269bab0 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1944,40 +1944,62 @@ static void gen6_bsd_submit_request(struct i915_request *request) intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); } -static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) +static int emit_mi_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) { u32 cmd, *cs; - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); + do { + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); - cmd = MI_FLUSH_DW; + cmd = MI_FLUSH_DW; - /* We always require a command barrier so that subsequent - * commands, such as breadcrumb interrupts, are strictly ordered - * wrt the contents of the write cache being flushed to memory - * (and thus being coherent from the CPU). - */ - cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + /* + * We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; - /* - * Bspec vol 1c.5 - video engine command streamer: - * "If ENABLED, all TLBs will be invalidated once the flush - * operation is complete. This bit is only valid when the - * Post-Sync Operation field is a value of 1h or 3h." - */ - if (mode & EMIT_INVALIDATE) - cmd |= MI_INVALIDATE_TLB | MI_INVALIDATE_BSD; + /* + * Bspec vol 1c.3 - blitter engine command streamer: + * "If ENABLED, all TLBs will be invalidated once the flush + * operation is complete. This bit is only valid when the + * Post-Sync Operation field is a value of 1h or 3h." + */ + if (mode & EMIT_INVALIDATE) + cmd |= invflags; + *cs++ = cmd; + *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + /* + * Not only do we need a full barrier (post-sync write) after + * invalidating the TLBs, but we need to wait a little bit + * longer. Whether this is merely delaying us, or the + * subsequent flush is a key part of serialising with the + * post-sync op, this extra pass appears vital before a + * mm switch! + */ + if (!(mode & EMIT_INVALIDATE)) + break; + + mode &= ~EMIT_INVALIDATE; + } while (1); - *cs++ = cmd; - *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); return 0; } +static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) +{ + return emit_mi_flush_dw(rq, mode, + MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); +} + static int hsw_emit_bb_start(struct i915_request *rq, u64 offset, u32 len, @@ -2022,36 +2044,7 @@ gen6_emit_bb_start(struct i915_request *rq, static int gen6_ring_flush(struct i915_request *rq, u32 mode) { - u32 cmd, *cs; - - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - cmd = MI_FLUSH_DW; - - /* We always require a command barrier so that subsequent - * commands, such as breadcrumb interrupts, are strictly ordered - * wrt the contents of the write cache being flushed to memory - * (and thus being coherent from the CPU). - */ - cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; - - /* - * Bspec vol 1c.3 - blitter engine command streamer: - * "If ENABLED, all TLBs will be invalidated once the flush - * operation is complete. This bit is only valid when the - * Post-Sync Operation field is a value of 1h or 3h." - */ - if (mode & EMIT_INVALIDATE) - cmd |= MI_INVALIDATE_TLB; - *cs++ = cmd; - *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - - return 0; + return emit_mi_flush_dw(rq, mode, MI_INVALIDATE_TLB); } static void intel_ring_init_semaphores(struct drm_i915_private *dev_priv, -- 2.18.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx