Quoting Mika Kuoppala (2020-04-17 15:44:27) > Restoration of a previous timestamp can collide > with updating the timestamp, causing a value corruption. > > Combat this issue by using indirect ctx bb to > modify the context image during restoring process. > > For render engine, we can preload value into > scratch register. From which we then do the actual > write with LRR. LRR is faster and thus less error prone. > For other engines, no scratch is available so we > must do a more complex sequence of sync and async LRMs. > As the LRM is slower, the probablity of racy write > raises and thus we still see corruption sometimes. > > References: HSDES#16010904313 > Testcase: igt/i915_selftest/gt_lrc > Suggested-by: Joseph Koston <joseph.koston@xxxxxxxxx> > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx> > > bug on fix > --- > drivers/gpu/drm/i915/gt/intel_context_types.h | 3 + > drivers/gpu/drm/i915/gt/intel_gpu_commands.h | 3 +- > drivers/gpu/drm/i915/gt/intel_lrc.c | 196 ++++++++++++++---- > drivers/gpu/drm/i915/gt/intel_lrc_reg.h | 1 + > 4 files changed, 165 insertions(+), 38 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > index 07cb83a0d017..c7573d565f58 100644 > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > @@ -70,6 +70,9 @@ struct intel_context { > > u32 *lrc_reg_state; > u64 lrc_desc; > + > + u32 ctx_bb_offset; > + > u32 tag; /* cookie passed to HW to track this context on submission */ > > /* Time on GPU as tracked by the hw. */ > diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h > index f04214a54f75..0c2adb4078a7 100644 > --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h > +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h > @@ -138,7 +138,7 @@ > */ > #define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) > /* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */ > -#define MI_LRI_CS_MMIO (1<<19) > +#define MI_LRI_LRM_CS_MMIO (1<<19) > #define MI_LRI_FORCE_POSTED (1<<12) > #define MI_LOAD_REGISTER_IMM_MAX_REGS (126) > #define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1) > @@ -155,6 +155,7 @@ > #define MI_FLUSH_DW_USE_PPGTT (0<<2) > #define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) > #define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) > +#define MI_LRM_ASYNC (1<<21) > #define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 1) > #define MI_BATCH_BUFFER MI_INSTR(0x30, 1) > #define MI_BATCH_NON_SECURE (1) > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index 6fbad5e2343f..531884b9050c 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -234,7 +234,7 @@ static void execlists_init_reg_state(u32 *reg_state, > const struct intel_ring *ring, > bool close); > static void > -__execlists_update_reg_state(const struct intel_context *ce, > +__execlists_update_reg_state(struct intel_context *ce, > const struct intel_engine_cs *engine, > u32 head); > > @@ -537,7 +537,7 @@ static void set_offsets(u32 *regs, > if (flags & POSTED) > *regs |= MI_LRI_FORCE_POSTED; > if (INTEL_GEN(engine->i915) >= 11) > - *regs |= MI_LRI_CS_MMIO; > + *regs |= MI_LRI_LRM_CS_MMIO; > regs++; > > GEM_BUG_ON(!count); > @@ -3142,8 +3142,152 @@ static void execlists_context_unpin(struct intel_context *ce) > i915_gem_object_unpin_map(ce->state->obj); > } > > +static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) > +{ > + u32 indirect_ctx_offset; > + > + switch (INTEL_GEN(engine->i915)) { > + default: > + MISSING_CASE(INTEL_GEN(engine->i915)); > + fallthrough; > + case 12: > + indirect_ctx_offset = > + GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > + break; > + case 11: > + indirect_ctx_offset = > + GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > + break; > + case 10: > + indirect_ctx_offset = > + GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > + break; > + case 9: > + indirect_ctx_offset = > + GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > + break; > + case 8: > + indirect_ctx_offset = > + GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > + break; > + } > + > + return indirect_ctx_offset; > +} > + > +static u32 * > +gen12_emit_timestamp_wa_lrm(struct intel_context *ce, u32 *cs) > +{ > + const u32 lrc_offset = i915_ggtt_offset(ce->state) + > + LRC_STATE_PN * PAGE_SIZE; > + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 | > + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO; > + > + *cs++ = lrm; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32); > + *cs++ = 0; > + > + *cs++ = lrm | MI_LRM_ASYNC; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32); > + *cs++ = 0; > + > + *cs++ = lrm | MI_LRM_ASYNC; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32); > + *cs++ = 0; > + > + *cs++ = lrm; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32); > + *cs++ = 0; > + > + return cs; > +} > + > +static u32 * > +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs) > +{ > + const u32 lrc_offset = i915_ggtt_offset(ce->state) + > + LRC_STATE_PN * PAGE_SIZE; Repetitive. > + const u32 lrm = MI_LOAD_REGISTER_MEM_GEN8 | > + MI_SRM_LRM_GLOBAL_GTT | MI_LRI_LRM_CS_MMIO; Used once, might as just use the constant inplace. > + const u32 scratch_reg = 0x2198; > + > + *cs++ = lrm; > + *cs++ = scratch_reg; > + *cs++ = lrc_offset + CTX_TIMESTAMP * sizeof(u32); > + *cs++ = 0; > + > + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO; > + *cs++ = scratch_reg; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + > + *cs++ = MI_LOAD_REGISTER_REG | MI_LRI_LRM_CS_MMIO; > + *cs++ = scratch_reg; > + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); > + > + return cs; > +} > + > +static u32 * > +execlists_emit_ctx_bb(struct intel_context *ce, > + u32 *(*emit)(struct intel_context *, u32 *)) > +{ > + u32 *cs = (void *)(ce->lrc_reg_state) - > + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset; static u32 *context_indirect_bb(struct intel_context *ce) { void *ptr; GEM_BUG_ON(!intel_context_is_pinned(ce)); GEM_BUG_ON(!ce->ctx_bb_offset); ptr = ce->lrc_reg_state; ptr -= LRC_STATE_PN * PAGE_SIZE; /* back to start of context image */ ptr += ce->ctx_bb_offset; return ptr; } > + const u32 * const batch_start = cs; > + > + GEM_DEBUG_BUG_ON(!ce->ctx_bb_offset); > + > + cs = emit(ce, cs); > + > + GEM_DEBUG_BUG_ON(cs - batch_start > > + (I915_GTT_PAGE_SIZE - 4)/sizeof(*cs)); > + > + return cs; > +} > + > +static void > +execlists_setup_indirect_ctx_bb(struct intel_context *ce, > + u32 *(*emit)(struct intel_context *, u32 *)) > +{ > + const u32 indirect_ptr_offset = > + INTEL_GEN(ce->engine->i915) >= 12 ? > + GEN12_CTX_BB_PER_CTX_PTR + 2 : > + CTX_BB_PER_CTX_PTR + 2; Do we have this in live_lrc_layout yet? > + const u32 * const start = (void *)(ce->lrc_reg_state) - > + LRC_STATE_PN * PAGE_SIZE + ce->ctx_bb_offset; > + u32 *cs; > + > + cs = execlists_emit_ctx_bb(ce, emit); > + > + while ((unsigned long)cs % CACHELINE_BYTES) > + *cs++ = MI_NOOP; > + > + ce->lrc_reg_state[indirect_ptr_offset] = > + (i915_ggtt_offset(ce->state) + ce->ctx_bb_offset) | > + (cs - start) * sizeof(*cs) / > + CACHELINE_BYTES; > + > + ce->lrc_reg_state[indirect_ptr_offset + 2] = > + intel_lr_indirect_ctx_offset(ce->engine) << 6; > +} > + > static void > -__execlists_update_reg_state(const struct intel_context *ce, > +gen12_setup_timestamp_ctx_wa(struct intel_context *ce) > +{ > + if (ce->engine->class == RENDER_CLASS) > + execlists_setup_indirect_ctx_bb(ce, I'd be tempted to drop the execlists_ at this depth. > + gen12_emit_timestamp_wa_lrr); > + else > + execlists_setup_indirect_ctx_bb(ce, > + gen12_emit_timestamp_wa_lrm); u32 *(*fn)(struct intel_context *ce, u32 *cs); fn = gen12_emit_timestamp_wa_lrm; if (ce->engine->class == RENDER_CLASS) fn = gen12_emit_timestamp_wa_lrr; setup_indirect_ctx_bb(ce, fn); > +static u32 * > +gen12_emit_timestamp_wa_lrr(struct intel_context *ce, u32 *cs) > +} > + > +static void > +__execlists_update_reg_state(struct intel_context *ce, > const struct intel_engine_cs *engine, > u32 head) > { > @@ -3164,6 +3308,13 @@ __execlists_update_reg_state(const struct intel_context *ce, > intel_sseu_make_rpcs(engine->i915, &ce->sseu); > > i915_oa_init_reg_state(ce, engine); > + > + } > + > + if (ce->ctx_bb_offset) { > + /* Mutually exclusive wrt to global indirect bb */ > + GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); > + gen12_setup_timestamp_ctx_wa(ce); > } > } > > @@ -4667,40 +4818,6 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) > return 0; > } > > -static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) > -{ > - u32 indirect_ctx_offset; > - > - switch (INTEL_GEN(engine->i915)) { > - default: > - MISSING_CASE(INTEL_GEN(engine->i915)); > - /* fall through */ > - case 12: > - indirect_ctx_offset = > - GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > - break; > - case 11: > - indirect_ctx_offset = > - GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > - break; > - case 10: > - indirect_ctx_offset = > - GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > - break; > - case 9: > - indirect_ctx_offset = > - GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > - break; > - case 8: > - indirect_ctx_offset = > - GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; > - break; > - } > - > - return indirect_ctx_offset; > -} > - > - > static void init_common_reg_state(u32 * const regs, > const struct intel_engine_cs *engine, > const struct intel_ring *ring, > @@ -4867,6 +4984,11 @@ static int __execlists_context_alloc(struct intel_context *ce, > if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) > context_size += I915_GTT_PAGE_SIZE; /* for redzone */ > > + if (INTEL_GEN(engine->i915) == 12) { > + ce->ctx_bb_offset = context_size; > + context_size += PAGE_SIZE; > + } > + > ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); > if (IS_ERR(ctx_obj)) > return PTR_ERR(ctx_obj); > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h > index d39b72590e40..bb444614f33b 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h > +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h > @@ -32,6 +32,7 @@ > > /* GEN12+ Reg State Context */ > #define GEN12_CTX_BB_PER_CTX_PTR (0x12 + 1) > +#define CTX_BB_PER_CTX_PTR_VALID BIT(0) > > #define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ > u32 *reg_state__ = (reg_state); \ Was I imagining things, but didn't you devise a selftest to prove we could use the indirect-ctx-bb to do interesting things? -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx