You're missing a digit in the patch subject. On Wed, Nov 29, 2023 at 12:51:22PM -0800, Matt Atwood wrote: > Wa_14011274333 applies to gen 12.0->12.55. There are regions of > registers that restore to default values on resume from rc6. The It would be more clear to write this sentence as "The TDL power context is not properly restored on RC6 exit, causing registers in the context to revert to their hardware default values on RC6 exit." > hardware has a built in register (CTX_WA_PTR), that can point to a > region of memory full of commands to restore non default values on rc6 > resume. > > Based off patch by Tilak Tangudu. > > v2: Use correct lineage number, more generically apply workarounds for > all registers impacted, move workaround to gt/intel_workarounds.c > (MattR) > > v3: Correctly use intel_engine_cs, intel_engine_regs, use > reg_in_range_table() for wa registers impacted search, move the majority > of functional changes into intel_workarounds.c, free up pinned memory on > engine tear down, be more verbose in commit message and wa comments, use > generic function for platforms applied to. (MattR) > > Signed-off-by: Matt Atwood <matthew.s.atwood@xxxxxxxxx> > --- > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 2 + > drivers/gpu/drm/i915/gt/intel_engine_regs.h | 4 + > drivers/gpu/drm/i915/gt/intel_engine_types.h | 3 + > drivers/gpu/drm/i915/gt/intel_workarounds.c | 114 +++++++++++++++++++ > drivers/gpu/drm/i915/gt/intel_workarounds.h | 1 + > 5 files changed, 124 insertions(+) > > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 40687806d22a6..d3628462bfb3e 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -1573,6 +1573,8 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) > intel_wa_list_free(&engine->ctx_wa_list); > intel_wa_list_free(&engine->wa_list); > intel_wa_list_free(&engine->whitelist); > + > + intel_ctx_wa_bb_fini(engine); > } > > /** > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h b/drivers/gpu/drm/i915/gt/intel_engine_regs.h > index a8eac59e37793..becb466f5910d 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h > @@ -251,6 +251,10 @@ > #define GEN11_VECS_SFC_USAGE(base) _MMIO((base) + 0x2014) > #define GEN11_VECS_SFC_USAGE_BIT (1 << 0) > > +#define CTX_WA_PTR(base) _MMIO((base) + 0x2058) This isn't the right offset; you've accidentally pre-added the RCS base. The definition should be (base + 0x58). > +#define CTX_WA_PTR_ADDR_MASK REG_GENMASK(31, 12) > +#define CTX_WA_VALID REG_BIT(0) > + > #define RING_HWS_PGA_GEN6(base) _MMIO((base) + 0x2080) > > #define GEN12_HCP_SFC_LOCK_STATUS(base) _MMIO((base) + 0x2914) > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 960e6be2042fe..95ff3e1adf11e 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -388,6 +388,9 @@ struct intel_engine_cs { > u32 context_size; > u32 mmio_base; > > + u32 *ctx_wa_bb; > + struct i915_vma *vma; You should pick a more descriptive name here. Otherwise nobody will know what this is a vma for. > + > struct intel_engine_tlb_inv tlb_inv; > > /* > diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c > index 9bc0654efdc0c..1d33555039725 100644 > --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c > +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c > @@ -5,6 +5,8 @@ > > #include "i915_drv.h" > #include "i915_reg.h" > +#include "i915_perf.h" > +#include "gem/i915_gem_internal.h" > #include "intel_context.h" > #include "intel_engine_pm.h" > #include "intel_engine_regs.h" > @@ -14,6 +16,7 @@ > #include "intel_gt_print.h" > #include "intel_gt_regs.h" > #include "intel_ring.h" > +#include "intel_uncore.h" > #include "intel_workarounds.h" > > /** > @@ -2985,6 +2988,105 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li > } > } > > +static const struct i915_range tdl_power_ctx[] = { > + { .start = 0xe100, .end = 0xe100 }, > + { .start = 0xe180, .end = 0xe194 }, > + {}, > +}; > + > +static void > +emit_ctx_wa_bb(struct intel_engine_cs *engine, struct i915_wa_list *wal) > +{ > + const struct i915_wa *wa; > + struct intel_uncore *uncore = engine->gt->uncore; > + int i, count = 0; > + u32 *cs = engine->ctx_wa_bb; > + > + *cs++ = MI_LOAD_REGISTER_IMM(1); I think you meant to come back and patch up the length of the instruction after emitting however many registers we find, right? It looks like you forgot that part. > + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { > + if (reg_in_range_table(i915_mmio_reg_offset(wa->mcr_reg), tdl_power_ctx)) { > + *cs++ = i915_mmio_reg_offset(wa->mcr_reg); > + *cs++ = wa->set; Part of the reason this is so easy to handle is because all of the registers in the TDL power context are masked, so we don't have to worry about clobbering unrelated bits when re-programming the workaround. You might want to add a comment noting that so that it doesn't look like we're just reprogramming the whole register here. > + count++; > + } > + } > + *cs++ = MI_BATCH_BUFFER_END; > + > + if (count != 0) { > + i915_gem_object_flush_map(engine->vma->obj); > + intel_uncore_write(uncore, CTX_WA_PTR(engine->mmio_base), > + REG_FIELD_PREP(CTX_WA_PTR_ADDR_MASK, > + i915_vma_offset(engine->vma) & GENMASK(31, 12)) | > + CTX_WA_VALID); CTX_WA_PTR gets cleared on render domain reset, so you need to make sure it gets re-applied properly as well. You should probably do this actual register write in intel_engine_apply_workarounds() (to make sure it's re-applied on the non-GuC platforms) and also add this register to the guc ADS regset so that the GuC will save/restore it around resets on ADL-P. If it turned out you didn't actually need the batchbuffer, you could release its object here and clear the vma reference in the engine. The presence/absence of that pointer can be used as the condition to determine whether the register should be written in intel_engine_apply_workarounds(). > + } > +} > + > +static int ctx_wa_bb_init(struct intel_engine_cs *engine) > +{ > + struct drm_i915_private *i915 = engine->i915; > + struct intel_gt *gt = engine->gt; > + struct drm_i915_gem_object *obj; > + struct i915_vma *vma; > + void *batch; > + struct i915_gem_ww_ctx ww; > + int err; > + > + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); > + if (IS_ERR(obj)) > + return PTR_ERR(obj); > + > + vma = i915_vma_instance(obj, >->ggtt->vm, NULL); > + if (IS_ERR(vma)) { > + err = PTR_ERR(vma); > + goto err; > + } > + > + engine->vma = vma; > + i915_gem_ww_ctx_init(&ww, true); > + > +retry: > + err = i915_gem_object_lock(obj, &ww); > + if (!err) > + err = i915_ggtt_pin(engine->vma, &ww, 0, PIN_HIGH); > + if (err) > + goto err_ww_fini; > + > + batch = i915_gem_object_pin_map(obj, I915_MAP_WB); > + if (IS_ERR(batch)) { > + err = PTR_ERR(batch); > + goto err_unpin; > + } > + engine->ctx_wa_bb = batch; > + > + return 0; Aren't we still holding locks in the wait-wound context? > + > +err_unpin: > + i915_vma_unpin(engine->vma); > + > +err_ww_fini: > + if (err == -EDEADLK) { > + err = i915_gem_ww_ctx_backoff(&ww); > + if (!err) > + goto retry; > + } > + > + i915_gem_ww_ctx_fini(&ww); > + i915_vma_put(engine->vma); > + > +err: > + i915_gem_object_put(obj); > + return err; > +} > + > +void intel_ctx_wa_bb_fini(struct intel_engine_cs *engine) > +{ > + i915_vma_unpin_and_release(&engine->vma, I915_VMA_RELEASE_MAP); You might want to clear the vma pointer as well to allow this function to be used in more places, as noted above. > +} > + > +#define NEEDS_CTX_WABB(engine) ( \ It might be better to put the name of the workaround in the macro here. There are lots of potential reasons why we might want to execute a batch on RC6 exit; this workaround is just one specific reason, but there may be others in the future. > + IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 0), IP_VER(12, 55)) && \ This isn't needed on 12.55. It was a pre-production-only workaround for DG2-G10, and it never applied to DG2-G11 or DG2-G12. So 12.00 - 12.10 would be the proper range. Note that if you did need to program this on DG2, then you would have also needed to take Wa_18018699957 into account, which involves switching to use BB_PER_CTX_PTR instead of CTX_WA_PTR. > + engine->class == RENDER_CLASS) > + > static void > engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal) > { > @@ -3007,6 +3109,18 @@ engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal > rcs_engine_wa_init(engine, wal); > else > xcs_engine_wa_init(engine, wal); > + > + /* Wa_14011274333 > + * After the workaround list has been populated some platforms have > + * regions of addresses that do not retain their values after resuming > + * from rc6. For these platforms, add all workarounds in these regions > + * to a batch buffer, this batch buffer is stored in memory and > + * executes on every rc6 resume. > + */ > + if (NEEDS_CTX_WABB(engine)) { > + ctx_wa_bb_init(engine); This function returns errors, but you're not handling them here, so failure will be silently ignored. Matt > + emit_ctx_wa_bb(engine, wal); > + } > } > > void intel_engine_init_workarounds(struct intel_engine_cs *engine) > diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.h b/drivers/gpu/drm/i915/gt/intel_workarounds.h > index 9beaab77c7f0b..fe8946b0c7b67 100644 > --- a/drivers/gpu/drm/i915/gt/intel_workarounds.h > +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.h > @@ -36,4 +36,5 @@ void intel_engine_apply_workarounds(struct intel_engine_cs *engine); > int intel_engine_verify_workarounds(struct intel_engine_cs *engine, > const char *from); > > +void intel_ctx_wa_bb_fini(struct intel_engine_cs *engine); > #endif > -- > 2.40.1 > -- Matt Roper Graphics Software Engineer Linux GPU Platform Enablement Intel Corporation