Re: [PATCH] drm/i915: RFC: Introduce Wa_14011282866

Matt Roper <matthew.d.roper@xxxxxxxxx> · Fri, 12 May 2023 15:47:35 -0700

On Wed, May 10, 2023 at 02:58:11PM -0700, Matt Atwood wrote:
> From: Tilak Tangudu <tilak.tangudu@xxxxxxxxx>
> 
> Wa_14011282866 applies to RKL, ADL-S, ADL-P and TGL.

Wa_14011282866 isn't a valid workaround number.

> 
> Allocate buffer pinned to GGTT and add WA to restore sampler power
> context.
> 
> Bspec: 46247
> 
> Signed-off-by: Matt Atwood <matthew.s.atwood@xxxxxxxxx>
> Signed-off-by: Tilak Tangudu <tilak.tangudu@xxxxxxxxx>

These lines are backward if Tilak is the original author as shown above.
>From Documentation/process/submitting-patches.rst:

        "Any further SoBs (Signed-off-by:'s) following the author's SoB
        are from people handling and transporting the patch, but were
        not involved in its development. SoB chains should reflect the
        **real** route a patch took as it was propagated to the
        maintainers and ultimately to Linus, with the first SoB entry
        signalling primary authorship of a single author."

> ---
>  drivers/gpu/drm/i915/gt/intel_gt_regs.h   |  5 ++
>  drivers/gpu/drm/i915/gt/intel_rc6.c       | 88 +++++++++++++++++++++++
>  drivers/gpu/drm/i915/gt/intel_rc6_types.h |  3 +
>  3 files changed, 96 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> index b8a39c219b60..91cbdd24572f 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> @@ -48,6 +48,11 @@
>  /* RCP unit config (Gen8+) */
>  #define RCP_CONFIG				_MMIO(0xd08)
>  
> +#define CTX_WA_PTR				_MMIO(0x2058)
> +#define CTX_WA_PTR_ADDR_MASK			REG_GENMASK(31, 12)
> +#define CTX_WA_TYPE_MASK			REG_GENMASK(4, 3)
> +#define CTX_WA_VALID				REG_BIT(0)

This register isn't at the right place.  Also the bit definitions are
missing a couple spaces before the bitfield names.

> +
>  #define RC6_LOCATION				_MMIO(0xd40)
>  #define   RC6_CTX_IN_DRAM			(1 << 0)
>  #define RC6_CTX_BASE				_MMIO(0xd48)
> diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c
> index 908a3d0f2343..9589af2e8ca3 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rc6.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c
> @@ -12,6 +12,7 @@
>  #include "i915_vgpu.h"
>  #include "intel_engine_regs.h"
>  #include "intel_gt.h"
> +#include "intel_gpu_commands.h"
>  #include "intel_gt_pm.h"
>  #include "intel_gt_regs.h"
>  #include "intel_pcode.h"
> @@ -38,6 +39,7 @@
>   * require higher latency to switch to and wake up.
>   */
>  
> +#define RC6_CTX_WA_BB_SIZE (PAGE_SIZE)

Do we really need this?  It's unlikely that we'll wind up needing more
than a page (and we're not even trying to check in the code below either).

>  static struct intel_gt *rc6_to_gt(struct intel_rc6 *rc6)
>  {
>  	return container_of(rc6, struct intel_gt, rc6);
> @@ -53,8 +55,86 @@ static struct drm_i915_private *rc6_to_i915(struct intel_rc6 *rc)
>  	return rc6_to_gt(rc)->i915;
>  }
>  
> +static int rc6_wa_bb_ctx_init(struct intel_rc6 *rc6)
> +{
> +	struct drm_i915_private *i915 = rc6_to_i915(rc6);
> +	struct intel_gt *gt = rc6_to_gt(rc6);
> +	struct drm_i915_gem_object *obj;
> +	struct i915_vma *vma;
> +	void *batch;
> +	struct i915_gem_ww_ctx ww;
> +	int err;
> +
> +	obj = i915_gem_object_create_shmem(i915, RC6_CTX_WA_BB_SIZE);
> +	if (IS_ERR(obj))
> +		return PTR_ERR(obj);
> +
> +	vma = i915_vma_instance(obj, &gt->ggtt->vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto err;
> +	}
> +	rc6->vma = vma;
> +	i915_gem_ww_ctx_init(&ww, true);
> +retry:
> +	err = i915_gem_object_lock(rc6->vma->obj, &ww);

Nitpick:  Can't we just use the shorter local variable 'obj' here and in
the pin_map?

> +	if (!err)
> +		err = i915_ggtt_pin(rc6->vma, &ww, 0, PIN_HIGH);
> +	if (err)
> +		goto err_ww_fini;
> +
> +	batch = i915_gem_object_pin_map(rc6->vma->obj, I915_MAP_WB);
> +	if (IS_ERR(batch)) {
> +		err = PTR_ERR(batch);
> +		goto err_unpin;
> +	}
> +	rc6->rc6_wa_bb = batch;
> +	return 0;
> +err_unpin:
> +	if (err)
> +		i915_vma_unpin(rc6->vma);
> +err_ww_fini:
> +	if (err == -EDEADLK) {
> +		err = i915_gem_ww_ctx_backoff(&ww);
> +		if (!err)
> +			goto retry;
> +	}
> +	i915_gem_ww_ctx_fini(&ww);
> +
> +	if (err)
> +		i915_vma_put(rc6->vma);
> +err:
> +	i915_gem_object_put(obj);
> +	return err;
> +}

Where do we clean up all the stuff done in this function?

> +
> +static void rc6_wa_bb_restore_sampler_power_ctx(struct intel_rc6 *rc6)
> +{
> +	struct intel_uncore *uncore = rc6_to_uncore(rc6);
> +	u32 *rc6_wa_bb;
> +
> +	if (!rc6->vma->obj)
> +		return;
> +
> +	rc6_wa_bb = rc6->rc6_wa_bb;
> +	*rc6_wa_bb++ = MI_NOOP;
> +	*rc6_wa_bb++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_FORCE_POSTED;
> +	*rc6_wa_bb++ = i915_mmio_reg_offset(GEN10_SAMPLER_MODE);
> +	*rc6_wa_bb++ = _MASKED_BIT_ENABLE(ENABLE_SMALLPL);

This isn't the only register/bit that we're attempting to program
(unsuccessfully) today; we need to handle all of the programming that
targets the affected TDL registers such as Wa_1406941453.

Rather than including specific register values here (with no explanation
of why we're even programming these), it would be better if we could
just place these in the usual locations where workarounds and tuning
settings are applied.  Then have the code here walk the workaround list
looking for any entries that write to a broken register; if any are
found, add them to the power context batchbuffer here to restore them
manually.  That way the registers/values are clear because they're
implemented in the same place they are for every other platform, and the
code here is future-proof to catch any other uses of these registers in
the future.  You may need to adjust where this batch buffer gets built
to do that (since I think the GT PM init happens before the WAL init).

> +	*rc6_wa_bb++ = MI_NOOP;
> +	*rc6_wa_bb++ = MI_BATCH_BUFFER_END;
> +
> +	i915_gem_object_flush_map(rc6->vma->obj);
> +
> +	intel_uncore_write(uncore, CTX_WA_PTR,
> +			   REG_FIELD_PREP(CTX_WA_PTR_ADDR_MASK,
> +					  i915_vma_offset(rc6->vma) & GENMASK(19, 0)) |

This isn't what we want here; note the "Format:" label on the register
bitfield...the value that you pack into bits 31:12 of the register is
supposed to be bits 31:12 of the address.  I.e., the entire register
should effectively be the address, with the valid bit OR'd into bit 0 at
the end.

> +			   CTX_WA_VALID);
> +}
> +
>  static void gen11_rc6_enable(struct intel_rc6 *rc6)
>  {
> +	struct drm_i915_private *i915 = rc6_to_i915(rc6);
>  	struct intel_gt *gt = rc6_to_gt(rc6);
>  	struct intel_uncore *uncore = gt->uncore;
>  	struct intel_engine_cs *engine;
> @@ -103,6 +183,11 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6)
>  	intel_uncore_write_fw(uncore, GEN9_MEDIA_PG_IDLE_HYSTERESIS, 60);
>  	intel_uncore_write_fw(uncore, GEN9_RENDER_PG_IDLE_HYSTERESIS, 60);
>  
> +	/* Wa_14011282866 Restore sampler power context */

We're inserting this into the middle of a numbered sequence.  I'm not
sure where that sequence comes from, but putting it here makes it look
as if this was part of step "2c: Program Coarse Power Gating Policies."

> +	if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) ||
> +	    IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915))

If we're using a long platform list like this in multiple places to
implement a single workaround, it's generally best to make a dedicated
function for it so that any future changes to the workaround bounds can
be adjusted in just one place.

Matt

> +		rc6_wa_bb_restore_sampler_power_ctx(rc6);
> +
>  	/* 3a: Enable RC6
>  	 *
>  	 * With GuCRC, we do not enable bit 31 of RC_CTL,
> @@ -610,6 +695,9 @@ void intel_rc6_init(struct intel_rc6 *rc6)
>  		err = chv_rc6_init(rc6);
>  	else if (IS_VALLEYVIEW(i915))
>  		err = vlv_rc6_init(rc6);
> +	else if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) ||
> +		 IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915))
> +		err = rc6_wa_bb_ctx_init(rc6);
>  	else
>  		err = 0;
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_rc6_types.h b/drivers/gpu/drm/i915/gt/intel_rc6_types.h
> index cd4587098162..643fd4e839ad 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rc6_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_rc6_types.h
> @@ -33,6 +33,9 @@ struct intel_rc6 {
>  
>  	struct drm_i915_gem_object *pctx;
>  
> +	u32 *rc6_wa_bb;
> +	struct i915_vma *vma;
> +
>  	bool supported : 1;
>  	bool enabled : 1;
>  	bool manual : 1;
> -- 
> 2.40.0
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation