Re: [PATCH v2 1/2] drm/i915: Add WABB blit for Wa_16018031267 / Wa_16018063123

Matt Roper <matthew.d.roper@xxxxxxxxx> · Thu, 24 Aug 2023 10:57:41 -0700

On Wed, Aug 23, 2023 at 11:51:03AM -0700, Jonathan Cavitt wrote:
> From: Nirmoy Das <nirmoy.das@xxxxxxxxx>
> 
> Apply WABB blit for Wa_16018031267 / Wa_16018063123.
> Additionally, update the lrc selftest to exercise the new
> WABB changes.
> 
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@xxxxxxxxx>
> Co-developed-by: Nirmoy Das <nirmoy.das@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_regs.h |   3 +
>  drivers/gpu/drm/i915/gt/intel_gt_types.h    |   3 +
>  drivers/gpu/drm/i915/gt/intel_lrc.c         | 114 +++++++++++++++++++-
>  drivers/gpu/drm/i915/gt/selftest_lrc.c      |  65 +++++++----
>  drivers/gpu/drm/i915/i915_drv.h             |   5 +
>  5 files changed, 169 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_regs.h b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> index 6b9d9f837669..2e06bea73297 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_regs.h
> @@ -118,6 +118,9 @@
>  #define   CCID_EXTENDED_STATE_RESTORE		BIT(2)
>  #define   CCID_EXTENDED_STATE_SAVE		BIT(3)
>  #define RING_BB_PER_CTX_PTR(base)		_MMIO((base) + 0x1c0) /* gen8+ */
> +#define   PER_CTX_BB_FORCE			BIT(2)
> +#define   PER_CTX_BB_VALID			BIT(0)
> +
>  #define RING_INDIRECT_CTX(base)			_MMIO((base) + 0x1c4) /* gen8+ */
>  #define RING_INDIRECT_CTX_OFFSET(base)		_MMIO((base) + 0x1c8) /* gen8+ */
>  #define ECOSKPD(base)				_MMIO((base) + 0x1d0)
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> index def7dd0eb6f1..81989659ff78 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> @@ -307,6 +307,9 @@ enum intel_gt_scratch_field {
>  
>  	/* 8 bytes */
>  	INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
> +
> +	/* 8 bytes */

This section of scratch is being used as the target of a dummy blit with
destination stride = (0x3F + 1) and height 5.  That's more than just 8
bytes.

> +	INTEL_GT_SCRATCH_FIELD_DUMMY_BLIT = 384,
>  };
>  
>  #define intel_gt_support_legacy_fencing(gt) ((gt)->ggtt->num_fences > 0)
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 967fe4d77a87..444ad1977b10 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -828,6 +828,18 @@ lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
>  	return 0;
>  }
>  
> +static void
> +lrc_setup_bb_per_ctx(u32 *regs,
> +		     const struct intel_engine_cs *engine,
> +		     u32 ctx_bb_ggtt_addr)
> +{
> +	GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
> +	regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
> +		ctx_bb_ggtt_addr |
> +		PER_CTX_BB_FORCE |
> +		PER_CTX_BB_VALID;
> +}
> +
>  static void
>  lrc_setup_indirect_ctx(u32 *regs,
>  		       const struct intel_engine_cs *engine,
> @@ -997,7 +1009,18 @@ static u32 context_wa_bb_offset(const struct intel_context *ce)
>  	return PAGE_SIZE * ce->wa_bb_page;
>  }
>  
> -static u32 *context_indirect_bb(const struct intel_context *ce)
> +/**

Just a normal comment is fine here; we generally don't use formal
kerneldoc on static functions.

> + * context_wabb -
> + * Generates the location of the desired batch buffer used for workarounds
> + * @ce:		The context used for the workaround.
> + * @per_ctx:	When enabled, the function returns the location of
> + * 		the PER_CTX_BB.  When disabled, the function returns
> + * 		the location of the INDIRECT_CTX.

Nitpick:  s/enabled/true/, s/disabled/false/

> + *
> + * Returns: The location of the PER_CTX_BB or INDIRECT_CTX in the ce
> + * context, depending on if per_ctx is true or false, respectively.
> + */
> +static u32 *context_wabb(const struct intel_context *ce, bool per_ctx)
>  {
>  	void *ptr;
>  
> @@ -1006,6 +1029,7 @@ static u32 *context_indirect_bb(const struct intel_context *ce)
>  	ptr = ce->lrc_reg_state;
>  	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
>  	ptr += context_wa_bb_offset(ce);
> +	ptr += per_ctx ? PAGE_SIZE : 0;
>  
>  	return ptr;
>  }
> @@ -1082,7 +1106,8 @@ __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
>  
>  	if (GRAPHICS_VER(engine->i915) >= 12) {
>  		ce->wa_bb_page = context_size / PAGE_SIZE;
> -		context_size += PAGE_SIZE;
> +		/* INDIRECT_CTX and PER_CTX_BB need separate pages. */
> +		context_size += PAGE_SIZE * 2;
>  	}
>  
>  	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
> @@ -1370,12 +1395,94 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
>  	return gen12_emit_aux_table_inv(ce->engine, cs);
>  }
>  
> +static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
> +{
> +	struct intel_gt *gt = ce->engine->gt;
> +	int mocs = gt->mocs.uc_index << 1;
> +	u32 addr = intel_gt_scratch_offset(gt, INTEL_GT_SCRATCH_FIELD_DUMMY_BLIT); 
> +
> +	/**
> +	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
> +	 * main copy engine arbitration into round robin mode.  We
> +	 * additionally need to submit the following WABB blt command
> +	 * to produce 4 subblits with each subblit generating 0 byte
> +	 * write requests as WABB:
> +	 *
> +	 * XY_FASTCOLOR_BLT
> +	 *  BG0    -> 5100000E
> +	 *  BG1    -> 0000003F (Dest pitch)
> +	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
> +	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
> +	 *  BG4    -> scratch
> +	 *  BG5    -> scratch
> +	 *  BG6-12 -> 00000000
> +	 *  BG13   -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
> +	 *  BG14   -> 00000010 (Qpitch = 4)
> +	 *  BG15   -> 00000000
> +	 */
> +	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
> +	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
> +	*cs++ = 0;
> +	*cs++ = 4 << 16 | 1;
> +	*cs++ = addr;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0;
> +	*cs++ = 0x20004004;
> +	*cs++ = 0x10;
> +	*cs++ = 0;
> +
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	return cs;
> +}
> +
> +static u32 *
> +xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs)
> +{
> +	/* Wa_16018031267, Wa_16018063123 */
> +	if (ce->engine->class == COPY_ENGINE_CLASS &&
> +	    NEEDS_FASTCOLOR_BLT_WABB(ce->engine->i915))
> +		cs = xehp_emit_fastcolor_blt_wabb(ce, cs);
> +
> +	return cs;
> +}
> +
> +
> +static void
> +setup_per_ctx_bb(const struct intel_context *ce,
> +		 const struct intel_engine_cs *engine,
> +		 u32 *(*emit)(const struct intel_context *, u32 *))
> +{
> +	/* Place PER_CTX_BB on next page after INDIRECT_CTX */
> +	u32 * const start = context_wabb(ce, true);
> +	u32 *cs;
> +
> +	cs = emit(ce, start);
> +
> +	/* Skip PER_CTX_BB setup when not needed. */
> +	if (cs == start)
> +		return;
> +
> +	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
> +	while ((unsigned long)cs % CACHELINE_BYTES)
> +		*cs++ = MI_NOOP;
> +
> +	lrc_setup_bb_per_ctx(ce->lrc_reg_state, engine,
> +			     lrc_indirect_bb(ce) + PAGE_SIZE / sizeof(*cs));

What's the "/ sizeof(*cs)" for?  Doesn't the per_ctx_bb come a full page
after the indirect_ctx?

> +}
> +
>  static void
>  setup_indirect_ctx_bb(const struct intel_context *ce,
>  		      const struct intel_engine_cs *engine,
>  		      u32 *(*emit)(const struct intel_context *, u32 *))
>  {
> -	u32 * const start = context_indirect_bb(ce);
> +	u32 * const start = context_wabb(ce, false);
>  	u32 *cs;
>  
>  	cs = emit(ce, start);
> @@ -1474,6 +1581,7 @@ u32 lrc_update_regs(const struct intel_context *ce,
>  		/* Mutually exclusive wrt to global indirect bb */
>  		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
>  		setup_indirect_ctx_bb(ce, engine, fn);
> +		setup_per_ctx_bb(ce, engine, xehp_emit_per_ctx_bb);
>  	}
>  
>  	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 5f826b6dcf5d..f1dce05bbfb7 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -1555,7 +1555,7 @@ static int live_lrc_isolation(void *arg)
>  	return err;
>  }
>  
> -static int indirect_ctx_submit_req(struct intel_context *ce)
> +static int wabb_ctx_submit_req(struct intel_context *ce)
>  {
>  	struct i915_request *rq;
>  	int err = 0;
> @@ -1579,7 +1579,8 @@ static int indirect_ctx_submit_req(struct intel_context *ce)
>  #define CTX_BB_CANARY_INDEX  (CTX_BB_CANARY_OFFSET / sizeof(u32))
>  
>  static u32 *
> -emit_indirect_ctx_bb_canary(const struct intel_context *ce, u32 *cs)
> +emit_wabb_ctx_canary(const struct intel_context *ce,
> +			    u32 *cs, bool per_ctx)
>  {
>  	*cs++ = MI_STORE_REGISTER_MEM_GEN8 |
>  		MI_SRM_LRM_GLOBAL_GTT |
> @@ -1587,26 +1588,43 @@ emit_indirect_ctx_bb_canary(const struct intel_context *ce, u32 *cs)
>  	*cs++ = i915_mmio_reg_offset(RING_START(0));
>  	*cs++ = i915_ggtt_offset(ce->state) +
>  		context_wa_bb_offset(ce) +
> -		CTX_BB_CANARY_OFFSET;
> +		CTX_BB_CANARY_OFFSET +
> +		(per_ctx ? PAGE_SIZE : 0);
>  	*cs++ = 0;
>  
>  	return cs;
>  }
>  
> +static u32 *
> +emit_indirect_ctx_bb_canary(const struct intel_context *ce, u32 *cs)
> +{
> +	return emit_wabb_ctx_canary(ce, cs, false);
> +}
> +
> +static u32 *
> +emit_per_ctx_bb_canary(const struct intel_context *ce, u32 *cs)
> +{
> +        return emit_wabb_ctx_canary(ce, cs, true);
> +}
> +
>  static void
> -indirect_ctx_bb_setup(struct intel_context *ce)
> +wabb_ctx_setup(struct intel_context *ce, bool per_ctx)
>  {
> -	u32 *cs = context_indirect_bb(ce);
> +	u32 *cs = context_wabb(ce, per_ctx);
>  
>  	cs[CTX_BB_CANARY_INDEX] = 0xdeadf00d;
>  
> -	setup_indirect_ctx_bb(ce, ce->engine, emit_indirect_ctx_bb_canary);
> +	if (per_ctx)
> +		setup_per_ctx_bb(ce, ce->engine, emit_per_ctx_bb_canary);
> +	else
> +		setup_indirect_ctx_bb(ce, ce->engine, emit_indirect_ctx_bb_canary);
>  }
>  
> -static bool check_ring_start(struct intel_context *ce)
> +static bool check_ring_start(struct intel_context *ce, bool per_ctx)
>  {
>  	const u32 * const ctx_bb = (void *)(ce->lrc_reg_state) -
> -		LRC_STATE_OFFSET + context_wa_bb_offset(ce);
> +		LRC_STATE_OFFSET + context_wa_bb_offset(ce) +
> +		(per_ctx ? PAGE_SIZE / sizeof(u32) : 0);
>  
>  	if (ctx_bb[CTX_BB_CANARY_INDEX] == ce->lrc_reg_state[CTX_RING_START])
>  		return true;
> @@ -1618,21 +1636,21 @@ static bool check_ring_start(struct intel_context *ce)
>  	return false;
>  }
>  
> -static int indirect_ctx_bb_check(struct intel_context *ce)
> +static int wabb_ctx_check(struct intel_context *ce, bool per_ctx)
>  {
>  	int err;
>  
> -	err = indirect_ctx_submit_req(ce);
> +	err = wabb_ctx_submit_req(ce);
>  	if (err)
>  		return err;
>  
> -	if (!check_ring_start(ce))
> +	if (!check_ring_start(ce, per_ctx))
>  		return -EINVAL;
>  
>  	return 0;
>  }
>  
> -static int __live_lrc_indirect_ctx_bb(struct intel_engine_cs *engine)
> +static int __lrc_wabb_ctx(struct intel_engine_cs *engine, bool per_ctx)
>  {
>  	struct intel_context *a, *b;
>  	int err;
> @@ -1667,14 +1685,14 @@ static int __live_lrc_indirect_ctx_bb(struct intel_engine_cs *engine)
>  	 * As ring start is restored apriori of starting the indirect ctx bb and
>  	 * as it will be different for each context, it fits to this purpose.
>  	 */
> -	indirect_ctx_bb_setup(a);
> -	indirect_ctx_bb_setup(b);
> +	wabb_ctx_setup(a, per_ctx);
> +	wabb_ctx_setup(b, per_ctx);
>  
> -	err = indirect_ctx_bb_check(a);
> +	err = wabb_ctx_check(a, per_ctx);
>  	if (err)
>  		goto unpin_b;
>  
> -	err = indirect_ctx_bb_check(b);
> +	err = wabb_ctx_check(b, per_ctx);
>  
>  unpin_b:
>  	intel_context_unpin(b);
> @@ -1688,7 +1706,7 @@ static int __live_lrc_indirect_ctx_bb(struct intel_engine_cs *engine)
>  	return err;
>  }
>  
> -static int live_lrc_indirect_ctx_bb(void *arg)
> +static int lrc_wabb_ctx(void *arg, bool per_ctx)
>  {
>  	struct intel_gt *gt = arg;
>  	struct intel_engine_cs *engine;
> @@ -1697,7 +1715,7 @@ static int live_lrc_indirect_ctx_bb(void *arg)
>  
>  	for_each_engine(engine, gt, id) {
>  		intel_engine_pm_get(engine);
> -		err = __live_lrc_indirect_ctx_bb(engine);
> +		err = __lrc_wabb_ctx(engine, per_ctx);
>  		intel_engine_pm_put(engine);
>  
>  		if (igt_flush_test(gt->i915))
> @@ -1710,6 +1728,16 @@ static int live_lrc_indirect_ctx_bb(void *arg)
>  	return err;
>  }
>  
> +static int live_lrc_indirect_ctx_bb(void *arg)
> +{
> +	return lrc_wabb_ctx(arg, false);
> +}
> +
> +static int live_lrc_per_ctx_bb(void *arg)
> +{
> +	return lrc_wabb_ctx(arg, true);
> +}		
> +
>  static void garbage_reset(struct intel_engine_cs *engine,
>  			  struct i915_request *rq)
>  {
> @@ -1947,6 +1975,7 @@ int intel_lrc_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(live_lrc_garbage),
>  		SUBTEST(live_pphwsp_runtime),
>  		SUBTEST(live_lrc_indirect_ctx_bb),
> +		SUBTEST(live_lrc_per_ctx_bb),
>  	};
>  
>  	if (!HAS_LOGICAL_RING_CONTEXTS(i915))
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 87ffc477c3b1..15b54b3beaa5 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -813,4 +813,9 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
>  #define HAS_LMEMBAR_SMEM_STOLEN(i915) (!HAS_LMEM(i915) && \
>  				       GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
>  
> +#define NEEDS_FASTCOLOR_BLT_WABB(i915)	(GRAPHICS_VER_FULL(i915) == IP_VER(12, 70) || \
> +					 GRAPHICS_VER_FULL(i915) == IP_VER(12, 71) || \
> +					 IS_PONTEVECCHIO(i915) || \
> +					 IS_DG2(i915))

Looks like the workaround is still in pending state for DG2 and PVC, but
assuming that goes through to completion, we can write this condition
with the new IP range check:

   #define NEEDS_FASTCOLOR_BLT_WABB(gt) \
        IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 55), IP_VER(12, 71))

since there aren't any platforms in the range that the workaround
doesn't apply to.

It might be best to keep this #define in the file that uses it.
i915_drv.h is already too cluttered and this isn't something that needs
to be globally available to the entire driver.

Matt

> +
>  #endif
> -- 
> 2.25.1
> 

-- 
Matt Roper
Graphics Software Engineer
Linux GPU Platform Enablement
Intel Corporation