Re: [PATCH v2 13/14] drm/i915/mtl: Add multicast steering for render GT

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Oct 14, 2022 at 09:32:55PM +0530, Balasubramani Vivekanandan wrote:
> On 30.09.2022 17:45, Matt Roper wrote:
> > MTL once again changes the multicast register types and steering
> > details.  Key changes from past platforms:
> >  * The number of instances of some MCR types (NODE, OAAL2, and GAM) vary
> >    according to the MTL subplatform and cannot be read from fuse
> >    registers.
> >  * The MCR steering register (and its bitfields) has changed.
> > 
> > Unlike past platforms, we will be explicitly steering all types of MCR
> > accesses, including those for "SLICE" and "DSS" ranges; we no longer
> > rely on implicit steering.  On previous platforms, various
> > hardware/firmware agents that needed to access registers typically had
> > their own steering control registers, allowing them to perform multicast
> > steering without clobbering the CPU/kernel steering.  Starting with MTL,
> > more of these agents now share a single steering register (0xFD4) and it
> > is no longer safe for us to assume that the value will remain unchanged
> > from how we initialized it during startup.  There is also a slight
> > chance of race conditions between the driver and a hardware/firmware
> > agent, so the hardware provides a semaphore register that can be used to
> > coordinate access to the steering register.  Support for the semaphore
> > register will be introduced in a future patch.
> > 
> > Bspec: 67788, 67112
> > Cc: Radhakrishna Sripada <radhakrishna.sripada@xxxxxxxxx>
> > Signed-off-by: Matt Roper <matthew.d.roper@xxxxxxxxx>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_gt_mcr.c      | 85 ++++++++++++++++++---
> >  drivers/gpu/drm/i915/gt/intel_gt_regs.h     |  5 ++
> >  drivers/gpu/drm/i915/gt/intel_gt_types.h    |  8 +-
> >  drivers/gpu/drm/i915/gt/intel_workarounds.c | 18 ++++-
> >  drivers/gpu/drm/i915/i915_pci.c             |  1 +
> >  5 files changed, 102 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
> > index 9e2caba64f19..a1fa71b47831 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c
> > @@ -41,6 +41,7 @@ static const char * const intel_steering_types[] = {
> >  	"MSLICE",
> >  	"LNCF",
> >  	"GAM",
> > +	"DSS",
> >  	"INSTANCE 0",
> >  };
> >  
> > @@ -99,9 +100,40 @@ static const struct intel_mmio_range pvc_instance0_steering_table[] = {
> >  	{},
> >  };
> >  
> > +static const struct intel_mmio_range mtl3d_instance0_steering_table[] = {
> > +	{ 0x000B00, 0x000BFF },         /* SQIDI */
> > +	{ 0x001000, 0x001FFF },         /* SQIDI */
> > +	{ 0x004000, 0x0048FF },         /* GAM */
> > +	{ 0x008700, 0x0087FF },         /* SQIDI */
> > +	{ 0x00B000, 0x00B0FF },         /* NODE */
> > +	{ 0x00C800, 0x00CFFF },         /* GAM */
> > +	{ 0x00D880, 0x00D8FF },         /* NODE */
> > +	{ 0x00DD00, 0x00DDFF },         /* OAAL2 */
> > +	{},
> > +};
> > +
> > +static const struct intel_mmio_range mtl3d_l3bank_steering_table[] = {
> > +	{ 0x00B100, 0x00B3FF },
> > +	{},
> > +};
> > +
> > +/* DSS steering is used for SLICE ranges as well */
> > +static const struct intel_mmio_range mtl3d_dss_steering_table[] = {
> > +	{ 0x005200, 0x0052FF },		/* SLICE */
> > +	{ 0x005500, 0x007FFF },		/* SLICE */
> > +	{ 0x008140, 0x00815F },		/* SLICE (0x8140-0x814F), DSS (0x8150-0x815F) */
> > +	{ 0x0094D0, 0x00955F },		/* SLICE (0x94D0-0x951F), DSS (0x9520-0x955F) */
> > +	{ 0x009680, 0x0096FF },		/* DSS */
> > +	{ 0x00D800, 0x00D87F },		/* SLICE */
> > +	{ 0x00DC00, 0x00DCFF },		/* SLICE */
> > +	{ 0x00DE80, 0x00E8FF },		/* DSS (0xE000-0xE0FF reserved) */
> > +};
> > +
> >  void intel_gt_mcr_init(struct intel_gt *gt)
> >  {
> >  	struct drm_i915_private *i915 = gt->i915;
> > +	unsigned long fuse;
> > +	int i;
> >  
> >  	/*
> >  	 * An mslice is unavailable only if both the meml3 for the slice is
> > @@ -119,7 +151,22 @@ void intel_gt_mcr_init(struct intel_gt *gt)
> >  			drm_warn(&i915->drm, "mslice mask all zero!\n");
> >  	}
> >  
> > -	if (IS_PONTEVECCHIO(i915)) {
> > +	if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70) &&
> > +	    gt->type == GT_PRIMARY) {
> > +		fuse = REG_FIELD_GET(GT_L3_EXC_MASK,
> > +				     intel_uncore_read(gt->uncore, XEHP_FUSE4));
> > +
> > +		/*
> > +		 * Despite the register field being named "exclude mask" the
> > +		 * bits actually represent enabled banks (two banks per bit).
> > +		 */
> > +		for_each_set_bit(i, &fuse, 3)
> > +			gt->info.l3bank_mask |= (0x3 << 2*i);
> > +
> > +		gt->steering_table[INSTANCE0] = mtl3d_instance0_steering_table;
> > +		gt->steering_table[L3BANK] = mtl3d_l3bank_steering_table;
> > +		gt->steering_table[DSS] = mtl3d_dss_steering_table;
> > +	} else if (IS_PONTEVECCHIO(i915)) {
> >  		gt->steering_table[INSTANCE0] = pvc_instance0_steering_table;
> >  	} else if (IS_DG2(i915)) {
> >  		gt->steering_table[MSLICE] = xehpsdv_mslice_steering_table;
> > @@ -184,7 +231,12 @@ static u32 rw_with_mcr_steering_fw(struct intel_uncore *uncore,
> >  
> >  	lockdep_assert_held(&uncore->lock);
> >  
> > -	if (GRAPHICS_VER(uncore->i915) >= 11) {
> > +	if (GRAPHICS_VER_FULL(uncore->i915) >= IP_VER(12, 70)) {
> > +		/* No need to save old steering reg value */
> > +		intel_uncore_write_fw(uncore, MTL_MCR_SELECTOR,
> > +				      REG_FIELD_PREP(MTL_MCR_GROUPID, group) |
> > +				      REG_FIELD_PREP(MTL_MCR_INSTANCEID, instance));
> 
> I think we need to clear the MULTICAST bit in the MTL_MCR_SELECTOR
> register here since we are doing unicast r/w.

Since this is a write operation rather than a RMW, the resulting
register value is exactly the fields we set here (group and instance)
and the multicast bit is guaranteed to be clear after this.

However that does remind me that even though we don't need to restore
the original steering target, we do need to restore the MULTICAST bit to
'1' at the end of this function:

        "Prior to releasing the semaphore, multicast is returned to the
        default (1)."

We haven't implemented use of the hardware semaphore yet (that will come
in a future series), but in general any agent operating on MCR registers
should be guaranteed to have multicast behavior by default.  I'll make
that update in the next version.

And I guess we should probably also _set_ the multicast bit when doing
reads since reads don't actually care about the value of that bit
(they're always inherently a unicast operation), but based on
Wa_22013088509 it's safest to just leave multicast enabled at all times
except when we have a specific need to clear it (i.e., unicast writes).

> 
> And in the functions intel_gt_mcr_multicast_write() and
> intel_gt_mcr_multicast_write_fw(), we need to set the MULTICAST bit
> in the MTL_MCR_SELECTOR register for MTL since we no more rely on
> default values for the MTL_MCR_SELECTOR.

Based on the above, the one thing we're supposed to be guaranteed is
that the multicast bit is always set, unless we've intentionally
switched it to unicast mode temporarily (and in that case we're supposed
to switch it back to multicast at the end).  But it doesn't hurt to
explicitly set that bit, just to be safe; I'll include that too.

> 
> Can I also suggest to optimize the groupid and instanceid calculation
> during non-terminating unicast MCR read/write by caching the groupid and
> instanceid in some variables instead of calculating it everytime?
> Because from MTL we have replaced all MCR register reads with implicit
> steering to intel_gt_mcr_read_any/_fw call, and each call of that
> function does groupip/instanceid calculation.

Are you suggesting that get_nonterminated_steering() should return
pre-computed values rather than calculating them on each call?  We could
probably do that, but it kind of seems like a micro-optimization that
might not provide much actual benefit.  The heaviest steering
calculations basically boil down to an __ffs() operation which should
already be super fast and assembly-optimized.  And given that most of
the places that 'read any' operations happen are stuff like workarounds,
error state dumps, etc. that aren't on a hot path, I'm not sure if it's
worth adding the extra complexity unless we think there's a place where
it could measurably improve performance.


Matt

> 
> Regards,
> Bala
> 
> > +	} else if (GRAPHICS_VER(uncore->i915) >= 11) {
> >  		mcr_mask = GEN11_MCR_SLICE_MASK | GEN11_MCR_SUBSLICE_MASK;
> >  		mcr_ss = GEN11_MCR_SLICE(group) | GEN11_MCR_SUBSLICE(instance);
> >  
> > @@ -202,26 +254,30 @@ static u32 rw_with_mcr_steering_fw(struct intel_uncore *uncore,
> >  		 */
> >  		if (rw_flag == FW_REG_WRITE)
> >  			mcr_mask |= GEN11_MCR_MULTICAST;
> > +
> > +		old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR);
> > +
> > +		mcr &= ~mcr_mask;
> > +		mcr |= mcr_ss;
> > +		intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
> >  	} else {
> >  		mcr_mask = GEN8_MCR_SLICE_MASK | GEN8_MCR_SUBSLICE_MASK;
> >  		mcr_ss = GEN8_MCR_SLICE(group) | GEN8_MCR_SUBSLICE(instance);
> > -	}
> >  
> > -	old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR);
> > +		old_mcr = mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR);
> >  
> > -	mcr &= ~mcr_mask;
> > -	mcr |= mcr_ss;
> > -	intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
> > +		mcr &= ~mcr_mask;
> > +		mcr |= mcr_ss;
> > +		intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
> > +	}
> >  
> >  	if (rw_flag == FW_REG_READ)
> >  		val = intel_uncore_read_fw(uncore, mcr_reg_cast(reg));
> >  	else
> >  		intel_uncore_write_fw(uncore, mcr_reg_cast(reg), value);
> >  
> > -	mcr &= ~mcr_mask;
> > -	mcr |= old_mcr & mcr_mask;
> > -
> > -	intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
> > +	if (GRAPHICS_VER_FULL(uncore->i915) < IP_VER(12, 70))
> > +		intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr);
> >  
> >  	return val;
> >  }
> > @@ -385,6 +441,8 @@ static void get_nonterminated_steering(struct intel_gt *gt,
> >  				       enum intel_steering_type type,
> >  				       u8 *group, u8 *instance)
> >  {
> > +	u32 dss;
> > +
> >  	switch (type) {
> >  	case L3BANK:
> >  		*group = 0;		/* unused */
> > @@ -408,6 +466,11 @@ static void get_nonterminated_steering(struct intel_gt *gt,
> >  		*group = IS_DG2(gt->i915) ? 1 : 0;
> >  		*instance = 0;
> >  		break;
> > +	case DSS:
> > +		dss = intel_sseu_find_first_xehp_dss(&gt->info.sseu, 0, 0);
> > +		*group = dss / GEN_DSS_PER_GSLICE;
> > +		*instance = dss % GEN_DSS_PER_GSLICE;
> > +		break;
> >  	case INSTANCE0:
> >  		/*
> >  		 * There are a lot of MCR types for which instance (0, 0)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_regs.h b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > index b7341f01ec9f..c5b9671097e3 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_regs.h
> > @@ -59,6 +59,7 @@
> >  #define GMD_ID_MEDIA				_MMIO(MTL_MEDIA_GSI_BASE + 0xd8c)
> >  
> >  #define MCFG_MCR_SELECTOR			_MMIO(0xfd0)
> > +#define MTL_MCR_SELECTOR			_MMIO(0xfd4)
> >  #define SF_MCR_SELECTOR				_MMIO(0xfd8)
> >  #define GEN8_MCR_SELECTOR			_MMIO(0xfdc)
> >  #define GAM_MCR_SELECTOR			_MMIO(0xfe0)
> > @@ -71,6 +72,8 @@
> >  #define   GEN11_MCR_SLICE_MASK			GEN11_MCR_SLICE(0xf)
> >  #define   GEN11_MCR_SUBSLICE(subslice)		(((subslice) & 0x7) << 24)
> >  #define   GEN11_MCR_SUBSLICE_MASK		GEN11_MCR_SUBSLICE(0x7)
> > +#define   MTL_MCR_GROUPID			REG_GENMASK(11, 8)
> > +#define   MTL_MCR_INSTANCEID			REG_GENMASK(3, 0)
> >  
> >  #define IPEIR_I965				_MMIO(0x2064)
> >  #define IPEHR_I965				_MMIO(0x2068)
> > @@ -531,6 +534,8 @@
> >  #define   GEN6_MBCTL_BOOT_FETCH_MECH		(1 << 0)
> >  
> >  /* Fuse readout registers for GT */
> > +#define XEHP_FUSE4				_MMIO(0x9114)
> > +#define   GT_L3_EXC_MASK			REG_GENMASK(6, 4)
> >  #define	GEN10_MIRROR_FUSE3			_MMIO(0x9118)
> >  #define   GEN10_L3BANK_PAIR_COUNT		4
> >  #define   GEN10_L3BANK_MASK			0x0F
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> > index 30003d68fd51..77ecbd6be331 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h
> > @@ -60,6 +60,7 @@ enum intel_steering_type {
> >  	MSLICE,
> >  	LNCF,
> >  	GAM,
> > +	DSS,
> >  
> >  	/*
> >  	 * On some platforms there are multiple types of MCR registers that
> > @@ -255,8 +256,6 @@ struct intel_gt {
> >  
> >  		intel_engine_mask_t engine_mask;
> >  
> > -		u32 l3bank_mask;
> > -
> >  		u8 num_engines;
> >  
> >  		/* General presence of SFC units */
> > @@ -268,7 +267,10 @@ struct intel_gt {
> >  		/* Slice/subslice/EU info */
> >  		struct sseu_dev_info sseu;
> >  
> > -		unsigned long mslice_mask;
> > +		union {
> > +			unsigned long mslice_mask;
> > +			unsigned long l3bank_mask;
> > +		};
> >  
> >  		/** @hwconfig: hardware configuration data */
> >  		struct intel_hwconfig hwconfig;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> > index 174c74dcda99..9be048da7fb3 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
> > @@ -1581,12 +1581,28 @@ pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
> >  	wa_mcr_write_clr(wal, GEN8_MISCCPCTL, GEN12_DOP_CLOCK_GATE_RENDER_ENABLE);
> >  }
> >  
> > +static void
> > +mtl_3d_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal)
> > +{
> > +	/*
> > +	 * Unlike older platforms, we no longer setup implicit steering here;
> > +	 * all MCR accesses are explicitly steered.
> > +	 */
> > +	if (drm_debug_enabled(DRM_UT_DRIVER)) {
> > +		struct drm_printer p = drm_debug_printer("MCR Steering:");
> > +
> > +		intel_gt_mcr_report_steering(&p, gt, false);
> > +	}
> > +}
> > +
> >  static void
> >  gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal)
> >  {
> >  	struct drm_i915_private *i915 = gt->i915;
> >  
> > -	if (IS_PONTEVECCHIO(i915))
> > +	if (IS_METEORLAKE(i915) && gt->type == GT_PRIMARY)
> > +		mtl_3d_gt_workarounds_init(gt, wal);
> > +	else if (IS_PONTEVECCHIO(i915))
> >  		pvc_gt_workarounds_init(gt, wal);
> >  	else if (IS_DG2(i915))
> >  		dg2_gt_workarounds_init(gt, wal);
> > diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> > index 38460a0bd7cb..bd1d8e0205a6 100644
> > --- a/drivers/gpu/drm/i915/i915_pci.c
> > +++ b/drivers/gpu/drm/i915/i915_pci.c
> > @@ -1145,6 +1145,7 @@ static const struct intel_device_info mtl_info = {
> >  	.extra_gt_list = xelpmp_extra_gt,
> >  	.has_flat_ccs = 0,
> >  	.has_gmd_id = 1,
> > +	.has_mslice_steering = 0,
> >  	.has_snoop = 1,
> >  	.__runtime.memory_regions = REGION_SMEM | REGION_STOLEN_LMEM,
> >  	.__runtime.platform_engine_mask = BIT(RCS0) | BIT(BCS0) | BIT(CCS0),
> > -- 
> > 2.37.3
> > 

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation



[Index of Archives]     [AMD Graphics]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux