>Subject: Re: [Intel-gfx] [PATCH 1/8] drm/i915/mtl: Define MOCS and PAT tables for MTL > > On Fri, Apr 07, 2023 at 12:12:29AM -0700,
fei.yang@xxxxxxxxx wrote: >> From: Fei Yang fei.yang@xxxxxxxxx >> >> On MTL, GT can no longer allocate on LLC - only the CPU can. >> This, along with the addition of support for ADM/L4 cache, calls for a MOCS/PAT
>> table update. Also defines PTE encode functions for MTL as it has
>> a different PAT index definition than previous platforms. > > It might be best to keep the PTE encoding as a separate patch from the > MOCS/PAT tables. It's a different enough topic that it probably deserves > a patch of its own. Will update in the next revision. >> >> BSpec: 44509, 45101, 44235 >> >> Cc: Matt Roper
matthew.d.roper@xxxxxxxxx >> Cc: Lucas De Marchi
lucas.demarchi@xxxxxxxxx >> Signed-off-by: Madhumitha Tolakanahalli Pradeep >>
madhumitha.tolakanahalli.pradeep@xxxxxxxxx >> Signed-off-by: Aravind Iddamsetty
aravind.iddamsetty@xxxxxxxxx >> Signed-off-by: Fei Yang
fei.yang@xxxxxxxxx >> --- >> drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 28 +++++++++ >> drivers/gpu/drm/i915/gt/gen8_ppgtt.h | 3 + >> drivers/gpu/drm/i915/gt/intel_ggtt.c | 27 +++++++++ >> drivers/gpu/drm/i915/gt/intel_gtt.c | 23 +++++++- >> drivers/gpu/drm/i915/gt/intel_gtt.h | 20 ++++++- >> drivers/gpu/drm/i915/gt/intel_mocs.c | 76 +++++++++++++++++++++++-- >> drivers/gpu/drm/i915/gt/selftest_mocs.c | 2 +- >> drivers/gpu/drm/i915/i915_pci.c | 1 + >> 8 files changed, 173 insertions(+), 7 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c >> b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c >> index 4daaa6f55668..df4073d32114 100644 >> --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c >> +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c >> @@ -55,6 +55,34 @@ static u64 gen8_pte_encode(dma_addr_t addr, >> return pte; >> } >> >> +static u64 mtl_pte_encode(dma_addr_t addr, >> + enum i915_cache_level level, >> + u32 flags) >> +{ >> + gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW; >> + >> + if (unlikely(flags & PTE_READ_ONLY)) >> + pte &= ~GEN8_PAGE_RW; >> + >> + if (flags & PTE_LM) >> + pte |= GEN12_PPGTT_PTE_LM | GEN12_PPGTT_PTE_NC; >> + >> + switch (level) { >> + case I915_CACHE_NONE: >> + pte |= GEN12_PPGTT_PTE_PAT1; >> + break; >> + case I915_CACHE_LLC: >> + case I915_CACHE_L3_LLC: >> + pte |= GEN12_PPGTT_PTE_PAT0 | GEN12_PPGTT_PTE_PAT1; >> + break; >> + case I915_CACHE_WT: >> + pte |= GEN12_PPGTT_PTE_PAT0; >> + break; >> + } >> + >> + return pte; >> +} >> + >> static void gen8_ppgtt_notify_vgt(struct i915_ppgtt *ppgtt, bool
>> create) { >> struct drm_i915_private *i915 = ppgtt->vm.i915; diff --git
>> a/drivers/gpu/drm/i915/gt/gen8_ppgtt.h >> b/drivers/gpu/drm/i915/gt/gen8_ppgtt.h >> index f541d19264b4..6b8ce7f4d25a 100644 >> --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.h >> +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.h >> @@ -18,5 +18,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt
>> *gt, >> u64 gen8_ggtt_pte_encode(dma_addr_t addr, >> enum i915_cache_level level, >> u32 flags); >> +u64 mtl_ggtt_pte_encode(dma_addr_t addr, >> + unsigned int pat_index, >> + u32 flags); >> >> #endif >> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c >> b/drivers/gpu/drm/i915/gt/intel_ggtt.c >> index 3c7f1ed92f5b..4a16bfcde1de 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c >> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c >> @@ -220,6 +220,33 @@ static void guc_ggtt_invalidate(struct i915_ggtt *ggtt) >> } >> } >> >> +u64 mtl_ggtt_pte_encode(dma_addr_t addr, >> + enum i915_cache_level level, >> + u32 flags) >> +{ >> + gen8_pte_t pte = addr | GEN8_PAGE_PRESENT; >> + >> + GEM_BUG_ON(addr & ~GEN12_GGTT_PTE_ADDR_MASK); >> + >> + if (flags & PTE_LM) >> + pte |= GEN12_GGTT_PTE_LM; >> + >> + switch (level) { >> + case I915_CACHE_NONE: >> + pte |= MTL_GGTT_PTE_PAT1; >> + break; >> + case I915_CACHE_LLC: >> + case I915_CACHE_L3_LLC: >> + pte |= MTL_GGTT_PTE_PAT0 | MTL_GGTT_PTE_PAT1; >> + break; >> + case I915_CACHE_WT: >> + pte |= MTL_GGTT_PTE_PAT0; >> + break; >> + } >> + >> + return pte; >> +} >> + >> u64 gen8_ggtt_pte_encode(dma_addr_t addr, >> enum i915_cache_level level, >> u32 flags) >> diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c >> b/drivers/gpu/drm/i915/gt/intel_gtt.c >> index 4f436ba7a3c8..1e1b34e22cf5 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gtt.c >> +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c >> @@ -468,6 +468,25 @@ void gtt_write_workarounds(struct intel_gt *gt) >> } >> } >> >> +static void mtl_setup_private_ppat(struct intel_uncore *uncore) { >> + intel_uncore_write(uncore, GEN12_PAT_INDEX(0), >> + MTL_PPAT_L4_0_WB); >> + intel_uncore_write(uncore, GEN12_PAT_INDEX(1), >> + MTL_PPAT_L4_1_WT); >> + intel_uncore_write(uncore, GEN12_PAT_INDEX(2), >> + MTL_PPAT_L4_3_UC); >> + intel_uncore_write(uncore, GEN12_PAT_INDEX(3), >> + MTL_PPAT_L4_0_WB | MTL_2_COH_1W); >> + intel_uncore_write(uncore, GEN12_PAT_INDEX(4), >> + MTL_PPAT_L4_0_WB | MTL_3_COH_2W); > 
> These registers are multicast registers on the primary GT. We need > to use XEHP_PAT_INDEX and the MCR-aware functions on the primary GT. The multicast change has been merged to a different branch and has not been validated extensively on MTL so far. Will post it to the mailing list after proper validation. > Although we're not programming the higher PAT entries on MTL (yet), > there's also a jump in the MMIO offsets that we should incorporate > into the PAT_INDEX macro to future-proof the code. > >> + >> /* >> * Remaining PAT entries are left at the hardware-default >> * fully-cached setting >> */ >> +} >> + >> static void tgl_setup_private_ppat(struct intel_uncore *uncore) { >> /* TGL doesn't support LLC or AGE settings */ @@ -603,7 +622,9 @@
>> void setup_private_pat(struct intel_gt *gt) >> >> GEM_BUG_ON(GRAPHICS_VER(i915) < 8); >> >> - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) >> + if (IS_METEORLAKE(i915)) >> + mtl_setup_private_ppat(uncore); > > We need to handle the primary GT and media GT separately since the > media GT registers are unicast whereas primary are multicast. Same as above. >See >
https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_pat.c > for how this was handled in the Xe driver. > >> + else if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) >> xehp_setup_private_ppat(gt); >> else if (GRAPHICS_VER(i915) >= 12) >> tgl_setup_private_ppat(uncore); >> diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h >> b/drivers/gpu/drm/i915/gt/intel_gtt.h >> index 69ce55f517f5..b632167eaf2e 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_gtt.h >> +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h >> @@ -88,9 +88,18 @@ typedef u64 gen8_pte_t; >> #define BYT_PTE_SNOOPED_BY_CPU_CACHES REG_BIT(2) >> #define BYT_PTE_WRITEABLE REG_BIT(1) >> >> +#define GEN12_PPGTT_PTE_PAT3 BIT_ULL(62) >> #define GEN12_PPGTT_PTE_LM BIT_ULL(11) >> +#define GEN12_PPGTT_PTE_PAT2 BIT_ULL(7) >> +#define GEN12_PPGTT_PTE_NC BIT_ULL(5) >> +#define GEN12_PPGTT_PTE_PAT1 BIT_ULL(4) >> +#define GEN12_PPGTT_PTE_PAT0 BIT_ULL(3) > > Which bspec page is this from? The PPGTT descriptions in > the bspec are kind of hard to track down. https://gfxspecs.intel.com/Predator/Home/Index/53521 PAT_Index[2:0] = {PAT, PCD, PWT} = BIT(7, 4, 3) PAT_Index[3] = BIT(62) PAT_Index[4] = BIT(61) > But if these only apply to MTL, why are they labelled as "GEN12?" These apply to GEN12plus. >> >> -#define GEN12_GGTT_PTE_LM BIT_ULL(1) >> +#define GEN12_GGTT_PTE_LM BIT_ULL(1) >> +#define MTL_GGTT_PTE_PAT0 BIT_ULL(52) >> +#define MTL_GGTT_PTE_PAT1 BIT_ULL(53) >> +#define GEN12_GGTT_PTE_ADDR_MASK GENMASK_ULL(45, 12) >> +#define MTL_GGTT_PTE_PAT_MASK GENMASK_ULL(53, 52) >> >> #define GEN12_PDE_64K BIT(6) >> #define GEN12_PTE_PS64 BIT(8) >> @@ -147,6 +156,15 @@ typedef u64 gen8_pte_t; #define GEN8_PDE_IPS_64K
>> BIT(11) >> #define GEN8_PDE_PS_2M BIT(7) >> >> +#define MTL_PPAT_L4_CACHE_POLICY_MASK REG_GENMASK(3, 2) >> +#define MTL_PAT_INDEX_COH_MODE_MASK REG_GENMASK(1, 0) >> +#define MTL_PPAT_L4_3_UC REG_FIELD_PREP(MTL_PPAT_L4_CACHE_POLICY_MASK, 3) >> +#define MTL_PPAT_L4_1_WT REG_FIELD_PREP(MTL_PPAT_L4_CACHE_POLICY_MASK, 1) >> +#define MTL_PPAT_L4_0_WB REG_FIELD_PREP(MTL_PPAT_L4_CACHE_POLICY_MASK, 0) >> +#define MTL_3_COH_2W REG_FIELD_PREP(MTL_PAT_INDEX_COH_MODE_MASK, 3) >> +#define MTL_2_COH_1W REG_FIELD_PREP(MTL_PAT_INDEX_COH_MODE_MASK, 2) >> +#define MTL_0_COH_NON REG_FIELD_PREP(MTL_PAT_INDEX_COH_MODE_MASK, 0) > >The values for these definitions don't seem to be aligned. These are aligned with
https://gfxspecs.intel.com/Predator/Home/Index/45101 >> + >> enum i915_cache_level; >> >> struct drm_i915_gem_object; >> diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c >> b/drivers/gpu/drm/i915/gt/intel_mocs.c >> index 69b489e8dfed..89570f137b2c 100644 >> --- a/drivers/gpu/drm/i915/gt/intel_mocs.c >> +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c >> @@ -40,6 +40,10 @@ struct drm_i915_mocs_table { >> #define LE_COS(value) ((value) << 15) >> #define LE_SSE(value) ((value) << 17) >> >> +/* Defines for the tables (GLOB_MOCS_0 - GLOB_MOCS_16) */ >> +#define _L4_CACHEABILITY(value) ((value) << 2) > > We should use REG_GENMASK + REG_FIELD_PREP for new code, which will help > ensure that we don't try to pack a value into a field that's too big to fit. > >> +#define IG_PAT(value) ((value) << 8) This is used in defining the MOCS table. I think it's easier to read with IG_PAT(0) and IG_PAT(1) than with and without IG_PAT. > Do we need this one parameterized? It's just a boolean flag, so > > #define IG_PAT REG_BIT(8) > > would be fine (and then don't include the flag on the entries where it isn't set). > >> + >> /* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ >> #define L3_ESC(value) ((value) << 0) >> #define L3_SCC(value) ((value) << 1) >> @@ -50,6 +54,7 @@ struct drm_i915_mocs_table { >> /* Helper defines */ >> #define GEN9_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ >> #define PVC_NUM_MOCS_ENTRIES 3 >> +#define MTL_NUM_MOCS_ENTRIES 16 >> >> /* (e)LLC caching options */ >> /* >> @@ -73,6 +78,12 @@ struct drm_i915_mocs_table { >> #define L3_2_RESERVED _L3_CACHEABILITY(2) >> #define L3_3_WB _L3_CACHEABILITY(3) >> >> +/* L4 caching options */ >> +#define L4_0_WB _L4_CACHEABILITY(0) >> +#define L4_1_WT _L4_CACHEABILITY(1) >> +#define L4_2_RESERVED _L4_CACHEABILITY(2) > > This definition is unnecessary and unused. 
> >> +#define L4_3_UC _L4_CACHEABILITY(3) >> + >> #define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \ >> [__idx] = { \ >> .control_value = __control_value, \ @@ -416,6 +427,57 @@ static
>> const struct drm_i915_mocs_entry pvc_mocs_table[] = { >> MOCS_ENTRY(2, 0, L3_3_WB), >> }; >> >> +static const struct drm_i915_mocs_entry mtl_mocs_table[] = { > > The entries below with WB L4 cache don't include L4_0_WB. That works > out okay since the value is 0, but it seems like it would still be worth > including it, just to make the table meaning more clear. > > Conversely, I'd drop the "IG_PAT(0)" on entry 0 since that's just a simple > boolean flag, so the "0" there is easier to misread than just not including > it at all. > >> + /* Error - Reserved for Non-Use */ >> + MOCS_ENTRY(0, >> + IG_PAT(0), >> + L3_LKUP(1) | L3_3_WB), >> + /* Cached - L3 + L4 */ >> + MOCS_ENTRY(1, >> + IG_PAT(1), >> + L3_LKUP(1) | L3_3_WB), >> + /* L4 - GO:L3 */ >> + MOCS_ENTRY(2, >> + IG_PAT(1), >> + L3_LKUP(1) | L3_1_UC), >> + /* Uncached - GO:L3 */ >> + MOCS_ENTRY(3, >> + IG_PAT(1) | L4_3_UC, >> + L3_LKUP(1) | L3_1_UC), >> + /* L4 - GO:Mem */ >> + MOCS_ENTRY(4, >> + IG_PAT(1), >> + L3_LKUP(1) | L3_GLBGO(1) | L3_1_UC), >> + /* Uncached - GO:Mem */ >> + MOCS_ENTRY(5, >> + IG_PAT(1) | L4_3_UC, >> + L3_LKUP(1) | L3_GLBGO(1) | L3_1_UC), >> + /* L4 - L3:NoLKUP; GO:L3 */ >> + MOCS_ENTRY(6, >> + IG_PAT(1), >> + L3_1_UC), >> + /* Uncached - L3:NoLKUP; GO:L3 */ >> + MOCS_ENTRY(7, >> + IG_PAT(1) | L4_3_UC, >> + L3_1_UC), >> + /* L4 - L3:NoLKUP; GO:Mem */ >> + MOCS_ENTRY(8, >> + IG_PAT(1), >> + L3_GLBGO(1) | L3_1_UC), >> + /* Uncached - L3:NoLKUP; GO:Mem */ >> + MOCS_ENTRY(9, >> + IG_PAT(1) | L4_3_UC, >> + L3_GLBGO(1) | L3_1_UC), >> + /* Display - L3; L4:WT */ >> + MOCS_ENTRY(14, >> + IG_PAT(1) | L4_1_WT, >> + L3_LKUP(1) | L3_3_WB), >> + /* CCS - Non-Displayable */ >> + MOCS_ENTRY(15, >> + IG_PAT(1), >> + L3_GLBGO(1) | L3_1_UC), >> +}; >> + >> enum { >> HAS_GLOBAL_MOCS = BIT(0), >> HAS_ENGINE_MOCS = BIT(1), >> @@ -445,7 +507,13 @@ static unsigned int get_mocs_settings(const struct drm_i915_private *i915, >> memset(table, 0, sizeof(struct drm_i915_mocs_table)); >> >> 
table->unused_entries_index = I915_MOCS_PTE; >> - if (IS_PONTEVECCHIO(i915)) { >> + if (IS_METEORLAKE(i915)) { >> + table->size = ARRAY_SIZE(mtl_mocs_table); >> + table->table = mtl_mocs_table; >> + table->n_entries = MTL_NUM_MOCS_ENTRIES; >> + table->uc_index = 9; >> + table->unused_entries_index = 1; >> + } else if (IS_PONTEVECCHIO(i915)) { >> table->size = ARRAY_SIZE(pvc_mocs_table); >> table->table = pvc_mocs_table; >> table->n_entries = PVC_NUM_MOCS_ENTRIES; @@ -646,9 +714,9 @@ void
>> intel_mocs_init_engine(struct intel_engine_cs *engine) >> init_l3cc_table(engine->gt, &table); } >> >> -static u32 global_mocs_offset(void) >> +static u32 global_mocs_offset(struct intel_gt *gt) >> { >> - return i915_mmio_reg_offset(GEN12_GLOBAL_MOCS(0)); >> + return i915_mmio_reg_offset(GEN12_GLOBAL_MOCS(0)) +
>> +gt->uncore->gsi_offset; > > Isn't this going to double-apply the GSI offset in __init_mocs_table? Good point. I'm not sure if there are test cases to validate this, but we do need to double-check. > We do need to handle the offset manually in read_mocs_table() since > those reads are done with an MI_STORE_REGISTER_MEM instruction. > > Handling the GSI offset properly deserves its own patch with a dedicated > commit message explaining the details. > >> } >> >> void intel_set_mocs_index(struct intel_gt *gt) @@ -671,7 +739,7 @@
>> void intel_mocs_init(struct intel_gt *gt) >> */ >> flags = get_mocs_settings(gt->i915, &table); >> if (flags & HAS_GLOBAL_MOCS) >> - __init_mocs_table(gt->uncore, &table, global_mocs_offset()); >> + __init_mocs_table(gt->uncore, &table, global_mocs_offset(gt)); >> >> /* >> * Initialize the L3CC table as part of mocs initalization to make
>> diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c >> b/drivers/gpu/drm/i915/gt/selftest_mocs.c >> index ca009a6a13bd..730796346514 100644 >> --- a/drivers/gpu/drm/i915/gt/selftest_mocs.c >> +++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c >> @@ -137,7 +137,7 @@ static int read_mocs_table(struct i915_request *rq, >> return 0; >> >> if (HAS_GLOBAL_MOCS_REGISTERS(rq->engine->i915)) >> - addr = global_mocs_offset(); >> + addr = global_mocs_offset(rq->engine->gt); >> else >> addr = mocs_offset(rq->engine); >> >> diff --git a/drivers/gpu/drm/i915/i915_pci.c >> b/drivers/gpu/drm/i915/i915_pci.c index cddb6e197972..025d32c0b161
>> 100644 >> --- a/drivers/gpu/drm/i915/i915_pci.c >> +++ b/drivers/gpu/drm/i915/i915_pci.c >> @@ -1146,6 +1146,7 @@ static const struct intel_device_info mtl_info = { >> .has_flat_ccs = 0, >> .has_gmd_id = 1, >> .has_guc_deprivilege = 1, >> + .has_llc = 0, > > This should also be in its own patch since it isn't directly related to the > MOCS / PAT tables. Will update in the next revision. > Matt > >> .has_mslice_steering = 0, >> .has_snoop = 1, >> .__runtime.memory_regions = REGION_SMEM | REGION_STOLEN_LMEM, >> -- >> 2.25.1 >> > > -- > Matt Roper > Graphics Software Engineer > Linux GPU Platform Enablement > Intel Corporation > |