Hi, Matt On Tue, 2024-07-02 at 16:06 +0100, Matthew Auld wrote: > This involves enabling l2 caching of host side memory access to VRAM > through the CPU BAR. The main fallout here is with display since VRAM > writes from CPU can now be cached in GPU l2, and display is never > coherent with caches, so needs various manual flushing. In the case > of > fbc we disable it due to complications in getting this to work > correctly (in a later patch). What about user-space accesses to framebuffers? /Thomas > > Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx> > Cc: Jonathan Cavitt <jonathan.cavitt@xxxxxxxxx> > Cc: Matt Roper <matthew.d.roper@xxxxxxxxx> > Cc: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> > Cc: Vinod Govindapillai <vinod.govindapillai@xxxxxxxxx> > Reviewed-by: Jonathan Cavitt <jonathan.cavitt@xxxxxxxxx> > --- > drivers/gpu/drm/xe/Makefile | 2 + > drivers/gpu/drm/xe/display/xe_dsb_buffer.c | 8 ++++ > drivers/gpu/drm/xe/display/xe_fb_pin.c | 3 ++ > drivers/gpu/drm/xe/regs/xe_gt_regs.h | 8 ++++ > drivers/gpu/drm/xe/xe_device.c | 30 ++++++++++++ > drivers/gpu/drm/xe/xe_device.h | 1 + > drivers/gpu/drm/xe/xe_gt.c | 54 > ++++++++++++++++++++++ > drivers/gpu/drm/xe/xe_pat.c | 11 ++++- > drivers/gpu/drm/xe/xe_wa_oob.rules | 1 + > 9 files changed, 117 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/xe/Makefile > b/drivers/gpu/drm/xe/Makefile > index b1e03bfe4a68..970c5c09e20a 100644 > --- a/drivers/gpu/drm/xe/Makefile > +++ b/drivers/gpu/drm/xe/Makefile > @@ -25,12 +25,14 @@ $(obj)/generated/%_wa_oob.c > $(obj)/generated/%_wa_oob.h: $(obj)/xe_gen_wa_oob \ > > uses_generated_oob := \ > $(obj)/xe_ggtt.o \ > + $(obj)/xe_device.o \ > $(obj)/xe_gsc.o \ > $(obj)/xe_gt.o \ > $(obj)/xe_guc.o \ > $(obj)/xe_guc_ads.o \ > $(obj)/xe_guc_pc.o \ > $(obj)/xe_migrate.o \ > + $(obj)/xe_pat.o \ > $(obj)/xe_ring_ops.o \ > $(obj)/xe_vm.o \ > $(obj)/xe_wa.o \ > diff --git a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c > b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c > index 
9e860c61f4b3..ccd0d87d438a 100644 > --- a/drivers/gpu/drm/xe/display/xe_dsb_buffer.c > +++ b/drivers/gpu/drm/xe/display/xe_dsb_buffer.c > @@ -7,6 +7,8 @@ > #include "intel_display_types.h" > #include "intel_dsb_buffer.h" > #include "xe_bo.h" > +#include "xe_device.h" > +#include "xe_device_types.h" > #include "xe_gt.h" > > u32 intel_dsb_buffer_ggtt_offset(struct intel_dsb_buffer *dsb_buf) > @@ -16,7 +18,10 @@ u32 intel_dsb_buffer_ggtt_offset(struct > intel_dsb_buffer *dsb_buf) > > void intel_dsb_buffer_write(struct intel_dsb_buffer *dsb_buf, u32 > idx, u32 val) > { > + struct xe_device *xe = dsb_buf->vma->bo->tile->xe; > + > iosys_map_wr(&dsb_buf->vma->bo->vmap, idx * 4, u32, val); > + xe_device_l2_flush(xe); > } > > u32 intel_dsb_buffer_read(struct intel_dsb_buffer *dsb_buf, u32 idx) > @@ -26,9 +31,12 @@ u32 intel_dsb_buffer_read(struct intel_dsb_buffer > *dsb_buf, u32 idx) > > void intel_dsb_buffer_memset(struct intel_dsb_buffer *dsb_buf, u32 > idx, u32 val, size_t size) > { > + struct xe_device *xe = dsb_buf->vma->bo->tile->xe; > + > WARN_ON(idx > (dsb_buf->buf_size - size) / sizeof(*dsb_buf- > >cmd_buf)); > > iosys_map_memset(&dsb_buf->vma->bo->vmap, idx * 4, val, > size); > + xe_device_l2_flush(xe); > } > > bool intel_dsb_buffer_create(struct intel_crtc *crtc, struct > intel_dsb_buffer *dsb_buf, size_t size) > diff --git a/drivers/gpu/drm/xe/display/xe_fb_pin.c > b/drivers/gpu/drm/xe/display/xe_fb_pin.c > index 423f367c7065..d7db44e79eaf 100644 > --- a/drivers/gpu/drm/xe/display/xe_fb_pin.c > +++ b/drivers/gpu/drm/xe/display/xe_fb_pin.c > @@ -10,6 +10,7 @@ > #include "intel_fb.h" > #include "intel_fb_pin.h" > #include "xe_bo.h" > +#include "xe_device.h" > #include "xe_ggtt.h" > #include "xe_gt.h" > #include "xe_pm.h" > @@ -304,6 +305,8 @@ static struct i915_vma *__xe_pin_fb_vma(const > struct intel_framebuffer *fb, > if (ret) > goto err_unpin; > > + /* Ensure DPT writes are flushed */ > + xe_device_l2_flush(xe); > return vma; > > err_unpin: > diff --git 
a/drivers/gpu/drm/xe/regs/xe_gt_regs.h > b/drivers/gpu/drm/xe/regs/xe_gt_regs.h > index d44564bad009..fd9d94174efb 100644 > --- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h > +++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h > @@ -80,6 +80,9 @@ > #define LE_CACHEABILITY_MASK REG_GENMASK(1, 0) > #define > LE_CACHEABILITY(value) REG_FIELD_PREP(LE_CACHEABILITY_MASK, value) > > +#define XE2_GAMREQSTRM_CTRL XE_REG(0x4194) > +#define CG_DIS_CNTLBUS REG_BIT(6) > + > #define CCS_AUX_INV XE_REG(0x4208) > > #define VD0_AUX_INV XE_REG(0x4218) > @@ -372,6 +375,11 @@ > > #define XEHPC_L3CLOS_MASK(i) XE_REG_MCR(0xb194 + > (i) * 8) > > +#define XE2_GLOBAL_INVAL XE_REG(0xb404) > + > +#define SCRATCH1LPFC XE_REG(0xb474) > +#define EN_L3_RW_CCS_CACHE_FLUSH REG_BIT(0) > + > #define XE2LPM_L3SQCREG5 XE_REG_MCR(0xb658) > > #define XE2_TDF_CTRL XE_REG(0xb418) > diff --git a/drivers/gpu/drm/xe/xe_device.c > b/drivers/gpu/drm/xe/xe_device.c > index cfda7cb5df2c..b0f79ef6bce1 100644 > --- a/drivers/gpu/drm/xe/xe_device.c > +++ b/drivers/gpu/drm/xe/xe_device.c > @@ -54,6 +54,9 @@ > #include "xe_vm.h" > #include "xe_vram.h" > #include "xe_wait_user_fence.h" > +#include "xe_wa.h" > + > +#include <generated/xe_wa_oob.h> > > static int xe_file_open(struct drm_device *dev, struct drm_file > *file) > { > @@ -779,6 +782,11 @@ void xe_device_td_flush(struct xe_device *xe) > if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20) > return; > > + if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) { > + xe_device_l2_flush(xe); > + return; > + } > + > for_each_gt(gt, xe, id) { > if (xe_gt_is_media_type(gt)) > continue; > @@ -802,6 +810,28 @@ void xe_device_td_flush(struct xe_device *xe) > } > } > > +void xe_device_l2_flush(struct xe_device *xe) > +{ > + struct xe_gt *gt; > + int err; > + > + gt = xe_root_mmio_gt(xe); > + > + if (!XE_WA(gt, 16023588340)) > + return; > + > + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); > + if (err) > + return; > + > + xe_mmio_write32(gt, XE2_GLOBAL_INVAL, 0x1); > + > + if (xe_mmio_wait32(gt, 
XE2_GLOBAL_INVAL, 0x1, 0x0, 150, > NULL, true)) > + xe_gt_err_once(gt, "Global invalidation timeout\n"); > + > + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); > +} > + > u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size) > { > return xe_device_has_flat_ccs(xe) ? > diff --git a/drivers/gpu/drm/xe/xe_device.h > b/drivers/gpu/drm/xe/xe_device.h > index bb07f5669dbb..0a2a3e7fd402 100644 > --- a/drivers/gpu/drm/xe/xe_device.h > +++ b/drivers/gpu/drm/xe/xe_device.h > @@ -162,6 +162,7 @@ u64 xe_device_canonicalize_addr(struct xe_device > *xe, u64 address); > u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 > address); > > void xe_device_td_flush(struct xe_device *xe); > +void xe_device_l2_flush(struct xe_device *xe); > > static inline bool xe_device_wedged(struct xe_device *xe) > { > diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c > index 29e8ea94d05e..006d3594ba55 100644 > --- a/drivers/gpu/drm/xe/xe_gt.c > +++ b/drivers/gpu/drm/xe/xe_gt.c > @@ -11,6 +11,8 @@ > #include <drm/xe_drm.h> > #include <generated/xe_wa_oob.h> > > +#include <generated/xe_wa_oob.h> > + > #include "instructions/xe_gfxpipe_commands.h" > #include "instructions/xe_mi_commands.h" > #include "regs/xe_gt_regs.h" > @@ -95,6 +97,51 @@ void xe_gt_sanitize(struct xe_gt *gt) > gt->uc.guc.submission_state.enabled = false; > } > > +static void xe_gt_enable_host_l2_vram(struct xe_gt *gt) > +{ > + u32 reg; > + int err; > + > + if (!XE_WA(gt, 16023588340)) > + return; > + > + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); > + if (WARN_ON(err)) > + return; > + > + if (!xe_gt_is_media_type(gt)) { > + xe_mmio_write32(gt, SCRATCH1LPFC, > EN_L3_RW_CCS_CACHE_FLUSH); > + reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL); > + reg |= CG_DIS_CNTLBUS; > + xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg); > + } > + > + xe_gt_mcr_multicast_write(gt, XEHPC_L3CLOS_MASK(3), 0x3); > + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); > +} > + > +static void xe_gt_disable_host_l2_vram(struct xe_gt *gt) > +{ > 
+ u32 reg; > + int err; > + > + if (!XE_WA(gt, 16023588340)) > + return; > + > + if (xe_gt_is_media_type(gt)) > + return; > + > + err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); > + if (WARN_ON(err)) > + return; > + > + reg = xe_mmio_read32(gt, XE2_GAMREQSTRM_CTRL); > + reg &= ~CG_DIS_CNTLBUS; > + xe_mmio_write32(gt, XE2_GAMREQSTRM_CTRL, reg); > + > + xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); > +} > + > /** > * xe_gt_remove() - Clean up the GT structures before driver removal > * @gt: the GT object > @@ -111,6 +158,8 @@ void xe_gt_remove(struct xe_gt *gt) > > for (i = 0; i < XE_ENGINE_CLASS_MAX; ++i) > xe_hw_fence_irq_finish(&gt->fence_irq[i]); > + > + xe_gt_disable_host_l2_vram(gt); > } > > static void gt_reset_worker(struct work_struct *w); > @@ -508,6 +557,7 @@ int xe_gt_init_hwconfig(struct xe_gt *gt) > > xe_gt_mcr_init_early(gt); > xe_pat_init(gt); > + xe_gt_enable_host_l2_vram(gt); > > err = xe_uc_init(&gt->uc); > if (err) > @@ -643,6 +693,8 @@ static int do_gt_restart(struct xe_gt *gt) > > xe_pat_init(gt); > > + xe_gt_enable_host_l2_vram(gt); > + > xe_gt_mcr_set_implicit_defaults(gt); > xe_reg_sr_apply_mmio(&gt->reg_sr, gt); > > @@ -796,6 +848,8 @@ int xe_gt_suspend(struct xe_gt *gt) > > xe_gt_idle_disable_pg(gt); > > + xe_gt_disable_host_l2_vram(gt); > + > XE_WARN_ON(xe_force_wake_put(gt_to_fw(gt), > XE_FORCEWAKE_ALL)); > xe_gt_dbg(gt, "suspended\n"); > > diff --git a/drivers/gpu/drm/xe/xe_pat.c > b/drivers/gpu/drm/xe/xe_pat.c > index 4ee32ee1cc88..722278cc23fc 100644 > --- a/drivers/gpu/drm/xe/xe_pat.c > +++ b/drivers/gpu/drm/xe/xe_pat.c > @@ -7,6 +7,8 @@ > > #include <drm/xe_drm.h> > > +#include <generated/xe_wa_oob.h> > + > #include "regs/xe_reg_defs.h" > #include "xe_assert.h" > #include "xe_device.h" > @@ -15,6 +17,7 @@ > #include "xe_gt_mcr.h" > #include "xe_mmio.h" > #include "xe_sriov.h" > +#include "xe_wa.h" > > #define _PAT_ATS 0x47fc > #define > _PAT_INDEX(index) _PICK_EVEN_2RANGES(index, 8, \ > @@ -382,7 +385,13 @@ void xe_pat_init_early(struct 
xe_device *xe) > if (GRAPHICS_VER(xe) == 20) { > xe->pat.ops = &xe2_pat_ops; > xe->pat.table = xe2_pat_table; > - xe->pat.n_entries = ARRAY_SIZE(xe2_pat_table); > + > + /* Wa_16023588340. XXX: Should use XE_WA */ > + if (GRAPHICS_VERx100(xe) == 2001) > + xe->pat.n_entries = 28; /* Disable CLOS3 */ > + else > + xe->pat.n_entries = > ARRAY_SIZE(xe2_pat_table); > + > xe->pat.idx[XE_CACHE_NONE] = 3; > xe->pat.idx[XE_CACHE_WT] = 15; > xe->pat.idx[XE_CACHE_WB] = 2; > diff --git a/drivers/gpu/drm/xe/xe_wa_oob.rules > b/drivers/gpu/drm/xe/xe_wa_oob.rules > index a6b897030fde..c6d8941621c6 100644 > --- a/drivers/gpu/drm/xe/xe_wa_oob.rules > +++ b/drivers/gpu/drm/xe/xe_wa_oob.rules > @@ -28,3 +28,4 @@ > GRAPHICS_VERSION(2004) > 13011645652 GRAPHICS_VERSION(2004) > 22019338487 MEDIA_VERSION(2000) > +16023588340 GRAPHICS_VERSION(2001)