On Fri, 21 Jan 2022, Adrian Larumbe <adrian.larumbe@xxxxxxxxxxxxx> wrote:
> Because the smem-evicted bo that holds the CCS block has to be blitted
> separately from the original compressed bo, two sets of PTEs have to
> be emitted for every bo copy.
>
> This commit is partially based on another commit from Ramalingam C
> <ramalingam.c@xxxxxxxxx>, currently under discussion.
>
> Signed-off-by: Adrian Larumbe <adrian.larumbe@xxxxxxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/gt/intel_migrate.c    | 288 +++++++++++++++------
>  drivers/gpu/drm/i915/gt/intel_migrate.h    |   2 +
>  drivers/gpu/drm/i915/gt/selftest_migrate.c |   3 +-
>  3 files changed, 207 insertions(+), 86 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> index 716f2f51c7f9..da0fcc42c43c 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -447,14 +447,183 @@ static bool wa_1209644611_applies(int ver, u32 size)
>  	return height % 4 == 3 && height <= 8;
>  }
>
> +static inline u32 *i915_flush_dw(u32 *cmd, u64 dst, u32 flags)

As a general rule, please don't use the inline keyword in .c files;
just let the compiler decide. It's a premature optimization, and you
won't get a warning if the function ever becomes unused.
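For example (an untested sketch on my side, with the same body the
patch adds), this can simply be:

static u32 *i915_flush_dw(u32 *cmd, u64 dst, u32 flags)
{
	/* MI_FLUSH_DW, destination address split into lower/upper dwords */
	*cmd++ = MI_FLUSH_DW | flags;
	*cmd++ = lower_32_bits(dst);
	*cmd++ = upper_32_bits(dst);

	return cmd;
}

gcc and clang will inline a small static helper like this on their own
at any reasonable optimization level, and you keep the
-Wunused-function warning if the last caller ever goes away.

BR,
Jani.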
> +{
> +	/* Mask the 3 LSB to use the PPGTT address space */
> +	*cmd++ = MI_FLUSH_DW | flags;
> +	*cmd++ = lower_32_bits(dst);
> +	*cmd++ = upper_32_bits(dst);
> +
> +	return cmd;
> +}
> +
> +static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size)
> +{
> +	u32 num_cmds, num_blks, total_size;
> +
> +	if (!GET_CCS_SIZE(i915, size))
> +		return 0;
> +
> +	/*
> +	 * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte
> +	 * blocks. One XY_CTRL_SURF_COPY_BLT command can
> +	 * transfer up to 1024 blocks.
> +	 */
> +	num_blks = (GET_CCS_SIZE(i915, size) +
> +		    (NUM_CCS_BYTES_PER_BLOCK - 1)) >> 8;
> +	num_cmds = (num_blks + (NUM_CCS_BLKS_PER_XFER - 1)) >> 10;
> +	total_size = (XY_CTRL_SURF_INSTR_SIZE) * num_cmds;
> +
> +	/*
> +	 * We need to add a flush before and after
> +	 * XY_CTRL_SURF_COPY_BLT
> +	 */
> +	total_size += 2 * MI_FLUSH_DW_SIZE;
> +	return total_size;
> +}
> +
> +static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr,
> +				     u8 src_mem_access, u8 dst_mem_access,
> +				     int src_mocs, int dst_mocs,
> +				     u16 num_ccs_blocks)
> +{
> +	int i = num_ccs_blocks;
> +
> +	/*
> +	 * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
> +	 * data in and out of the CCS region.
> +	 *
> +	 * We can copy at most 1024 blocks of 256 bytes using one
> +	 * XY_CTRL_SURF_COPY_BLT instruction.
> +	 *
> +	 * In case we need to copy more than 1024 blocks, we need to add
> +	 * another instruction to the same batch buffer.
> +	 *
> +	 * 1024 blocks of 256 bytes of CCS represent a total 256KB of CCS.
> +	 *
> +	 * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
> +	 */
> +	do {
> +		/*
> +		 * We use bitwise AND with 1023 since the size field
> +		 * takes values in the range 0 - 1023
> +		 */
> +		*cmd++ = ((XY_CTRL_SURF_COPY_BLT) |
> +			  (src_mem_access << SRC_ACCESS_TYPE_SHIFT) |
> +			  (dst_mem_access << DST_ACCESS_TYPE_SHIFT) |
> +			  (((i - 1) & 1023) << CCS_SIZE_SHIFT));
> +		*cmd++ = lower_32_bits(src_addr);
> +		*cmd++ = ((upper_32_bits(src_addr) & 0xFFFF) |
> +			  (src_mocs << XY_CTRL_SURF_MOCS_SHIFT));
> +		*cmd++ = lower_32_bits(dst_addr);
> +		*cmd++ = ((upper_32_bits(dst_addr) & 0xFFFF) |
> +			  (dst_mocs << XY_CTRL_SURF_MOCS_SHIFT));
> +		src_addr += SZ_64M;
> +		dst_addr += SZ_64M;
> +		i -= NUM_CCS_BLKS_PER_XFER;
> +	} while (i > 0);
> +
> +	return cmd;
> +}
> +
> +static int emit_ccs(struct i915_request *rq,
> +		    struct sgt_dma *it_lmem,
> +		    enum i915_cache_level lmem_cache_level,
> +		    struct sgt_dma *it_css,
> +		    enum i915_cache_level css_cache_level,
> +		    bool lmem2smem,
> +		    int size)
> +{
> +	struct drm_i915_private *i915 = rq->engine->i915;
> +	u32 num_ccs_blks = (GET_CCS_SIZE(i915, size) +
> +			    NUM_CCS_BYTES_PER_BLOCK - 1) >> 8;
> +	struct sgt_dma *it_src, *it_dst;
> +	enum i915_cache_level src_cache_level;
> +	enum i915_cache_level dst_cache_level;
> +	u8 src_access, dst_access;
> +	u32 src_offset, dst_offset;
> +	u32 ccs_ring_size;
> +	int err, len;
> +	u32 *cs;
> +
> +	ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
> +
> +	err = emit_no_arbitration(rq);
> +	if (err)
> +		return err;
> +
> +	src_offset = 0;
> +	dst_offset = CHUNK_SZ;
> +	if (HAS_64K_PAGES(i915)) {
> +		src_offset = 0;
> +		dst_offset = 0;
> +		if (lmem2smem)
> +			src_offset = CHUNK_SZ;
> +		else
> +			dst_offset = 2 * CHUNK_SZ;
> +	}
> +
> +	if (lmem2smem) {
> +		it_src = it_lmem;
> +		it_dst = it_css;
> +		src_cache_level = lmem_cache_level;
> +		dst_cache_level = css_cache_level;
> +	} else {
> +		it_src = it_css;
> +		it_dst = it_lmem;
> +		src_cache_level = css_cache_level;
> +		dst_cache_level = lmem_cache_level;
> +	}
> +
> +	len = emit_pte(rq, it_src, src_cache_level, lmem2smem,
> +		       src_offset, CHUNK_SZ);
> +	if (len <= 0)
> +		return len;
> +
> +	err = emit_pte(rq, it_dst, dst_cache_level, !lmem2smem,
> +		       dst_offset, len);
> +	if (err < 0)
> +		return err;
> +	if (err < len)
> +		return -EINVAL;
> +
> +	err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> +	if (err)
> +		return err;
> +
> +	cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	src_access = !lmem2smem;
> +	dst_access = !src_access;
> +
> +	cs = _i915_ctrl_surf_copy_blt(cs,
> +				      src_offset,
> +				      dst_offset,
> +				      src_access,
> +				      dst_access,
> +				      1, 1,
> +				      num_ccs_blks);
> +	cs = i915_flush_dw(cs, dst_offset, MI_FLUSH_LLC | MI_FLUSH_CCS);
> +	if (ccs_ring_size & 1)
> +		*cs++ = MI_NOOP;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
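Not a blocker, but to make the sizing math above concrete (my own
arithmetic, assuming GET_CCS_SIZE() returns the CCS footprint in bytes
at the 1:256 ratio the comments describe): a 64M chunk of compressed
LMEM carries 64M / 256 = 256K of CCS, i.e. 262144 / 256 = 1024 blocks,
which is exactly one maximal XY_CTRL_SURF_COPY_BLT. A 128M chunk would
need 2048 blocks, hence two commands, and calc_ctrl_surf_instr_size()
would return 2 * XY_CTRL_SURF_INSTR_SIZE + 2 * MI_FLUSH_DW_SIZE.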
>  static int emit_copy(struct i915_request *rq,
> -		     u32 dst_offset, u32 src_offset, int size)
> +		     bool dst_is_lmem, u32 dst_offset,
> +		     bool src_is_lmem, u32 src_offset,
> +		     int size)
>  {
>  	const int ver = GRAPHICS_VER(rq->engine->i915);
>  	u32 instance = rq->engine->instance;
>  	u32 *cs;
>
> -	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
> +	cs = intel_ring_begin(rq, round_up(ver >= 8 ? 10 : 6, 2));
>  	if (IS_ERR(cs))
>  		return PTR_ERR(cs);
>
> @@ -503,6 +672,8 @@ intel_context_migrate_copy(struct intel_context *ce,
>  			   struct scatterlist *dst,
>  			   enum i915_cache_level dst_cache_level,
>  			   bool dst_is_lmem,
> +			   struct scatterlist *css_blk,
> +			   enum i915_cache_level css_cache_level,
>  			   struct i915_request **out)
>  {
>  	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
> @@ -576,7 +747,31 @@ intel_context_migrate_copy(struct intel_context *ce,
>  		if (err)
>  			goto out_rq;
>
> -		err = emit_copy(rq, dst_offset, src_offset, len);
> +		err = emit_copy(rq, dst_is_lmem, dst_offset, src_is_lmem,
> +				src_offset, len);
> +
> +		if (HAS_FLAT_CCS(ce->engine->i915) && css_blk) {
> +			struct sgt_dma it_css_smem = sg_sgt(css_blk);
> +			enum i915_cache_level lmem_cache_level;
> +			struct sgt_dma *it_lmem;
> +			bool lmem2smem;
> +
> +			if (dst_is_lmem) {
> +				it_lmem = &it_dst;
> +				lmem_cache_level = dst_cache_level;
> +				lmem2smem = false;
> +			} else {
> +				it_lmem = &it_src;
> +				lmem_cache_level = src_cache_level;
> +				lmem2smem = true;
> +			}
> +
> +			err = emit_ccs(rq, it_lmem, lmem_cache_level,
> +				       &it_css_smem, css_cache_level,
> +				       lmem2smem, len);
> +			if (err)
> +				goto out_rq;
> +		}
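If I'm reading the flow right, a caller that has actually evicted the
CCS contents to a separate smem bo would end up with something like
this (a hypothetical sketch on my side, mirroring the selftest call
sites further down; ccs_obj is a made-up object name):

	err = intel_context_migrate_copy(ce, deps,
					 src->mm.pages->sgl, src->cache_level,
					 i915_gem_object_is_lmem(src),
					 dst->mm.pages->sgl, dst->cache_level,
					 i915_gem_object_is_lmem(dst),
					 ccs_obj->mm.pages->sgl,
					 I915_CACHE_NONE, &rq);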
>
>  	/* Arbitration is re-enabled between requests. */
>  out_rq:
> @@ -641,84 +836,6 @@ intel_context_migrate_copy(struct intel_context *ce,
>   * 4Kb tiles i.e Tile4 layout.
>   */
>
> -static inline u32 *i915_flush_dw(u32 *cmd, u64 dst, u32 flags)
> -{
> -	/* Mask the 3 LSB to use the PPGTT address space */
> -	*cmd++ = MI_FLUSH_DW | flags;
> -	*cmd++ = lower_32_bits(dst);
> -	*cmd++ = upper_32_bits(dst);
> -
> -	return cmd;
> -}
> -
> -static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size)
> -{
> -	u32 num_cmds, num_blks, total_size;
> -
> -	if (!GET_CCS_SIZE(i915, size))
> -		return 0;
> -
> -	/*
> -	 * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte
> -	 * blocks. one XY_CTRL_SURF_COPY_BLT command can
> -	 * trnasfer upto 1024 blocks.
> -	 */
> -	num_blks = (GET_CCS_SIZE(i915, size) +
> -		    (NUM_CCS_BYTES_PER_BLOCK - 1)) >> 8;
> -	num_cmds = (num_blks + (NUM_CCS_BLKS_PER_XFER - 1)) >> 10;
> -	total_size = (XY_CTRL_SURF_INSTR_SIZE) * num_cmds;
> -
> -	/*
> -	 * We need to add a flush before and after
> -	 * XY_CTRL_SURF_COPY_BLT
> -	 */
> -	total_size += 2 * MI_FLUSH_DW_SIZE;
> -	return total_size;
> -}
> -
> -static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr,
> -				     u8 src_mem_access, u8 dst_mem_access,
> -				     int src_mocs, int dst_mocs,
> -				     u16 num_ccs_blocks)
> -{
> -	int i = num_ccs_blocks;
> -
> -	/*
> -	 * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
> -	 * data in and out of the CCS region.
> -	 *
> -	 * We can copy at most 1024 blocks of 256 bytes using one
> -	 * XY_CTRL_SURF_COPY_BLT instruction.
> -	 *
> -	 * In case we need to copy more than 1024 blocks, we need to add
> -	 * another instruction to the same batch buffer.
> -	 *
> -	 * 1024 blocks of 256 bytes of CCS represent a total 256KB of CCS.
> -	 *
> -	 * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
> -	 */
> -	do {
> -		/*
> -		 * We use logical AND with 1023 since the size field
> -		 * takes values which is in the range of 0 - 1023
> -		 */
> -		*cmd++ = ((XY_CTRL_SURF_COPY_BLT) |
> -			  (src_mem_access << SRC_ACCESS_TYPE_SHIFT) |
> -			  (dst_mem_access << DST_ACCESS_TYPE_SHIFT) |
> -			  (((i - 1) & 1023) << CCS_SIZE_SHIFT));
> -		*cmd++ = lower_32_bits(src_addr);
> -		*cmd++ = ((upper_32_bits(src_addr) & 0xFFFF) |
> -			  (src_mocs << XY_CTRL_SURF_MOCS_SHIFT));
> -		*cmd++ = lower_32_bits(dst_addr);
> -		*cmd++ = ((upper_32_bits(dst_addr) & 0xFFFF) |
> -			  (dst_mocs << XY_CTRL_SURF_MOCS_SHIFT));
> -		src_addr += SZ_64M;
> -		dst_addr += SZ_64M;
> -		i -= NUM_CCS_BLKS_PER_XFER;
> -	} while (i > 0);
> -
> -	return cmd;
> -}
>
>  static int emit_clear(struct i915_request *rq,
>  		      u64 offset,
> @@ -740,7 +857,7 @@ static int emit_clear(struct i915_request *rq,
>  			calc_ctrl_surf_instr_size(i915, size)
>  			: 0;
>
> -	cs = intel_ring_begin(rq, ver >= 8 ? 8 + ccs_ring_size : 6);
> +	cs = intel_ring_begin(rq, round_up(ver >= 8 ? 8 + ccs_ring_size : 6, 2));
>  	if (IS_ERR(cs))
>  		return PTR_ERR(cs);
>
> @@ -764,8 +881,7 @@ static int emit_clear(struct i915_request *rq,
>  	}
>
>  	if (is_lmem && HAS_FLAT_CCS(i915) && !value) {
> -		num_ccs_blks = (GET_CCS_SIZE(i915, size) +
> -				NUM_CCS_BYTES_PER_BLOCK - 1) >> 8;
> +		num_ccs_blks = GET_CCS_SIZE(i915, size);
>  		/*
>  		 * Flat CCS surface can only be accessed via
>  		 * XY_CTRL_SURF_COPY_BLT CMD and using indirect
> @@ -784,6 +900,8 @@ static int emit_clear(struct i915_request *rq,
>  					      1, 1,
>  					      num_ccs_blks);
>  		cs = i915_flush_dw(cs, offset, MI_FLUSH_LLC | MI_FLUSH_CCS);
> +		if (ccs_ring_size & 1)
> +			*cs++ = MI_NOOP;
>  	}
>  	intel_ring_advance(rq, cs);
>  	return 0;
> @@ -898,7 +1016,7 @@ int intel_migrate_copy(struct intel_migrate *m,
>  	err = intel_context_migrate_copy(ce, deps,
>  					 src, src_cache_level, src_is_lmem,
>  					 dst, dst_cache_level, dst_is_lmem,
> -					 out);
> +					 NULL, I915_CACHE_NONE, out);
>
>  	intel_context_unpin(ce);
> out:
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.h b/drivers/gpu/drm/i915/gt/intel_migrate.h
> index ccc677ec4aa3..dce63a0dba33 100644
> --- a/drivers/gpu/drm/i915/gt/intel_migrate.h
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.h
> @@ -41,6 +41,8 @@ int intel_context_migrate_copy(struct intel_context *ce,
>  			       struct scatterlist *dst,
>  			       enum i915_cache_level dst_cache_level,
>  			       bool dst_is_lmem,
> +			       struct scatterlist *cssblk,
> +			       enum i915_cache_level css_cache_level,
>  			       struct i915_request **out);
>
>  int
> diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> index fa4293d2944f..2a2fa6186e31 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> @@ -231,7 +231,7 @@ static int __global_copy(struct intel_migrate *migrate,
>  					    i915_gem_object_is_lmem(src),
>  					    dst->mm.pages->sgl, dst->cache_level,
>  					    i915_gem_object_is_lmem(dst),
> -					    out);
> +					    NULL, I915_CACHE_NONE, out);
>  }
>
>  static int
> @@ -582,6 +582,7 @@ static int __perf_copy_blt(struct intel_context *ce,
>  					 src_is_lmem,
>  					 dst, dst_cache_level,
>  					 dst_is_lmem,
> +					 NULL, I915_CACHE_NONE,
>  					 &rq);
>  	if (rq) {
>  		if (i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT) < 0)

-- 
Jani Nikula, Intel Open Source Graphics Center