On 2022-03-21 at 16:09:08 +0530, Hellstrom, Thomas wrote:
> On Sun, 2022-03-20 at 02:12 +0530, Ramalingam C wrote:
> > While clearing the Flat-CCS capable lmem object, we need to clear
> > the CCS meta data corresponding to the memory.
> >
> > As part of live_migrate_clear add check for the ccs meta data clear
> > for the Flat-CCS capable lmem object.
> >
> > Signed-off-by: Ramalingam C <ramalingam.c@xxxxxxxxx>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_migrate.c    |  32 +++
> >  drivers/gpu/drm/i915/gt/selftest_migrate.c | 274 ++++++++++++++++++---
> >  2 files changed, 278 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index c1db8daf994a..bbfea570c239 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -572,6 +572,38 @@ static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr,
> >  	return cmd;
> >  }
> >
> > +static int emit_copy_ccs(struct i915_request *rq,
> > +			 u32 dst_offset, u8 dst_access,
> > +			 u32 src_offset, u8 src_access, int size)
> > +{
> > +	struct drm_i915_private *i915 = rq->engine->i915;
> > +	int mocs = rq->engine->gt->mocs.uc_index << 1;
> > +	u32 num_ccs_blks, ccs_ring_size;
> > +	u32 *cs;
> > +
> > +	ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
> > +	WARN_ON(!ccs_ring_size);
> > +
> > +	cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
> > +	if (IS_ERR(cs))
> > +		return PTR_ERR(cs);
> > +
> > +	num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
> > +				    NUM_CCS_BYTES_PER_BLOCK);
> > +
> > +	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
> > +	cs = _i915_ctrl_surf_copy_blt(cs, src_offset, dst_offset,
> > +				      src_access, dst_access,
> > +				      mocs, mocs, num_ccs_blks);
> > +	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
> > +	if (ccs_ring_size & 1)
> > +		*cs++ = MI_NOOP;
> > +
> > +	intel_ring_advance(rq, cs);
> > +
> > +	return 0;
> > +}
>
> This would be an unused function if selftests are not configured,
> right?

No, Thomas. This is reused between the selftest and the eviction flow;
in the next version I am reusing it for evict_clear too.

> >
> > +
> >  static int emit_copy(struct i915_request *rq,
> >  		     u32 dst_offset, u32 src_offset, int size)
> >  {
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > index b5da8b8cd039..e32cc994f4a2 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > @@ -132,6 +132,126 @@ static int copy(struct intel_migrate *migrate,
> >  	return err;
> >  }
> >
> > +static int intel_context_copy_ccs(struct intel_context *ce,
> > +				  const struct i915_deps *deps,
> > +				  struct scatterlist *sg,
> > +				  enum i915_cache_level cache_level,
> > +				  bool write_to_ccs,
> > +				  struct i915_request **out)
> > +{
> > +	u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
> > +	u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
> > +	struct sgt_dma it = sg_sgt(sg);
> > +	struct i915_request *rq;
> > +	u32 offset;
> > +	int err;
> > +
> > +	GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
> > +	*out = NULL;
> > +
> > +	GEM_BUG_ON(ce->ring->size < SZ_64K);
> > +
> > +	offset = 0;
> > +	if (HAS_64K_PAGES(ce->engine->i915))
> > +		offset = CHUNK_SZ;
> > +	offset += (u64)rq->engine->instance << 32;
> > +
> > +	do {
> > +		int len;
> > +
> > +		rq = i915_request_create(ce);
> > +		if (IS_ERR(rq)) {
> > +			err = PTR_ERR(rq);
> > +			goto out_ce;
> > +		}
> > +
> > +		if (deps) {
> > +			err = i915_request_await_deps(rq, deps);
> > +			if (err)
> > +				goto out_rq;
> > +
> > +			if (rq->engine->emit_init_breadcrumb) {
> > +				err = rq->engine->emit_init_breadcrumb(rq);
> > +				if (err)
> > +					goto out_rq;
> > +			}
> > +
> > +			deps = NULL;
> > +		}
> > +
> > +		/* The PTE updates + clear must not be interrupted. */
> > +		err = emit_no_arbitration(rq);
> > +		if (err)
> > +			goto out_rq;
> > +
> > +		len = emit_pte(rq, &it, cache_level, true, offset, CHUNK_SZ);
> > +		if (len <= 0) {
> > +			err = len;
> > +			goto out_rq;
> > +		}
> > +
> > +		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> > +		if (err)
> > +			goto out_rq;
> > +
> > +		err = emit_copy_ccs(rq, offset, dst_access,
> > +				    offset, src_access, len);
> > +		if (err)
> > +			goto out_rq;
> > +
> > +		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE |
> > +					     MI_FLUSH_DW_CCS);
> > +
> > +		/* Arbitration is re-enabled between requests. */
> > +out_rq:
> > +		if (*out)
> > +			i915_request_put(*out);
> > +		*out = i915_request_get(rq);
> > +		i915_request_add(rq);
> > +		if (err || !it.sg || !sg_dma_len(it.sg))
> > +			break;
> > +
> > +		cond_resched();
> > +	} while (1);
> > +
> > +out_ce:
> > +	return err;
> > +}
> > +
> > +static int
> > +intel_migrate_ccs_copy(struct intel_migrate *m,
> > +		       struct i915_gem_ww_ctx *ww,
> > +		       const struct i915_deps *deps,
> > +		       struct scatterlist *sg,
> > +		       enum i915_cache_level cache_level,
> > +		       bool write_to_ccs,
> > +		       struct i915_request **out)
> > +{
> > +	struct intel_context *ce;
> > +	int err;
> > +
> > +	*out = NULL;
> > +	if (!m->context)
> > +		return -ENODEV;
> > +
> > +	ce = intel_migrate_create_context(m);
> > +	if (IS_ERR(ce))
> > +		ce = intel_context_get(m->context);
> > +	GEM_BUG_ON(IS_ERR(ce));
> > +
> > +	err = intel_context_pin_ww(ce, ww);
> > +	if (err)
> > +		goto out;
> > +
> > +	err = intel_context_copy_ccs(ce, deps, sg, cache_level,
> > +				     write_to_ccs, out);
> > +
> > +	intel_context_unpin(ce);
> > +out:
> > +	intel_context_put(ce);
> > +	return err;
> > +}
> > +
> >  static int clear(struct intel_migrate *migrate,
> >  		 int (*fn)(struct intel_migrate *migrate,
> >  			   struct i915_gem_ww_ctx *ww,
> > @@ -144,7 +264,8 @@ static int clear(struct intel_migrate *migrate,
> >  	struct drm_i915_gem_object *obj;
> >  	struct i915_request *rq;
> >  	struct i915_gem_ww_ctx ww;
> > -	u32 *vaddr;
> > +	u32 *vaddr, val = 0;
> > +	bool ccs_cap = false;
> >  	int err = 0;
> >  	int i;
> >
> > @@ -155,7 +276,12 @@ static int clear(struct intel_migrate *migrate,
> >  	/* Consider the rounded up memory too */
> >  	sz = obj->base.size;
> >
> > +	if (HAS_FLAT_CCS(i915) && i915_gem_object_is_lmem(obj))
> > +		ccs_cap = true;
> > +
> >  	for_i915_gem_ww(&ww, err, true) {
> > +		int ccs_bytes;
> > +
> >  		err = i915_gem_object_lock(obj, &ww);
> >  		if (err)
> >  			continue;
> >
> > @@ -170,44 +296,136 @@ static int clear(struct intel_migrate *migrate,
> >  			vaddr[i] = ~i;
> >  		i915_gem_object_flush_map(obj);
> > -		err = fn(migrate, &ww, obj, sz, &rq);
> > -		if (!err)
> > -			continue;
> > +		if (ccs_cap && !val) {
> > +			/* Write the obj data into ccs surface */
> > +			err = intel_migrate_ccs_copy(migrate, &ww, NULL,
> > +						     obj->mm.pages->sgl,
> > +						     obj->cache_level,
> > +						     true, &rq);
> > +			if (rq && !err) {
> > +				if (i915_request_wait(rq, 0, HZ) < 0) {
> > +					pr_err("%ps timed out, size: %u\n",
> > +					       fn, sz);
> > +					err = -ETIME;
> > +				}
> > +				i915_request_put(rq);
> > +				rq = NULL;
> > +			}
> > +			if (err)
> > +				continue;
> > +
> > +			for (i = 0; i < sz / sizeof(u32); i++)
> > +				vaddr[i] = 0x5a5a5a5a;
> > +			i915_gem_object_flush_map(obj);
> > +
> > +			err = intel_migrate_ccs_copy(migrate, &ww, NULL, obj->mm.pages->sgl,
> > +						     obj->cache_level, false, &rq);
>
> Why do we read back CCS content here?

I was rechecking the ccs copy, but that is not needed for the real
intention of the test. Removing it in the next version.

> > +			if (rq && !err) {
> > +				if (i915_request_wait(rq, 0, HZ) < 0) {
> > +					pr_err("%ps timed out, size: %u\n",
> > +					       fn, sz);
> > +					err = -ETIME;
> > +				}
> > +				i915_request_put(rq);
> > +				rq = NULL;
> > +			}
> > +			if (err)
> > +				continue;
> > +
> > +			i915_gem_object_flush_map(obj);
> > +			for (i = 0; !err && i < ccs_bytes; i += 4) {
> > +				if (vaddr[i] != ~i) {
> > +					pr_err("%ps ccs write and read failed, offset: %d\n",
> > +					       fn, i);
> > +					err = -EINVAL;
> > +				}
> > +			}
> > +			if (err)
> > +				continue;
> > +
> > +			i915_gem_object_flush_map(obj);
> > +		}
> >
> > -		if (err != -EDEADLK && err != -EINTR && err != -ERESTARTSYS)
> > -			pr_err("%ps failed, size: %u\n", fn, sz);
> > -		if (rq) {
> > -			i915_request_wait(rq, 0, HZ);
> > +		err = fn(migrate, &ww, obj, val, &rq);
> > +		if (rq && !err) {
> > +			if (i915_request_wait(rq, 0, HZ) < 0) {
> > +				pr_err("%ps timed out, size: %u\n", fn, sz);
> > +				err = -ETIME;
> > +			}
> >  			i915_request_put(rq);
> > +			rq = NULL;
> >  		}
> > -		i915_gem_object_unpin_map(obj);
> > -	}
> > -	if (err)
> > -		goto err_out;
> > +		if (err)
> > +			continue;
> >
> > -	if (rq) {
> > -		if (i915_request_wait(rq, 0, HZ) < 0) {
> > -			pr_err("%ps timed out, size: %u\n", fn, sz);
> > -			err = -ETIME;
> > +		i915_gem_object_flush_map(obj);
> > +
> > +		/* Verify the set/clear of the obj mem */
> > +		for (i = 0; !err && i < sz / PAGE_SIZE; i++) {
> > +			int x = i * 1024 +
> > +				i915_prandom_u32_max_state(1024, prng);
> > +
> > +			if (vaddr[x] != val) {
> > +				pr_err("%ps failed, (%u != %u), offset: %zu\n",
> > +				       fn, vaddr[x], val, x * sizeof(u32));
> > +				igt_hexdump(vaddr + i * 1024, 4096);
> > +				err = -EINVAL;
> > +			}
> >  		}
> > -		i915_request_put(rq);
> > -	}
> > +		if (err)
> > +			continue;
> >
> > -	for (i = 0; !err && i < sz / PAGE_SIZE; i++) {
> > -		int x = i * 1024 + i915_prandom_u32_max_state(1024, prng);
> > +		if (ccs_cap && !val) {
> > +			for (i = 0; i < sz / sizeof(u32); i++)
> > +				vaddr[i] = ~i;
> > +			i915_gem_object_flush_map(obj);
> > +
> > +			err = intel_migrate_ccs_copy(migrate, &ww, NULL,
> > +						     obj->mm.pages->sgl,
> > +						     obj->cache_level,
> > +						     false, &rq);
> > +			if (rq && !err) {
> > +				if (i915_request_wait(rq, 0, HZ) < 0) {
> > +					pr_err("%ps timed out, size: %u\n",
> > +					       fn, sz);
> > +					err = -ETIME;
> > +				}
> > +				i915_request_put(rq);
> > +				rq = NULL;
> > +			}
> > +			if (err)
> > +				continue;
> > +
> > +			ccs_bytes = GET_CCS_BYTES(i915, sz);
> > +			i915_gem_object_flush_map(obj);
> > +			for (i = 0; !err && i < ccs_bytes / sizeof(u32); i++) {
> > +				if (vaddr[i]) {
>
> I think this is incorrect.
> This assumes that CCS data is read back
> contiguous for the whole buffer, but instead CCS data is read back
> per 8MiB chunk and placed at the beginning of each chunk?

Yes. This is the source of the problem I was discussing with you. I
have fixed it in the next version; please share your feedback. I could
have used a dedicated obj for the ccs, but instead just calculated the
offset of the ccs bytes within each chunk.

Ram

>
> /Thomas
>
>
> > +					pr_err("%ps ccs clearing failed, offset: %d/%lu\n",
> > +					       fn, i, (ccs_bytes / sizeof(u32)) - 1);
> > +					igt_hexdump(vaddr + i, ccs_bytes - i * sizeof(u32));
> > +					err = -EINVAL;
> > +				}
> > +			}
> > +			if (err)
> > +				continue;
> > +		}
> > +		i915_gem_object_unpin_map(obj);
> > +	}
> >
> > -		if (vaddr[x] != sz) {
> > -			pr_err("%ps failed, size: %u, offset: %zu\n",
> > -			       fn, sz, x * sizeof(u32));
> > -			igt_hexdump(vaddr + i * 1024, 4096);
> > -			err = -EINVAL;
> > +	if (err) {
> > +		if (err != -EDEADLK && err != -EINTR && err != -ERESTARTSYS)
> > +			pr_err("%ps failed, size: %u\n", fn, sz);
> > +		if (rq && err != -EINVAL) {
> > +			i915_request_wait(rq, 0, HZ);
> > +			i915_request_put(rq);
> >  		}
> > +
> > +		i915_gem_object_unpin_map(obj);
> > +	} else {
> > +		pr_debug("%ps Passed. size: %u\n", fn, sz);
> >  	}
> >
> > -	i915_gem_object_unpin_map(obj);
> > -err_out:
> >  	i915_gem_object_put(obj);
> > -
> >  	return err;
> >  }
> >
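For readers following the chunked-readback issue above, here is a
minimal sketch of the per-chunk verification Ram describes, assuming
(as Thomas notes) that the blitter deposits each chunk's CCS readout at
the start of that CHUNK_SZ chunk. GET_CCS_BYTES() and CHUNK_SZ are
taken from the patch context; the helper name and its exact placement
are illustrative, not the actual next version:

/*
 * Sketch only: walk the mapping chunk by chunk and inspect just the
 * CCS-sized prefix of each chunk, since the CCS readback is not
 * contiguous for the whole buffer. Assumes GET_CCS_BYTES() and
 * CHUNK_SZ as used in the patch above; the helper is hypothetical.
 */
static int check_ccs_clear_per_chunk(struct drm_i915_private *i915,
				     u32 *vaddr, u32 sz)
{
	u32 offset, i;

	for (offset = 0; offset < sz; offset += CHUNK_SZ) {
		/* CCS bytes covering this chunk (or the object's tail) */
		u32 bytes = GET_CCS_BYTES(i915, min_t(u32, CHUNK_SZ,
						      sz - offset));
		u32 *chunk = vaddr + offset / sizeof(u32);

		for (i = 0; i < bytes / sizeof(u32); i++)
			if (chunk[i])
				return -EINVAL; /* CCS not cleared */
	}

	return 0;
}

This keeps everything in the same mapping, so no dedicated CCS object
is needed; only the offset arithmetic changes from chunk to chunk.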