Thanks,
Oak

> -----Original Message-----
> From: Das, Nirmoy <nirmoy.das@xxxxxxxxx>
> Sent: Monday, September 18, 2023 6:26 AM
> To: Zeng, Oak <oak.zeng@xxxxxxxxx>; intel-gfx@xxxxxxxxxxxxxxxxxxxxx;
> Piorkowski, Piotr <piotr.piorkowski@xxxxxxxxx>
> Cc: chris.p.wilson@xxxxxxxxxxxxxxx; Shyti, Andi <andi.shyti@xxxxxxxxx>; Mun,
> Gwan-gyeong <gwan-gyeong.mun@xxxxxxxxx>; Roper, Matthew D
> <matthew.d.roper@xxxxxxxxx>
> Subject: Re: [PATCH 5/7] drm/i915: Implement GGTT update method with
> MI_UPDATE_GTT
>
> On 9/15/2023 5:56 PM, Zeng, Oak wrote:
> >
> > Thanks,
> > Oak
> >
> >> -----Original Message-----
> >> From: Das, Nirmoy <nirmoy.das@xxxxxxxxx>
> >> Sent: Friday, September 15, 2023 4:34 AM
> >> To: intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> >> Cc: Zeng, Oak <oak.zeng@xxxxxxxxx>; chris.p.wilson@xxxxxxxxxxxxxxx;
> >> Piorkowski, Piotr <piotr.piorkowski@xxxxxxxxx>; Shyti, Andi
> >> <andi.shyti@xxxxxxxxx>; Mun, Gwan-gyeong <gwan-gyeong.mun@xxxxxxxxx>;
> >> Roper, Matthew D <matthew.d.roper@xxxxxxxxx>; Das, Nirmoy
> >> <nirmoy.das@xxxxxxxxx>
> >> Subject: [PATCH 5/7] drm/i915: Implement GGTT update method with
> >> MI_UPDATE_GTT
> >>
> >> Implement a GGTT update method using the blitter command
> >> MI_UPDATE_GTT, and install those handlers if a platform requires it.
> >>
> >> v2: Make sure we hold the GT wakeref and the blitter engine wakeref
> >> before we call mutex_lock/intel_context_enter below. When the GT/engine
> >> are not awake, intel_context_enter calls into a runtime-pm function
> >> which can end up in kmalloc/fs_reclaim. But triggering fs_reclaim while
> >> holding a mutex lock is not allowed, because the shrinker can also try
> >> to take the same mutex lock: a circular lock dependency. So take the
> >> GT/blitter engine wakerefs before calling mutex_lock, to break the
> >> cycle.
> >>
> >> Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxxxx>
> >> Signed-off-by: Oak Zeng <oak.zeng@xxxxxxxxx>
> >> ---
> >>  drivers/gpu/drm/i915/gt/intel_ggtt.c | 235 +++++++++++++++++++++++++++
> >>  1 file changed, 235 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c
> >> b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> >> index dd0ed941441a..b94de7cebfce 100644
> >> --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
> >> +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
> >> @@ -15,18 +15,23 @@
> >>  #include "display/intel_display.h"
> >>  #include "gem/i915_gem_lmem.h"
> >>
> >> +#include "intel_context.h"
> >>  #include "intel_ggtt_gmch.h"
> >> +#include "intel_gpu_commands.h"
> >>  #include "intel_gt.h"
> >>  #include "intel_gt_regs.h"
> >>  #include "intel_pci_config.h"
> >> +#include "intel_ring.h"
> >>  #include "i915_drv.h"
> >>  #include "i915_pci.h"
> >> +#include "i915_request.h"
> >>  #include "i915_scatterlist.h"
> >>  #include "i915_utils.h"
> >>  #include "i915_vgpu.h"
> >>
> >>  #include "intel_gtt.h"
> >>  #include "gen8_ppgtt.h"
> >> +#include "intel_engine_pm.h"
> >>
> >>  static void i915_ggtt_color_adjust(const struct drm_mm_node *node,
> >>  				   unsigned long color,
> >> @@ -252,6 +257,145 @@ u64 gen8_ggtt_pte_encode(dma_addr_t addr,
> >>  	return pte;
> >>  }
> >>
> >> +static bool should_update_ggtt_with_bind(struct i915_ggtt *ggtt)
> >> +{
> >> +	struct intel_gt *gt = ggtt->vm.gt;
> >> +
> >> +	return intel_gt_is_bind_context_ready(gt);
> >> +}
> >> +
> >> +static struct intel_context *gen8_ggtt_bind_get_ce(struct i915_ggtt *ggtt)
> >> +{
> >> +	struct intel_context *ce;
> >> +	struct intel_gt *gt = ggtt->vm.gt;
> >> +
> >> +	if (intel_gt_is_wedged(gt))
> >> +		return NULL;
> >> +
> >> +	ce = gt->engine[BCS0]->bind_context;
> >> +	GEM_BUG_ON(!ce);
> >> +
> >> +	/*
> >> +	 * If the GT is not awake already at this stage then fall back
> >> +	 * to the PCI-based GGTT update, otherwise __intel_wakeref_get_first()
> >> +	 * would conflict with fs_reclaim trying to allocate memory while
> >> +	 * doing rpm_resume().
> >> +	 */
> >> +	if (!intel_gt_pm_get_if_awake(gt))
> >> +		return NULL;
> >> +
> >> +	intel_engine_pm_get(ce->engine);
> >> +
> >> +	return ce;
> >> +}
> >> +
> >> +static void gen8_ggtt_bind_put_ce(struct intel_context *ce)
> >> +{
> >> +	intel_engine_pm_put(ce->engine);
> >> +	intel_gt_pm_put(ce->engine->gt);
> >> +}
> >> +
> >> +static bool gen8_ggtt_bind_ptes(struct i915_ggtt *ggtt, u32 offset,
> >> +				struct sg_table *pages, u32 num_entries,
> >> +				const gen8_pte_t pte)
> >> +{
> >> +	struct i915_sched_attr attr = {};
> >> +	struct intel_gt *gt = ggtt->vm.gt;
> >> +	const gen8_pte_t scratch_pte = ggtt->vm.scratch[0]->encode;
> >> +	struct sgt_iter iter;
> >> +	struct i915_request *rq;
> >> +	struct intel_context *ce;
> >> +	u32 *cs;
> >> +
> >> +	if (!num_entries)
> >> +		return true;
> >> +
> >> +	ce = gen8_ggtt_bind_get_ce(ggtt);
> >> +	if (!ce)
> >> +		return false;
> >> +
> >> +	if (pages)
> >> +		iter = __sgt_iter(pages->sgl, true);
> >> +
> >> +	while (num_entries) {
> >> +		int count = 0;
> >> +		dma_addr_t addr;
> >> +		/*
> >> +		 * MI_UPDATE_GTT can update 512 entries in a single command,
> >> +		 * but that ends up with an engine reset; 511 works.
> >> +		 */
> >> +		u32 n_ptes = min_t(u32, 511, num_entries);
> >> +
> >> +		if (mutex_lock_interruptible(&ce->timeline->mutex))
> >> +			goto put_ce;
> >> +
> >> +		intel_context_enter(ce);
> >> +		rq = __i915_request_create(ce, GFP_NOWAIT | GFP_ATOMIC);
> >> +		intel_context_exit(ce);
> >> +		if (IS_ERR(rq)) {
> >> +			GT_TRACE(gt, "Failed to get bind request\n");
> >> +			mutex_unlock(&ce->timeline->mutex);
> >> +			goto put_ce;
> >> +		}
> >> +
> >> +		cs = intel_ring_begin(rq, 2 * n_ptes + 2);
> >> +		if (IS_ERR(cs)) {
> >> +			GT_TRACE(gt, "Failed to get ring space for GGTT bind\n");
> >> +			i915_request_set_error_once(rq, PTR_ERR(cs));
> >> +			/* once a request is created, it must be queued */
> >> +			goto queue_err_rq;
> >> +		}
> >> +
> >> +		*cs++ = MI_UPDATE_GTT | (2 * n_ptes);
> >> +		*cs++ = offset << 12;
> >> +
> >> +		if (pages) {
> >> +			for_each_sgt_daddr_next(addr, iter) {
> >> +				if (count == n_ptes)
> >> +					break;
> >> +				*cs++ = lower_32_bits(pte | addr);
> >> +				*cs++ = upper_32_bits(pte | addr);
> >> +				count++;
> >> +			}
> >> +			/* fill remaining with scratch pte, if any */
> >> +			if (count < n_ptes) {
> >> +				memset64((u64 *)cs, scratch_pte,
> >> +					 n_ptes - count);
> >> +				cs += (n_ptes - count) * 2;
> >> +			}
> >
> > Should we return an error instead of silently filling the PTEs with
> > scratch? Or maybe even GEM_BUG_ON() in this case? If the caller didn't
> > allocate enough pages, something has gone wrong...
>
> We do the same already in gen8_ggtt_insert_entries(), so this isn't adding
> anything new here; the comment just makes it obvious :) I don't know the
> exact use case where this happens, but I saw tons of pipe errors without
> this.

Okay, good to know. I don't see an obvious issue with this patch, but it
would be better if someone else also reviews it.

Acked-by: Oak Zeng <oak.zeng@xxxxxxxxx>

Oak

> Regards,
> Nirmoy
>
> > Oak
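To make the behaviour being discussed concrete: each MI_UPDATE_GTT batch
covers n_ptes slots, and when the sg iterator runs out of addresses before
the batch is full, the patch pads the remainder with the scratch PTE rather
than failing. Below is a minimal standalone sketch of that padding logic;
every name and value in it is an illustrative stand-in, not the driver code,
and the review's alternative is noted inline.

/*
 * Standalone illustration only, NOT the driver code. Types, values and
 * the batch size are mocked up so the logic compiles and runs in userspace.
 */
#include <stdint.h>
#include <stdio.h>

#define N_PTES 511 /* max PTEs per MI_UPDATE_GTT batch in the patch */

int main(void)
{
	uint64_t batch[N_PTES];
	const uint64_t scratch_pte = 0xdeadf00dull; /* stand-in scratch encoding */
	uint32_t count, i;

	/* Suppose the sg iterator yielded only 100 addresses for this batch. */
	for (count = 0; count < 100; count++)
		batch[count] = (uint64_t)count << 12; /* stand-in for "pte | addr" */

	/*
	 * What the patch does: quietly pad the tail with the scratch PTE
	 * (the driver uses memset64() for this). The review's alternative
	 * would be to treat a short page list as a caller bug instead,
	 * e.g. assert(count == N_PTES).
	 */
	for (i = count; i < N_PTES; i++)
		batch[i] = scratch_pte;

	printf("%u real PTEs, %u scratch PTEs\n", count, N_PTES - count);
	return 0;
}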
> >> +		} else {
> >> +			memset64((u64 *)cs, pte, n_ptes);
> >> +			cs += n_ptes * 2;
> >> +		}
> >> +
> >> +		intel_ring_advance(rq, cs);
> >> +queue_err_rq:
> >> +		i915_request_get(rq);
> >> +		__i915_request_commit(rq);
> >> +		__i915_request_queue(rq, &attr);
> >> +
> >> +		mutex_unlock(&ce->timeline->mutex);
> >> +		/* This will break if the request is complete or after engine reset */
> >> +		i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
> >> +		if (rq->fence.error)
> >> +			goto err_rq;
> >> +
> >> +		i915_request_put(rq);
> >> +
> >> +		num_entries -= n_ptes;
> >> +		offset += n_ptes;
> >> +	}
> >> +
> >> +	gen8_ggtt_bind_put_ce(ce);
> >> +	return true;
> >> +
> >> +err_rq:
> >> +	i915_request_put(rq);
> >> +put_ce:
> >> +	gen8_ggtt_bind_put_ce(ce);
> >> +	return false;
> >> +}
> >> +
> >>  static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
> >>  {
> >>  	writeq(pte, addr);
> >> @@ -272,6 +416,21 @@ static void gen8_ggtt_insert_page(struct
> >> i915_address_space *vm,
> >>  	ggtt->invalidate(ggtt);
> >>  }
> >>
> >> +static void gen8_ggtt_insert_page_bind(struct i915_address_space *vm,
> >> +				       dma_addr_t addr, u64 offset,
> >> +				       unsigned int pat_index, u32 flags)
> >> +{
> >> +	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> >> +	gen8_pte_t pte;
> >> +
> >> +	pte = ggtt->vm.pte_encode(addr, pat_index, flags);
> >> +	if (should_update_ggtt_with_bind(i915_vm_to_ggtt(vm)) &&
> >> +	    gen8_ggtt_bind_ptes(ggtt, offset, NULL, 1, pte))
> >> +		return ggtt->invalidate(ggtt);
> >> +
> >> +	gen8_ggtt_insert_page(vm, addr, offset, pat_index, flags);
> >> +}
> >> +
> >>  static void gen8_ggtt_insert_entries(struct i915_address_space *vm,
> >>  				     struct i915_vma_resource *vma_res,
> >>  				     unsigned int pat_index,
> >> @@ -311,6 +470,50 @@ static void gen8_ggtt_insert_entries(struct
> >> i915_address_space *vm,
> >>  	ggtt->invalidate(ggtt);
> >>  }
> >>
> >> +static bool __gen8_ggtt_insert_entries_bind(struct i915_address_space *vm,
> >> +					    struct i915_vma_resource *vma_res,
> >> +					    unsigned int pat_index, u32 flags)
> >> +{
> >> +	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> >> +	gen8_pte_t scratch_pte = vm->scratch[0]->encode;
> >> +	gen8_pte_t pte_encode;
> >> +	u64 start, end;
> >> +
> >> +	pte_encode = ggtt->vm.pte_encode(0, pat_index, flags);
> >> +	start = (vma_res->start - vma_res->guard) / I915_GTT_PAGE_SIZE;
> >> +	end = start + vma_res->guard / I915_GTT_PAGE_SIZE;
> >> +	if (!gen8_ggtt_bind_ptes(ggtt, start, NULL, end - start, scratch_pte))
> >> +		goto err;
> >> +
> >> +	start = end;
> >> +	end += (vma_res->node_size + vma_res->guard) / I915_GTT_PAGE_SIZE;
> >> +	if (!gen8_ggtt_bind_ptes(ggtt, start, vma_res->bi.pages,
> >> +				 vma_res->node_size / I915_GTT_PAGE_SIZE, pte_encode))
> >> +		goto err;
> >> +
> >> +	start += vma_res->node_size / I915_GTT_PAGE_SIZE;
> >> +	if (!gen8_ggtt_bind_ptes(ggtt, start, NULL, end - start, scratch_pte))
> >> +		goto err;
> >> +
> >> +	return true;
> >> +
> >> +err:
> >> +	return false;
> >> +}
> >> +
> >> +static void gen8_ggtt_insert_entries_bind(struct i915_address_space *vm,
> >> +					  struct i915_vma_resource *vma_res,
> >> +					  unsigned int pat_index, u32 flags)
> >> +{
> >> +	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> >> +
> >> +	if (should_update_ggtt_with_bind(i915_vm_to_ggtt(vm)) &&
> >> +	    __gen8_ggtt_insert_entries_bind(vm, vma_res, pat_index, flags))
> >> +		return ggtt->invalidate(ggtt);
> >> +
> >> +	gen8_ggtt_insert_entries(vm, vma_res, pat_index, flags);
> >> +}
> >> +
> >>  static void gen8_ggtt_clear_range(struct i915_address_space *vm,
> >>  				  u64 start, u64 length)
> >>  {
> >> @@ -332,6 +535,27 @@ static void gen8_ggtt_clear_range(struct
> >> i915_address_space *vm,
> >>  		gen8_set_pte(&gtt_base[i], scratch_pte);
> >>  }
> >>
> >> +static void gen8_ggtt_scratch_range_bind(struct i915_address_space *vm,
> >> +					 u64 start, u64 length)
> >> +{
> >> +	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vm);
> >> +	unsigned int first_entry = start / I915_GTT_PAGE_SIZE;
> >> +	unsigned int num_entries = length / I915_GTT_PAGE_SIZE;
> >> +	const gen8_pte_t scratch_pte = vm->scratch[0]->encode;
> >> +	const int max_entries = ggtt_total_entries(ggtt) - first_entry;
> >> +
> >> +	if (WARN(num_entries > max_entries,
> >> +		 "First entry = %d; Num entries = %d (max=%d)\n",
> >> +		 first_entry, num_entries, max_entries))
> >> +		num_entries = max_entries;
> >> +
> >> +	if (should_update_ggtt_with_bind(ggtt) &&
> >> +	    gen8_ggtt_bind_ptes(ggtt, first_entry, NULL, num_entries, scratch_pte))
> >> +		return ggtt->invalidate(ggtt);
> >> +
> >> +	gen8_ggtt_clear_range(vm, start, length);
> >> +}
> >> +
> >>  static void gen6_ggtt_insert_page(struct i915_address_space *vm,
> >>  				  dma_addr_t addr,
> >>  				  u64 offset,
> >> @@ -997,6 +1221,17 @@ static int gen8_gmch_probe(struct i915_ggtt *ggtt)
> >>  			I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND;
> >>  	}
> >>
> >> +	if (i915_ggtt_require_binder(i915)) {
> >> +		ggtt->vm.scratch_range = gen8_ggtt_scratch_range_bind;
> >> +		ggtt->vm.insert_page = gen8_ggtt_insert_page_bind;
> >> +		ggtt->vm.insert_entries = gen8_ggtt_insert_entries_bind;
> >> +		/*
> >> +		 * If the GPU is hung, we might bind VMAs for error capture.
> >> +		 * Fall back to CPU GGTT updates in that case.
> >> +		 */
> >> +		ggtt->vm.raw_insert_page = gen8_ggtt_insert_page;
> >> +	}
> >> +
> >>  	if (intel_uc_wants_guc(&ggtt->vm.gt->uc))
> >>  		ggtt->invalidate = guc_ggtt_invalidate;
> >>  	else
> >> --
> >> 2.41.0
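One more note on the request sizing in gen8_ggtt_bind_ptes(): each
MI_UPDATE_GTT batch needs one command dword and one offset dword, plus two
dwords (lo/hi) per 64-bit PTE, which is where intel_ring_begin(rq, 2 * n_ptes
+ 2) comes from, and batches are capped at 511 entries because 512 was
observed to reset the engine. A minimal userspace sketch of that chunking
arithmetic, with illustrative values only (not driver code):

#include <stdint.h>
#include <stdio.h>

/* 511-PTE cap per batch, taken from the comment in the patch above. */
#define MAX_PTES_PER_BATCH 511u

int main(void)
{
	uint32_t num_entries = 2048; /* example: bind 2048 GGTT pages */
	uint32_t offset = 0;

	while (num_entries) {
		uint32_t n_ptes = num_entries < MAX_PTES_PER_BATCH ?
				  num_entries : MAX_PTES_PER_BATCH;
		/* 1 command dword + 1 offset dword + 2 dwords per PTE,
		 * matching intel_ring_begin(rq, 2 * n_ptes + 2). */
		uint32_t ring_dwords = 2 * n_ptes + 2;

		printf("batch at GGTT page %u: %u PTEs, %u ring dwords\n",
		       offset, n_ptes, ring_dwords);

		num_entries -= n_ptes;
		offset += n_ptes;
	}
	return 0;
}

For 2048 entries this emits four full 511-PTE batches plus a final 4-PTE
batch, mirroring how the patch's while loop walks num_entries and offset.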