6.8-stable review patch. If anyone has any objections, please let me know. ------------------ From: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> commit 3c88b8f471ee9512bc4ef02bebafdc53fb7c5d9e upstream. For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue. v2: - Simplify if-statements around the tlb_flush_seqno. (Matthew Brost) - Add some comments and asserts. Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost <matthew.brost@xxxxxxxxx> Cc: <stable@xxxxxxxxxxxxxxx> # v6.8+ Signed-off-by: Thomas Hellström <thomas.hellstrom@xxxxxxxxxxxxxxx> Reviewed-by: Matthew Brost <matthew.brost@xxxxxxxxx> Link: https://patchwork.freedesktop.org/patch/msgid/20240327091136.3271-2-thomas.hellstrom@xxxxxxxxxxxxxxx (cherry picked from commit 4fc4899e86f7afbd09f4bcb899f0fc57e0296e62) Signed-off-by: Lucas De Marchi <lucas.demarchi@xxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/xe/xe_exec_queue_types.h | 5 +++++ drivers/gpu/drm/xe/xe_pt.c | 6 ++++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 10 ++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 30 insertions(+), 9 deletions(-) --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -159,6 +159,11 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity; + /** + * @tlb_flush_seqno: The seqno of the last rebind tlb flush performed + * Protected by @vm's resv. Unused if @vm == NULL. + */ + u64 tlb_flush_seqno; /** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1255,11 +1255,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, s * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */ - if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) || - (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { + if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); + } else if (rebind && !xe_vm_in_lr_mode(vm)) { + /* We bump also if batch_invalidate_tlb is true */ + vm->tlb_flush_seqno++; } rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -227,10 +227,9 @@ static void __emit_job_gen12_simple(stru { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job); - struct xe_vm *vm = job->q->vm; struct xe_gt *gt = job->q->gt; - if (vm && vm->batch_invalidate_tlb) { + if (job->ring_ops_flush_tlb) { dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); @@ -278,7 +277,6 @@ static void __emit_job_gen12_video(struc struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE; - struct xe_vm *vm = job->q->vm; dw[i++] = preparser_disable(true); @@ -290,13 +288,13 @@ static void __emit_job_gen12_video(struc i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); } - if (vm && vm->batch_invalidate_tlb) + if (job->ring_ops_flush_tlb) i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); dw[i++] = preparser_disable(false); - if (!vm || !vm->batch_invalidate_tlb) + if (!job->ring_ops_flush_tlb) i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i); @@ -325,7 +323,6 @@ static void __emit_job_gen12_render_comp struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); - struct xe_vm *vm = job->q->vm; u32 mask_flags = 0; dw[i++] = preparser_disable(true); @@ -335,7 +332,7 @@ static void __emit_job_gen12_render_comp mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */ - i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i); + i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i); /* hsdes: 1809175790 */ if (has_aux_ccs(xe)) --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,16 @@ bool xe_sched_job_completed(struct xe_sc void xe_sched_job_arm(struct xe_sched_job *job) { + struct xe_exec_queue *q = job->q; + struct xe_vm *vm = q->vm; + + if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) && + (vm->batch_invalidate_tlb || vm->tlb_flush_seqno != q->tlb_flush_seqno)) { + xe_vm_assert_held(vm); + q->tlb_flush_seqno = vm->tlb_flush_seqno; + job->ring_ops_flush_tlb = true; + } + drm_sched_job_arm(&job->drm); } --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags; + /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ + bool ring_ops_flush_tlb; /** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -282,6 +282,11 @@ struct xe_vm { bool capture_once; } error_capture; + /** + * @tlb_flush_seqno: Required TLB flush seqno for the next exec. + * protected by the vm resv. + */ + u64 tlb_flush_seqno; /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client */