The initial kref from dma_fence_init() should match up with whatever
signals the fence; however, here we submit the job to the hw first and
only then grab the extra ref, and even then we touch some fence state
before doing so. This might be too late if the fence is signalled before
we can grab the extra ref. Instead, always grab the refcount early,
before we do the submission part.

Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2811
Signed-off-by: Matthew Auld <matthew.auld@xxxxxxxxx>
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: Matthew Brost <matthew.brost@xxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx> # v6.8+
---
 drivers/gpu/drm/xe/xe_guc_submit.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index fbbe6a487bbb..b33f3d23a068 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -766,12 +766,15 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
 	struct xe_guc *guc = exec_queue_to_guc(q);
 	struct xe_device *xe = guc_to_xe(guc);
 	bool lr = xe_exec_queue_is_lr(q);
+	struct dma_fence *fence;
 
 	xe_assert(xe, !(exec_queue_destroyed(q) || exec_queue_pending_disable(q)) ||
 		  exec_queue_banned(q) || exec_queue_suspended(q));
 
 	trace_xe_sched_job_run(job);
 
+	dma_fence_get(job->fence);
+
 	if (!exec_queue_killed_or_banned_or_wedged(q) && !xe_sched_job_is_error(job)) {
 		if (!exec_queue_registered(q))
 			register_exec_queue(q);
@@ -782,12 +785,16 @@ guc_exec_queue_run_job(struct drm_sched_job *drm_job)
 
 	if (lr) {
 		xe_sched_job_set_error(job, -EOPNOTSUPP);
-		return NULL;
+		fence = NULL;
 	} else if (test_and_set_bit(JOB_FLAG_SUBMIT, &job->fence->flags)) {
-		return job->fence;
+		fence = job->fence;
 	} else {
-		return dma_fence_get(job->fence);
+		fence = dma_fence_get(job->fence);
 	}
+
+	dma_fence_put(job->fence);
+
+	return fence;
 }
 
 static void guc_exec_queue_free_job(struct drm_sched_job *drm_job)
-- 
2.46.0