Reviewed-by: Jacek Lawrynowicz <jacek.lawrynowicz@xxxxxxxxxxxxxxx> On 1/7/2025 6:32 PM, Maciej Falkowski wrote: > From: Karol Wachowski <karol.wachowski@xxxxxxxxx> > > Fix deadlock in job submission and abort handling. > When a thread aborts currently executing jobs due to a fault, > it first locks the global lock protecting submitted_jobs (#1). > > After the last job is destroyed, it proceeds to release the related context > and locks file_priv (#2). Meanwhile, in the job submission thread, > the file_priv lock (#2) is taken first, and then the submitted_jobs > lock (#1) is obtained when a job is added to the submitted jobs list. > > CPU0 CPU1 > ---- ---- > (for example due to a fault) (jobs submissions keep coming) > > lock(&vdev->submitted_jobs_lock) #1 > ivpu_jobs_abort_all() > job_destroy() > lock(&file_priv->lock) #2 > lock(&vdev->submitted_jobs_lock) #1 > file_priv_release() > lock(&vdev->context_list_lock) > lock(&file_priv->lock) #2 > > This order of locking causes a deadlock. To resolve this issue, > change the order of locking in ivpu_job_submit(). > > Signed-off-by: Karol Wachowski <karol.wachowski@xxxxxxxxx> > Signed-off-by: Maciej Falkowski <maciej.falkowski@xxxxxxxxxxxxxxx> > --- > drivers/accel/ivpu/ivpu_job.c | 17 +++++++---------- > 1 file changed, 7 insertions(+), 10 deletions(-) > > diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c > index c694822a14bf..c93ea37062d7 100644 > --- a/drivers/accel/ivpu/ivpu_job.c > +++ b/drivers/accel/ivpu/ivpu_job.c > @@ -597,6 +597,7 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) > if (ret < 0) > return ret; > > + mutex_lock(&vdev->submitted_jobs_lock); > mutex_lock(&file_priv->lock); > > if (cmdq_id == 0) > @@ -606,19 +607,17 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) > if (!cmdq) { > ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d\n", file_priv->ctx.id); > ret = -EINVAL; > - goto err_unlock_file_priv; > + goto err_unlock; > } > > ret = ivpu_cmdq_register(file_priv, cmdq); > if (ret) { > ivpu_err(vdev, "Failed to register command queue: %d\n", ret); > - goto err_unlock_file_priv; > + goto err_unlock; > } > > job->cmdq_id = cmdq->id; > > - mutex_lock(&vdev->submitted_jobs_lock); > - > is_first_job = xa_empty(&vdev->submitted_jobs_xa); > ret = xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, > &file_priv->job_id_next, GFP_KERNEL); > @@ -626,7 +625,7 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) > ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n", > file_priv->ctx.id); > ret = -EBUSY; > - goto err_unlock_submitted_jobs; > + goto err_unlock; > } > > ret = ivpu_cmdq_push_job(cmdq, job); > @@ -649,22 +648,20 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) > job->job_id, file_priv->ctx.id, job->engine_idx, cmdq->priority, > job->cmd_buf_vpu_addr, cmdq->jobq->header.tail); > > - mutex_unlock(&vdev->submitted_jobs_lock); > mutex_unlock(&file_priv->lock); > > if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_HW)) { > - mutex_lock(&vdev->submitted_jobs_lock); > ivpu_job_signal_and_destroy(vdev, job->job_id, VPU_JSM_STATUS_SUCCESS); > - mutex_unlock(&vdev->submitted_jobs_lock); > } > > + mutex_unlock(&vdev->submitted_jobs_lock); > + > return 0; > > err_erase_xa: > xa_erase(&vdev->submitted_jobs_xa, job->job_id); > -err_unlock_submitted_jobs: > +err_unlock: > mutex_unlock(&vdev->submitted_jobs_lock); > -err_unlock_file_priv: > mutex_unlock(&file_priv->lock); > ivpu_rpm_put(vdev); > return ret;