[Why] Upon GPU reset, the kernel cleans up already-submitted jobs via drm_sched_cleanup_jobs(). This schedules IBs via drm_sched_main()->run_job, leading to a race on whether the rings are ready, since rings may be suspended during reset. [How] Make the GPU reset's amdgpu_device_ip_resume_phase2() and amdgpu_ib_schedule() in amdgpu_job_run() mutually exclusive. Signed-off-by: Shirish S <shirish.s@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index f4d9041..7b07a47b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -973,6 +973,7 @@ struct amdgpu_device { bool in_gpu_reset; enum pp_mp1_state mp1_state; struct mutex lock_reset; + struct mutex lock_ib_sched; struct amdgpu_doorbell_index doorbell_index; int asic_reset_res; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 676cad1..63cad74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2759,6 +2759,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); mutex_init(&adev->lock_reset); + mutex_init(&adev->lock_ib_sched); mutex_init(&adev->virt.dpm_mutex); mutex_init(&adev->psp.mutex); @@ -3795,7 +3796,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, if (r) return r; + mutex_lock(&tmp_adev->lock_ib_sched); r = amdgpu_device_ip_resume_phase2(tmp_adev); + mutex_unlock(&tmp_adev->lock_ib_sched); if (r) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index e1bad99..cd6082d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -233,8 +233,10 @@ static struct 
dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) if (finished->error < 0) { DRM_INFO("Skip scheduling IBs!\n"); } else { + mutex_lock(&ring->adev->lock_ib_sched); r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job, &fence); + mutex_unlock(&ring->adev->lock_ib_sched); if (r) DRM_ERROR("Error scheduling IBs (%d)\n", r); } -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx