On reset, amdgpu can set a drm sched's ready status to false temporarily. drm job init will fail if all of the drm scheds are not ready for a HW IP. This patch tries to make kernel's internal drm job submit handle, amdgpu_job_submit() a bit more fault tolerant. Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 35 +++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 5 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +- 7 files changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index d42be880a236..0745df80112f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -139,7 +139,38 @@ void amdgpu_job_free(struct amdgpu_job *job) kfree(job); } -int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, +static int amdgpu_job_try_init(struct amdgpu_device *adev, + struct drm_sched_job *base, + struct drm_sched_entity *entity, + void *owner) +{ + int r, i; + + r = drm_sched_job_init(base, entity, owner); + if (r == -ENOENT) { + /* retry till we come out of reset phase */ + while (!mutex_trylock(&adev->lock_reset)) + msleep(10); + /* retry for a second for the sched to get ready*/ + for (i = 0; i < 100; i++) { + msleep(10); + r = drm_sched_job_init(base, entity, owner); + if (r == -ENOENT) + continue; + } + + mutex_unlock(&adev->lock_reset); + /* If after all these we failed to initialize a job + * it means the IP is unrecoverable */ + if (r == -ENOENT) + return -ENODEV; + } + + return r; +} + +int amdgpu_job_submit(struct amdgpu_device *adev,struct amdgpu_job *job, + struct drm_sched_entity *entity, void *owner, struct dma_fence **f) { enum drm_sched_priority priority; @@ -149,7 +180,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, if (!f) return -EINVAL; - r = drm_sched_job_init(&job->base, entity, owner); + r = amdgpu_job_try_init(adev, &job->base, entity, owner); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 2e2110dddb76..fed87e96cacc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -70,8 +70,9 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size, void amdgpu_job_free_resources(struct amdgpu_job *job); void amdgpu_job_free(struct amdgpu_job *job); -int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, - void *owner, struct dma_fence **f); +int amdgpu_job_submit(struct amdgpu_device *adev, struct amdgpu_job *job, + struct drm_sched_entity *entity, void *owner, + struct dma_fence **f); int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, struct dma_fence **fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 660867cf2597..adfde07eb75f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2066,7 +2066,7 @@ static int amdgpu_map_buffer(struct ttm_buffer_object *bo, if (r) goto error_free; - r = amdgpu_job_submit(job, &adev->mman.entity, + r = amdgpu_job_submit(adev, job, &adev->mman.entity, AMDGPU_FENCE_OWNER_UNDEFINED, &fence); if (r) goto error_free; @@ -2137,7 +2137,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, if (direct_submit) r = amdgpu_job_submit_direct(job, ring, fence); else - r = amdgpu_job_submit(job, &adev->mman.entity, + r = amdgpu_job_submit(adev, job, &adev->mman.entity, AMDGPU_FENCE_OWNER_UNDEFINED, fence); if (r) goto error_free; @@ -2231,7 +2231,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo, amdgpu_ring_pad_ib(ring, &job->ibs[0]); WARN_ON(job->ibs[0].length_dw > num_dw); - r = amdgpu_job_submit(job, &adev->mman.entity, + r = amdgpu_job_submit(adev, job, &adev->mman.entity, AMDGPU_FENCE_OWNER_UNDEFINED, fence); if (r) goto error_free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 5fd32ad1c575..8ff97b24914e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -1104,7 +1104,7 @@ static int amdgpu_uvd_send_msg(struct amdgpu_ring *ring, struct amdgpu_bo *bo, if (r) goto err_free; - r = amdgpu_job_submit(job, &adev->uvd.entity, + r = amdgpu_job_submit(adev, job, &adev->uvd.entity, AMDGPU_FENCE_OWNER_UNDEFINED, &f); if (r) goto err_free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 59ddba137946..e721d3367783 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -554,7 +554,7 @@ static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle, if (direct) r = amdgpu_job_submit_direct(job, ring, &f); else - r = amdgpu_job_submit(job, &ring->adev->vce.entity, + r = amdgpu_job_submit(ring->adev, job, &ring->adev->vce.entity, AMDGPU_FENCE_OWNER_UNDEFINED, &f); if (r) goto err; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 4cc7881f438c..b536962c22d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -100,7 +100,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, WARN_ON(ib->length_dw == 0); amdgpu_ring_pad_ib(ring, ib); WARN_ON(ib->length_dw > p->num_dw_left); - r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f); + r = amdgpu_job_submit(p->adev, p->job, entity, AMDGPU_FENCE_OWNER_VM, &f); if (r) goto error; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 9775eca6fe43..a4aaa2a1f878 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -377,7 +377,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, job->vm_needs_flush = true; job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop; amdgpu_ring_pad_ib(ring, &job->ibs[0]); - r = amdgpu_job_submit(job, &adev->mman.entity, + r = amdgpu_job_submit(adev, job, &adev->mman.entity, AMDGPU_FENCE_OWNER_UNDEFINED, &fence); if (r) goto error_submit; -- 2.25.0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx