Re: [RFC PATCH 1/1] drm/amdgpu: wait for sched to become ready on job submit

I think that the IB tests already set the ready flag to false when something goes wrong, but for the ring tests you probably want to double check and maybe generalize that.
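
Something along these lines, just as a sketch (take the exact helper names with a grain of salt, I'm assuming amdgpu_ring_test_ring() and DRM_DEV_ERROR() from memory here):

/* sketch only: let the test result own the ready flag, the same way the
 * IB tests already do it */
int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
{
        int r = amdgpu_ring_test_ring(ring);

        if (r)
                DRM_DEV_ERROR(ring->adev->dev,
                              "ring %s test failed (%d)\n",
                              ring->name, r);

        /* the scheduler only becomes unusable when the ring itself is broken */
        ring->sched.ready = !r;
        return r;
}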

Christian.

On 24.02.20 at 19:15, Das, Nirmoy wrote:

[AMD Official Use Only - Internal Distribution Only]


Thanks Christian. I will try to send an updated patch soon.


From: Koenig, Christian <Christian.Koenig@xxxxxxx>
Sent: Monday, February 24, 2020, 18:06
To: Nirmoy Das
Cc: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander; Liu, Monk; Li, Dennis; Das, Nirmoy
Subject: Re: [RFC PATCH 1/1] drm/amdgpu: wait for sched to become ready on job submit

Hi Nirmoy,

On 24.02.2020 17:48, Nirmoy Das <nirmoy.aiemd@xxxxxxxxx> wrote:
On reset, amdgpu can temporarily set a drm scheduler's ready status to false. drm job
init will fail if none of the drm schedulers for a HW IP are ready. This patch tries to make
the kernel's internal job submission helper, amdgpu_job_submit(), a bit more fault tolerant.

I don't think this approach makes sense. Since the ready status is a front-end property, we should rather stop setting it to false during reset.

Instead we should only set it to false when the ring/IB test fails and we can't bring the ring back to life again.
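
E.g. the recovery path could look roughly like this (just a sketch, amdgpu_device_recover_rings() is a made-up name for illustration, the real re-init lives in the gpu_recover path):

/* sketch: the reset path leaves sched.ready alone, only the post-reset
 * ring test can mark a scheduler as unusable */
static void amdgpu_device_recover_rings(struct amdgpu_device *adev)
{
        int i;

        for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
                struct amdgpu_ring *ring = adev->rings[i];

                if (!ring || !ring->funcs->test_ring)
                        continue;

                /* on failure the test helper drops sched.ready, on success
                 * submitters never see a temporary "not ready" state */
                if (amdgpu_ring_test_helper(ring))
                        dev_err(adev->dev,
                                "ring %s didn't come back after reset\n",
                                ring->name);
        }
}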

Christian.


Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 35 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  6 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c     |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c     |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c      |  2 +-
 7 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index d42be880a236..0745df80112f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -139,7 +139,38 @@ void amdgpu_job_free(struct amdgpu_job *job)
         kfree(job);
 }
 
-int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
+static int amdgpu_job_try_init(struct amdgpu_device *adev,
+                              struct drm_sched_job *base,
+                              struct drm_sched_entity *entity,
+                              void *owner)
+{
+       int r, i;
+
+       r = drm_sched_job_init(base, entity, owner);
+       if (r == -ENOENT) {
+               /* retry till we come out of reset phase */
+               while (!mutex_trylock(&adev->lock_reset))
+                       msleep(10);
+               /* retry for up to a second for the sched to get ready */
+               for (i = 0; i < 100; i++) {
+                       msleep(10);
+                       r = drm_sched_job_init(base, entity, owner);
+                       if (r != -ENOENT)
+                               break;
+               }
+
+               mutex_unlock(&adev->lock_reset);
+               /* if the job still fails to initialize after all these
+                * retries, the IP is unrecoverable */
+               if (r == -ENOENT)
+                       return -ENODEV;
+       }
+
+       return r;
+}
+
+int amdgpu_job_submit(struct amdgpu_device *adev, struct amdgpu_job *job,
+                     struct drm_sched_entity *entity,
                       void *owner, struct dma_fence **f)
 {
         enum drm_sched_priority priority;
@@ -149,7 +180,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
         if (!f)
                 return -EINVAL;
 
-       r = drm_sched_job_init(&job->base, entity, owner);
+       r = amdgpu_job_try_init(adev, &job->base, entity, owner);
         if (r)
                 return r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 2e2110dddb76..fed87e96cacc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -70,8 +70,9 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 
 void amdgpu_job_free_resources(struct amdgpu_job *job);
 void amdgpu_job_free(struct amdgpu_job *job);
-int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
-                     void *owner, struct dma_fence **f);
+int amdgpu_job_submit(struct amdgpu_device *adev, struct amdgpu_job *job,
+                     struct drm_sched_entity *entity, void *owner,
+                     struct dma_fence **f);
 int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
                              struct dma_fence **fence);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 660867cf2597..adfde07eb75f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2066,7 +2066,7 @@ static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
         if (r)
                 goto error_free;
 
-       r = amdgpu_job_submit(job, &adev->mman.entity,
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
                               AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
         if (r)
                 goto error_free;
@@ -2137,7 +2137,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
         if (direct_submit)
                 r = amdgpu_job_submit_direct(job, ring, fence);
         else
-               r = amdgpu_job_submit(job, &adev->mman.entity,
+               r = amdgpu_job_submit(adev, job, &adev->mman.entity,
                                       AMDGPU_FENCE_OWNER_UNDEFINED, fence);
         if (r)
                 goto error_free;
@@ -2231,7 +2231,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
 
         amdgpu_ring_pad_ib(ring, &job->ibs[0]);
         WARN_ON(job->ibs[0].length_dw > num_dw);
-       r = amdgpu_job_submit(job, &adev->mman.entity,
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
                               AMDGPU_FENCE_OWNER_UNDEFINED, fence);
         if (r)
                 goto error_free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 5fd32ad1c575..8ff97b24914e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -1104,7 +1104,7 @@ static int amdgpu_uvd_send_msg(struct amdgpu_ring *ring, struct amdgpu_bo *bo,
                 if (r)
                         goto err_free;
 
-               r = amdgpu_job_submit(job, &adev->uvd.entity,
+               r = amdgpu_job_submit(adev, job, &adev->uvd.entity,
                                       AMDGPU_FENCE_OWNER_UNDEFINED, &f);
                 if (r)
                         goto err_free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index 59ddba137946..e721d3367783 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -554,7 +554,7 @@ static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle,
         if (direct)
                 r = amdgpu_job_submit_direct(job, ring, &f);
         else
-               r = amdgpu_job_submit(job, &ring->adev->vce.entity,
+               r = amdgpu_job_submit(ring->adev, job, &ring->adev->vce.entity,
                                       AMDGPU_FENCE_OWNER_UNDEFINED, &f);
         if (r)
                 goto err;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 4cc7881f438c..b536962c22d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -100,7 +100,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
         WARN_ON(ib->length_dw == 0);
         amdgpu_ring_pad_ib(ring, ib);
         WARN_ON(ib->length_dw > p->num_dw_left);
-       r = amdgpu_job_submit(p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
+       r = amdgpu_job_submit(p->adev, p->job, entity, AMDGPU_FENCE_OWNER_VM, &f);
         if (r)
                 goto error;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 9775eca6fe43..a4aaa2a1f878 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -377,7 +377,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
         job->vm_needs_flush = true;
         job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
         amdgpu_ring_pad_ib(ring, &job->ibs[0]);
-       r = amdgpu_job_submit(job, &adev->mman.entity,
+       r = amdgpu_job_submit(adev, job, &adev->mman.entity,
                               AMDGPU_FENCE_OWNER_UNDEFINED, &fence);
         if (r)
                 goto error_submit;
--
2.25.0




_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
