Re: [PATCH] drm/amdgpu: add missing error handling for amdgpu_ring_alloc()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Error handling is to resolve or at least mitigate problems caused by the user or the hardware.

When the driver tries to allocate more than allowed from the ring then that's a driver bug and not something for error handling.

So the error handling here doesn't seem to make sense in the first place.

Regards,
Christian.

Am 03.07.24 um 09:23 schrieb Zhou, Bob:
[AMD Official Use Only - AMD Internal Distribution Only]

Hi Christian,

In making the v2 version patch process, I check lots of cases for amdgpu_ring_alloc(). I think checking the return value maybe better.
For allocating submission size from amdgpu_ring_alloc(), the action is allowed to fail so that rerun allocating submission rather than drop the return value.
For many cases in gpu driver, it still needs check the call result to conduct the error handling. If drop the return value, in those case it causes some unexpected result.
If the error occurs, the command submission should be skip and wait the next action.

Regards,
Bob

-----Original Message-----
From: Koenig, Christian <Christian.Koenig@xxxxxxx>
Sent: 2024年6月21日 17:28
To: Zhou, Bob <Bob.Zhou@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Huang, Tim <Tim.Huang@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>
Subject: Re: [PATCH] drm/amdgpu: add missing error handling for amdgpu_ring_alloc()

Am 21.06.24 um 11:24 schrieb Bob Zhou:
Fix the unchecked return value warning reported by Coverity, so add
error handling.
That seems to be completely superfluous. The only case when
amdgpu_ring_alloc() returns an error is when we try to allocate more than the maximum submission size.

And in that case we will get quite a warning already.

I strongly suggest to instead drop the return value from amdgpu_ring_alloc().

Regards,
Christian.

Signed-off-by: Bob Zhou <bob.zhou@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c      | 7 +++++--
   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c      | 4 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c | 6 ++++--
   drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c      | 3 ++-
   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c        | 8 ++++++--
   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c       | 6 +++++-
   drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c       | 6 +++++-
   drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c       | 6 +++++-
   8 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 82452606ae6c..25cab6a8d478 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1005,7 +1005,8 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t xcc_
               pr_err("critical bug! too many kiq readers\n");
               goto failed_unlock;
       }
-     amdgpu_ring_alloc(ring, 32);
+     if (amdgpu_ring_alloc(ring, 32))
+             goto failed_unlock;
       amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
       r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
       if (r)
@@ -1071,7 +1072,8 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint3
       }

       spin_lock_irqsave(&kiq->ring_lock, flags);
-     amdgpu_ring_alloc(ring, 32);
+     if (amdgpu_ring_alloc(ring, 32))
+             goto failed_unlock;
       amdgpu_ring_emit_wreg(ring, reg, v);
       r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
       if (r)
@@ -1107,6 +1109,7 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t v, uint3

   failed_undo:
       amdgpu_ring_undo(ring);
+failed_unlock:
       spin_unlock_irqrestore(&kiq->ring_lock, flags);
   failed_kiq_write:
       dev_err(adev->dev, "failed to write reg:%x\n", reg); diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 3a7622611916..778941f47c96 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -768,7 +768,8 @@ void amdgpu_gmc_fw_reg_write_reg_wait(struct amdgpu_device *adev,
       }

       spin_lock_irqsave(&kiq->ring_lock, flags);
-     amdgpu_ring_alloc(ring, 32);
+     if (amdgpu_ring_alloc(ring, 32))
+             goto failed_unlock;
       amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
                                           ref, mask);
       r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT); @@
-798,6 +799,7 @@ void amdgpu_gmc_fw_reg_write_reg_wait(struct
amdgpu_device *adev,

   failed_undo:
       amdgpu_ring_undo(ring);
+failed_unlock:
       spin_unlock_irqrestore(&kiq->ring_lock, flags);
   failed_kiq:
       dev_err(adev->dev, "failed to write reg %x wait reg %x\n", reg0,
reg1); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
index d234b7ccfaaf..01864990a192 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring_mux.c
@@ -63,12 +63,14 @@ static void amdgpu_ring_mux_copy_pkt_from_sw_ring(struct amdgpu_ring_mux *mux,
               return;
       }
       if (start > end) {
-             amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start);
+             if (amdgpu_ring_alloc(real_ring, (ring->ring_size >> 2) + end - start))
+                     return;
               amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start],
                                          (ring->ring_size >> 2) - start);
               amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[0], end);
       } else {
-             amdgpu_ring_alloc(real_ring, end - start);
+             if (amdgpu_ring_alloc(real_ring, end - start))
+                     return;
               amdgpu_ring_write_multiple(real_ring, (void *)&ring->ring[start], end - start);
       }
   }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index bad232859972..d7d68e61506d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -610,7 +610,8 @@ static int vpe_ring_preempt_ib(struct amdgpu_ring
*ring)

       /* emit the trailing fence */
       ring->trail_seq += 1;
-     amdgpu_ring_alloc(ring, 10);
+     if (amdgpu_ring_alloc(ring, 10))
+             return -ENOMEM;
       vpe_ring_emit_fence(ring, ring->trail_fence_gpu_addr, ring->trail_seq, 0);
       amdgpu_ring_commit(ring);

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2929c8972ea7..810f7f7e279d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4093,7 +4093,8 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev)
               pr_err("critical bug! too many kiq readers\n");
               goto failed_unlock;
       }
-     amdgpu_ring_alloc(ring, 32);
+     if (amdgpu_ring_alloc(ring, 32))
+             goto failed_unlock;
       amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
       amdgpu_ring_write(ring, 9 |     /* src: register*/
                               (5 << 8) |      /* dst: memory */
@@ -5639,7 +5640,10 @@ static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
       amdgpu_ring_set_preempt_cond_exec(ring, false);

       ring->trail_seq += 1;
-     amdgpu_ring_alloc(ring, 13);
+     if (amdgpu_ring_alloc(ring, 13)) {
+             spin_unlock_irqrestore(&kiq->ring_lock, flags);
+             return -ENOMEM;
+     }
       gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                                ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC |
AMDGPU_FENCE_FLAG_INT);

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b7d33d78bce0..07ca9264085b 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1490,7 +1490,11 @@ static int sdma_v5_0_ring_preempt_ib(struct
amdgpu_ring *ring)

       /* emit the trailing fence */
       ring->trail_seq += 1;
-     amdgpu_ring_alloc(ring, 10);
+     r = amdgpu_ring_alloc(ring, 10);
+     if (r) {
+             DRM_ERROR("ring %d failed to be allocated\n", ring->idx);
+             return r;
+     }
       sdma_v5_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                                 ring->trail_seq, 0);
       amdgpu_ring_commit(ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index cc9e961f0078..d51d5bd7de30 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1345,7 +1345,11 @@ static int sdma_v5_2_ring_preempt_ib(struct
amdgpu_ring *ring)

       /* emit the trailing fence */
       ring->trail_seq += 1;
-     amdgpu_ring_alloc(ring, 10);
+     r = amdgpu_ring_alloc(ring, 10);
+     if (r) {
+             DRM_ERROR("ring %d failed to be allocated\n", ring->idx);
+             return r;
+     }
       sdma_v5_2_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                                 ring->trail_seq, 0);
       amdgpu_ring_commit(ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index c833b6b8373b..e82426459cc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -1371,7 +1371,11 @@ static int sdma_v6_0_ring_preempt_ib(struct
amdgpu_ring *ring)

       /* emit the trailing fence */
       ring->trail_seq += 1;
-     amdgpu_ring_alloc(ring, 10);
+     r = amdgpu_ring_alloc(ring, 10);
+     if (r) {
+             DRM_ERROR("ring %d failed to be allocated\n", ring->idx);
+             return r;
+     }
       sdma_v6_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
                                 ring->trail_seq, 0);
       amdgpu_ring_commit(ring);




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux