Re: [PATCH 2/7] drm/amdgpu: Add ring reset callback for JPEG4_0_3

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Alex,


On 1/31/2025 10:42 PM, Alex Deucher wrote:
On Fri, Jan 31, 2025 at 11:32 AM Sathishkumar S
<sathishkumar.sundararaju@xxxxxxx> wrote:
Add ring reset function callback for JPEG4_0_3 to
recover from job timeouts without a full gpu reset.

Signed-off-by: Sathishkumar S <sathishkumar.sundararaju@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 60 ++++++++++++++++++++++--
  1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index be0b3b4c8690..62d8628dccc5 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -204,9 +204,7 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
         if (r)
                 return r;

-       /* TODO: Add queue reset mask when FW fully supports it */
-       adev->jpeg.supported_reset =
-               amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
+       adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_PIPE;
         r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
         if (r)
                 return r;
@@ -231,6 +229,7 @@ static int jpeg_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block)
                 return r;

         amdgpu_jpeg_sysfs_reset_mask_fini(adev);
+
         r = amdgpu_jpeg_sw_fini(adev);

         return r;
@@ -1099,6 +1098,60 @@ static int jpeg_v4_0_3_process_interrupt(struct amdgpu_device *adev,
         return 0;
  }

+static int jpeg_v4_0_3_wait_for_idle_on_inst(struct amdgpu_ring *ring)
+{
+       struct amdgpu_device *adev = ring->adev;
+       struct amdgpu_ring *r;
+       int ret, j;
+
+       for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
+               r = &adev->jpeg.inst[ring->me].ring_dec[j];
+               r->sched.ready = false;
I think you want to call drm_sched_wqueue_stop(&r->sched); here
instead to stop further submissions to the other rings.  Note that
drm_sched_wqueue_stop() is already called from amdgpu_job_timedout()
for the queue that kicked this off.  You'll need to start them again
after the reset.
Thank you Alex for the suggestion, I will use this to stop the submissions and start them after reset. This will help save other good jpeg contexts.

Regards,
Sathish


+       }
+       /* publish update */
+       smp_rmb();
+       for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
+               r = &adev->jpeg.inst[ring->me].ring_dec[j];
+               if (r->pipe == j)
+                       continue;
+               ret = SOC15_WAIT_ON_RREG_OFFSET(JPEG, GET_INST(JPEG, ring->me),
+                                               regUVD_JRBC0_UVD_JRBC_STATUS,
+                                               jpeg_v4_0_3_core_reg_offset(j),
+                                               UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK,
+                                               UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int jpeg_v4_0_3_ring_reset(struct amdgpu_ring *ring, unsigned int vmid)
+{
+       struct amdgpu_device *adev = ring->adev;
+       struct amdgpu_ring *r;
+       int ret, j;
+
+       if (amdgpu_sriov_vf(adev))
+               return -EINVAL;
+
+       jpeg_v4_0_3_wait_for_idle_on_inst(ring);
+       jpeg_v4_0_3_stop_inst(ring->adev, ring->me);
+       jpeg_v4_0_3_start_inst(ring->adev, ring->me);
Is JPEG pipelined or can each engine only process one packet from one
queue at a time?  If it's the latter, then when you reset the engine,
you'll need to save the current rptrs and wptrs from all of the queues
on that engine before you reset it so that you can restore those after
you reset and kick them off again where they left off.  That way we
don't lose any jobs running on other queues.  SDMA has similar
limitations for chips where we can only reset the entire engine.

Alex

+       for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
+               r = &adev->jpeg.inst[ring->me].ring_dec[j];
+               jpeg_v4_0_3_start_jrbc(r);
+               ret = amdgpu_ring_test_helper(r);
+               if (ret)
+                       return ret;
+               r->sched.ready = true;
+       }
+       /* publish update */
+       smp_rmb();
+       dev_info(adev->dev, "Reset on %s succeeded\n", ring->sched.name);
+       return 0;
+}
+
  static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {
         .name = "jpeg_v4_0_3",
         .early_init = jpeg_v4_0_3_early_init,
@@ -1145,6 +1198,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = {
         .emit_wreg = jpeg_v4_0_3_dec_ring_emit_wreg,
         .emit_reg_wait = jpeg_v4_0_3_dec_ring_emit_reg_wait,
         .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
+       .reset = jpeg_v4_0_3_ring_reset,
  };

  static void jpeg_v4_0_3_set_dec_ring_funcs(struct amdgpu_device *adev)
--
2.25.1





[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux