On 2/18/2025 8:36 AM, Sathishkumar S wrote: > Add ring reset function callback for JPEG4_0_3 to > recover from job timeouts without a full gpu reset. > > V2: > - sched->ready flag shouldn't be modified by HW backend (Christian) > > V3: > - Don't modify sched/job-submission state from HW backend (Christian) > - Implement per-core reset sequence > > Signed-off-by: Sathishkumar S <sathishkumar.sundararaju@xxxxxxx> > Acked-by: Christian König <christian.koenig@xxxxxxx> > Reviewed-by: Leo Liu <leo.liu@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 49 +++++++++++++++++++++--- > 1 file changed, 43 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c > index c67ba961de91..f10231c22c15 100644 > --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c > @@ -204,14 +204,10 @@ static int jpeg_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > - /* TODO: Add queue reset mask when FW fully supports it */ > - adev->jpeg.supported_reset = > - amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]); > + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_QUEUE; > r = amdgpu_jpeg_sysfs_reset_mask_init(adev); > - if (r) > - return r; > > - return 0; > + return r; > } > > /** > @@ -231,6 +227,7 @@ static int jpeg_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block) > return r; > > amdgpu_jpeg_sysfs_reset_mask_fini(adev); > + > r = amdgpu_jpeg_sw_fini(adev); > > return r; > @@ -1099,6 +1096,45 @@ static int jpeg_v4_0_3_process_interrupt(struct amdgpu_device *adev, > return 0; > } > > +static void jpeg_v4_0_3_core_stall_reset(struct amdgpu_ring *ring) > +{ > + struct amdgpu_device *adev = ring->adev; > + int jpeg_inst = GET_INST(JPEG, ring->me); > + int reg_offset = jpeg_v4_0_3_core_reg_offset(ring->pipe); > + > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regUVD_JMI0_UVD_JMI_CLIENT_STALL, > + reg_offset, 0x1F); > + 
SOC15_WAIT_ON_RREG(JPEG, jpeg_inst, > + regUVD_JMI0_UVD_JMI_CLIENT_CLEAN_STATUS, > + 0x1F, 0x1f); > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regUVD_JMI0_JPEG_LMI_DROP, > + reg_offset, 0x1F); > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regJPEG_CORE_RST_CTRL, > + reg_offset, 1 << ring->pipe); > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regUVD_JMI0_UVD_JMI_CLIENT_STALL, > + reg_offset, 0x00); > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regUVD_JMI0_JPEG_LMI_DROP, > + reg_offset, 0x00); > + WREG32_SOC15_OFFSET(JPEG, jpeg_inst, > + regJPEG_CORE_RST_CTRL, > + reg_offset, 0x00); > +} > + > +static int jpeg_v4_0_3_ring_reset(struct amdgpu_ring *ring, unsigned int vmid) > +{ > + if (amdgpu_sriov_vf(ring->adev)) > + return -EINVAL; -EOPNOTSUPP would be more appropriate here. Since ring reset is not supported on a VF, the same check could also be applied when initializing the adev->jpeg.supported_reset mask, so that the sysfs options are not created for VFs. Thanks, Lijo > + > + jpeg_v4_0_3_core_stall_reset(ring); > + jpeg_v4_0_3_start_jrbc(ring); > + return amdgpu_ring_test_helper(ring); > +} > + > static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = { > .name = "jpeg_v4_0_3", > .early_init = jpeg_v4_0_3_early_init, > @@ -1145,6 +1181,7 @@ static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = { > .emit_wreg = jpeg_v4_0_3_dec_ring_emit_wreg, > .emit_reg_wait = jpeg_v4_0_3_dec_ring_emit_reg_wait, > .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, > + .reset = jpeg_v4_0_3_ring_reset, > }; > > static void jpeg_v4_0_3_set_dec_ring_funcs(struct amdgpu_device *adev)