On 2/10/2025 3:26 PM, Jesse.zhang@xxxxxxx wrote:
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
>
> This commit introduces several improvements to the SDMA reset logic:
>
> 1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
>    before a reset, ensuring proper state restoration after reset.
>
> 2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` structure
>    to track which queue (GFX or PAGE) caused a timeout or error.
>
> 3. Replaced the `caller` parameter with a `guilty` boolean in the reset and resume
>    functions to simplify the logic and handle resets based on the guilty state.
>
> 4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
>    `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is guilty.
>
> v2:
>    1. Replace the caller with a guilty bool.
>       If the queue is the guilty one, set the rptr and wptr to the saved wptr value,
>       else, set the rptr and wptr to the saved rptr value. (Alex)
>    2. Cache the rptr before the reset. (Alex)
>
> v3: Add a new ring callback, is_guilty(), which will get called to check if
>     the ring in amdgpu_job_timedout() is actually the guilty ring. If it's not,
>     we can return early via goto exit. (Alex)
>
> v4: Cache the rptr for the page ring.
>
> v5: Update the register addresses to correctly use the page ring registers
>     (regSDMA_PAGE_RB_RPTR) in page resume.
>
> Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 96 ++++++++++++++++++++----
>  6 files changed, 106 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 100f04475943..ce3e7a9d6688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>          return DRM_GPU_SCHED_STAT_ENODEV;
>      }
>
> +    /* Check if the ring is actually guilty of causing the timeout.
> +     * If not, skip error handling and fence completion.
> +     */
> +    if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
> +        if (!ring->funcs->is_guilty(ring)) {
> +            dev_err(adev->dev, "ring %s timeout, but not guilty\n",
> +                s_job->sched->name);
> +            goto exit;
> +        }
> +    }
>      /*
>       * Do the coredump immediately after a job timeout to get a very
>       * close dump/snapshot/representation of GPU's current error status
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index a6e28fe3f8d6..20cd21df38ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>      ring->buf_mask = (ring->ring_size / 4) - 1;
>      ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
>          0xffffffffffffffff : ring->buf_mask;
> +    /* Initialize cached_rptr to 0 */
> +    ring->cached_rptr = 0;
>
>      /* Allocate ring buffer */
>      if (ring->is_mes_queue) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 04af26536f97..182aa535d395 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
>      void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
>      int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
>      void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
> +    bool (*is_guilty)(struct amdgpu_ring *ring);
>  };
>
>  struct amdgpu_ring {
> @@ -306,6 +307,8 @@ struct amdgpu_ring {
>
>      bool is_sw_ring;
>      unsigned int entry_index;
> +    /* store the cached rptr to restore after reset */
> +    uint64_t cached_rptr;
>
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 8864a9d7455b..02d3685d10fc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
>      if (!funcs)
>          return;
>
> +    /* Ensure the reset_callback_list is initialized */
> +    if (!adev->sdma.reset_callback_list.next) {
> +        INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
> +    }

Keeping it in sw_init looks just fine.

>      /* Initialize the list node in the callback structure */
>      INIT_LIST_HEAD(&funcs->list);
>
> @@ -513,7 +517,7 @@ int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id,
>       */
>      if (!amdgpu_ring_sched_ready(gfx_ring)) {
>          drm_sched_wqueue_stop(&gfx_ring->sched);
> -        gfx_sched_stopped = true;;
> +        gfx_sched_stopped = true;
>      }
>
>      if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index df5c9f7a4374..d84c3eccc510 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -126,6 +126,9 @@ struct amdgpu_sdma {
>      uint32_t *ip_dump;
>      uint32_t supported_reset;
>      struct list_head reset_callback_list;
> +    /* track guilty state of GFX and PAGE queues */
> +    bool gfx_guilty;
> +    bool page_guilty;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index c297b4a13680..ad30077ade6f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -671,11 +671,12 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: used to restore wptr when restart
> + * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the gfx DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
> +static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
>  {
>      struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
>      u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -710,10 +711,19 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>
>      /* Initialize the ring buffer's read and write pointers */
>      if (restore) {
> -        WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +        if (guilty) {

Keeping an intermediate variable like u64 rwptr could simplify -

	if (guilty)
		rwptr = ring->wptr;
	else
		rwptr = ring->cached_rptr;

> +            /* for the guilty queue, set RPTR to the current wptr to skip bad commands */
> +            WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +        } else {
> +            /* not the guilty queue, restore the cache_rptr to continue execution */
> +            WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->cached_rptr << 2));
> +        }
>      } else {
>          WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
>          WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
> @@ -768,11 +778,12 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: boolean to say restore needed or not
> + * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the page DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
> +static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
>  {
>      struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
>      u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -789,10 +800,19 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
>
>      /* Initialize the ring buffer's read and write pointers */
>      if (restore) {
> -        WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
> -        WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +        if (guilty) {

Same comment as above.

Shouldn't the guilty state be reset post queue reset?
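For the rwptr simplification, something along these lines is what I had in mind (untested sketch, not a drop-in change; the page variant would use the regSDMA_PAGE_* offsets instead of regSDMA_GFX_*):

	u64 rwptr;

	/* resume the guilty queue from its saved wptr so the bad packets are
	 * skipped; resume an innocent queue from its cached rptr
	 */
	if (guilty)
		rwptr = ring->wptr;
	else
		rwptr = ring->cached_rptr;

	WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(rwptr << 2));
	WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(rwptr << 2));
	WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(rwptr << 2));
	WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(rwptr << 2));

And for the guilty state, I was thinking of something like clearing both flags once the queue has been reset and resumed (again only a sketch, the exact place is up to you):

	/* queue is reset and resumed, the guilty tracking is stale now */
	adev->sdma.gfx_guilty = false;
	adev->sdma.page_guilty = false;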
Thanks,
Lijo

> +            /* for the guilty queue, set RPTR to the current wptr to skip bad commands */
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, lower_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR, lower_32_bits(ring->wptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +        } else {
> +            /* not the guilty queue, restore the cached_rptr to continue execution */
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, lower_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, upper_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR, lower_32_bits(ring->cached_rptr << 2));
> +            WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR_HI, upper_32_bits(ring->cached_rptr << 2));
> +        }
>      } else {
>          WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, 0);
>          WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, 0);
> @@ -968,9 +988,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
>          uint32_t temp;
>
>          WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
> -        sdma_v4_4_2_gfx_resume(adev, i, restore);
> +        sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
>          if (adev->sdma.has_page_queue)
> -            sdma_v4_4_2_page_resume(adev, i, restore);
> +            sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
>
>          /* set utc l1 enable flag always to 1 */
>          temp = RREG32_SDMA(i, regSDMA_CNTL);
> @@ -1480,7 +1500,9 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>      r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>      if (r)
>          return r;
> -    INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
> +    /* Initialize guilty flags for GFX and PAGE queues */
> +    adev->sdma.gfx_guilty = false;
> +    adev->sdma.page_guilty = false;
>
>      return r;
>  }
> @@ -1606,6 +1628,34 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block *ip_block)
>      return 0;
>  }
>
> +static bool sdma_v4_4_2_is_queue_selected(struct amdgpu_device *adev, uint32_t instance_id, bool is_page_queue)
> +{
> +    uint32_t reg_offset = is_page_queue ?
> +        regSDMA_PAGE_CONTEXT_STATUS : regSDMA_GFX_CONTEXT_STATUS;
> +    uint32_t context_status = RREG32(sdma_v4_4_2_get_reg_offset(adev, instance_id, reg_offset));
> +
> +    /* Check if the SELECTED bit is set */
> +    return (context_status & SDMA_GFX_CONTEXT_STATUS__SELECTED_MASK) != 0;
> +}
> +
> +static bool sdma_v4_4_2_ring_is_guilty(struct amdgpu_ring *ring)
> +{
> +    struct amdgpu_device *adev = ring->adev;
> +    uint32_t instance_id = ring->me;
> +
> +    return sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
> +}
> +
> +static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring)
> +{
> +    struct amdgpu_device *adev = ring->adev;
> +    uint32_t instance_id = ring->me;
> +
> +    if (adev->sdma.has_page_queue)
> +        return false;
> +
> +    return sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
> +}
> +
>  static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
>  {
>      struct amdgpu_device *adev = ring->adev;
> @@ -1616,11 +1666,29 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
>  static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id, int src)
>  {
>      u32 inst_mask;
> +    uint64_t rptr;
>      struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
>
>      if (amdgpu_sriov_vf(adev))
>          return -EINVAL;
>
> +    /* Check if this queue is the guilty one */
> +    adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
> +    if (adev->sdma.has_page_queue)
> +        adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
> +
> +    /* Cache the rptr before reset, after the reset,
> +     * all of the registers will be reset to 0
> +     */
> +    rptr = amdgpu_ring_get_rptr(ring);
> +    ring->cached_rptr = rptr;
> +    /* Cache the rptr for the page queue if it exists */
> +    if (adev->sdma.has_page_queue) {
> +        struct amdgpu_ring *page_ring = &adev->sdma.instance[instance_id].page;
> +        rptr = amdgpu_ring_get_rptr(page_ring);
> +        page_ring->cached_rptr = rptr;
> +    }
> +
>      /* stop queue */
>      inst_mask = 1 << ring->me;
>      sdma_v4_4_2_inst_gfx_stop(adev, inst_mask);
> @@ -2055,6 +2123,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
>      .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
>      .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
>      .reset = sdma_v4_4_2_reset_queue,
> +    .is_guilty = sdma_v4_4_2_ring_is_guilty,
>  };
>
>  static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
> @@ -2086,6 +2155,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
>      .emit_wreg = sdma_v4_4_2_ring_emit_wreg,
>      .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
>      .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
> +    .is_guilty = sdma_v4_4_2_page_ring_is_guilty,
>  };
>
>  static void sdma_v4_4_2_set_ring_funcs(struct amdgpu_device *adev)