On 3/17/2025 8:16 PM, Alex Deucher wrote:
> The gfx and page queues are per instance, so track them
> per instance.
>
> v2: drop extra parameter (Lijo)
>
> Fixes: fdbfaaaae06b ("drm/amdgpu: Improve SDMA reset logic with guilty queue tracking")
> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>

Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>

Thanks,
Lijo

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  7 +++---
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 27 ++++++++++++------------
>  2 files changed, 18 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 8e7e794ff136f..dc1a81c2f9af7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
>  	uint64_t sdma_fw_gpu_addr;
>  	uint32_t *sdma_fw_ptr;
>  	struct mutex engine_reset_mutex;
> +	/* track guilty state of GFX and PAGE queues */
> +	bool gfx_guilty;
> +	bool page_guilty;
> +
>  };
>
>  enum amdgpu_sdma_ras_memory_id {
> @@ -127,9 +131,6 @@ struct amdgpu_sdma {
>  	uint32_t *ip_dump;
>  	uint32_t supported_reset;
>  	struct list_head reset_callback_list;
> -	/* track guilty state of GFX and PAGE queues */
> -	bool gfx_guilty;
> -	bool page_guilty;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index 927db7a080f0a..def68f4802089 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -672,12 +672,11 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: used to restore wptr when restart
> - * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the gfx DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
> +static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
>  {
>  	struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
>  	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -714,7 +713,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>  	/* For the guilty queue, set RPTR to the current wptr to skip bad commands;
>  	 * if it is not a guilty queue, restore cached_rptr and continue execution.
>  	 */
> -	if (guilty)
> +	if (adev->sdma.instance[i].gfx_guilty)
>  		rwptr = ring->wptr;
>  	else
>  		rwptr = ring->cached_rptr;
> @@ -779,12 +778,11 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: boolean to say restore needed or not
> - * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the page DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
> +static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
>  {
>  	struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
>  	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -803,7 +801,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
>  	/* For the guilty queue, set RPTR to the current wptr to skip bad commands;
>  	 * if it is not a guilty queue, restore cached_rptr and continue execution.
>  	 */
> -	if (guilty)
> +	if (adev->sdma.instance[i].page_guilty)
>  		rwptr = ring->wptr;
>  	else
>  		rwptr = ring->cached_rptr;
> @@ -989,9 +987,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
>  		uint32_t temp;
>
>  		WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
> -		sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
> +		sdma_v4_4_2_gfx_resume(adev, i, restore);
>  		if (adev->sdma.has_page_queue)
> -			sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
> +			sdma_v4_4_2_page_resume(adev, i, restore);
>
>  		/* set utc l1 enable flag always to 1 */
>  		temp = RREG32_SDMA(i, regSDMA_CNTL);
> @@ -1446,6 +1444,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>
>  	for (i = 0; i < adev->sdma.num_instances; i++) {
>  		mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
> +		/* Initialize guilty flags for GFX and PAGE queues */
> +		adev->sdma.instance[i].gfx_guilty = false;
> +		adev->sdma.instance[i].page_guilty = false;
> +
>  		ring = &adev->sdma.instance[i].ring;
>  		ring->ring_obj = NULL;
>  		ring->use_doorbell = true;
> @@ -1507,9 +1509,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>  	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>  	if (r)
>  		return r;
> -	/* Initialize guilty flags for GFX and PAGE queues */
> -	adev->sdma.gfx_guilty = false;
> -	adev->sdma.page_guilty = false;
>
>  	return r;
>  }
> @@ -1686,9 +1685,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
>  		return -EINVAL;
>
>  	/* Check if this queue is the guilty one */
> -	adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
> +	adev->sdma.instance[instance_id].gfx_guilty =
> +		sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
>  	if (adev->sdma.has_page_queue)
> -		adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
> +		adev->sdma.instance[instance_id].page_guilty =
> +			sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
>
>  	/* Cache the rptr before reset, after the reset,
>  	 * all of the registers will be reset to 0
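
For anyone skimming the thread, the functional point of the patch is that the
guilty flags move from the device-wide struct amdgpu_sdma into the per-instance
struct amdgpu_sdma_instance, so resetting one SDMA instance can no longer
overwrite the guilty verdict recorded for another. Below is a minimal
standalone sketch of that before/after behavior. The structs and the
stop_queue() helper are simplified stand-ins, not the actual kernel types or
functions:

/* Standalone illustration only -- simplified stand-ins, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define NUM_INSTANCES 2

struct sdma_instance {
	bool gfx_guilty;	/* new home: per-instance (this patch) */
};

struct sdma {
	bool gfx_guilty;	/* old home: one flag shared by all instances */
	struct sdma_instance instance[NUM_INSTANCES];
};

/* Hypothetical reduction of sdma_v4_4_2_stop_queue(): record whether this
 * instance's gfx queue was found guilty, in both layouts for comparison. */
static void stop_queue(struct sdma *sdma, unsigned int id, bool guilty)
{
	sdma->gfx_guilty = guilty;		/* old: clobbers prior verdicts */
	sdma->instance[id].gfx_guilty = guilty;	/* new: isolated per instance */
}

int main(void)
{
	struct sdma sdma = { 0 };

	stop_queue(&sdma, 0, true);	/* instance 0's gfx queue hangs */
	stop_queue(&sdma, 1, false);	/* instance 1 is reset, not guilty */

	/* Old layout: instance 0's guilty verdict has been lost. */
	printf("device-wide flag: gfx_guilty=%d\n", sdma.gfx_guilty);

	/* New layout: each instance still remembers its own verdict. */
	for (unsigned int i = 0; i < NUM_INSTANCES; i++)
		printf("instance %u: gfx_guilty=%d\n",
		       i, sdma.instance[i].gfx_guilty);
	return 0;
}

Running the sketch, the device-wide flag reports 0 after the second reset even
though instance 0's queue was guilty, while the per-instance fields keep both
verdicts, which is why the resume paths above now read
adev->sdma.instance[i].gfx_guilty instead of a shared flag.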