The gfx and page queues are per instance, so track them per
instance.

v2: drop extra parameter (Lijo)

Fixes: fdbfaaaae06b ("drm/amdgpu: Improve SDMA reset logic with guilty queue tracking")
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  7 +++---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 27 ++++++++++++------------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 8e7e794ff136f..dc1a81c2f9af7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
 	uint64_t	sdma_fw_gpu_addr;
 	uint32_t	*sdma_fw_ptr;
 	struct mutex	engine_reset_mutex;
+	/* track guilty state of GFX and PAGE queues */
+	bool		gfx_guilty;
+	bool		page_guilty;
+
 };
 
 enum amdgpu_sdma_ras_memory_id {
@@ -127,9 +131,6 @@ struct amdgpu_sdma {
 	uint32_t		*ip_dump;
 	uint32_t		supported_reset;
 	struct list_head	reset_callback_list;
-	/* track guilty state of GFX and PAGE queues */
-	bool			gfx_guilty;
-	bool			page_guilty;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 927db7a080f0a..def68f4802089 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -672,12 +672,11 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: used to restore wptr when restart
- * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the gfx DMA ring buffers and enable them.
  * Returns 0 for success, error for failure.
  */
-static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
+static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
 {
 	struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
 	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -714,7 +713,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
 	/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
 	 * It is not a guilty queue, restore cache_rptr and continue execution.
 	 */
-	if (guilty)
+	if (adev->sdma.instance[i].gfx_guilty)
 		rwptr = ring->wptr;
 	else
 		rwptr = ring->cached_rptr;
@@ -779,12 +778,11 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
  * @adev: amdgpu_device pointer
  * @i: instance to resume
  * @restore: boolean to say restore needed or not
- * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
  *
  * Set up the page DMA ring buffers and enable them.
  * Returns 0 for success, error for failure.
  */
-static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
+static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
 {
 	struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
 	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -803,7 +801,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
 	/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
 	 * It is not a guilty queue, restore cache_rptr and continue execution.
 	 */
-	if (guilty)
+	if (adev->sdma.instance[i].page_guilty)
 		rwptr = ring->wptr;
 	else
 		rwptr = ring->cached_rptr;
@@ -989,9 +987,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
 		uint32_t temp;
 
 		WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
-		sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
+		sdma_v4_4_2_gfx_resume(adev, i, restore);
 		if (adev->sdma.has_page_queue)
-			sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
+			sdma_v4_4_2_page_resume(adev, i, restore);
 
 		/* set utc l1 enable flag always to 1 */
 		temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1446,6 +1444,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
 
 	for (i = 0; i < adev->sdma.num_instances; i++) {
 		mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
+		/* Initialize guilty flags for GFX and PAGE queues */
+		adev->sdma.instance[i].gfx_guilty = false;
+		adev->sdma.instance[i].page_guilty = false;
+
 		ring = &adev->sdma.instance[i].ring;
 		ring->ring_obj = NULL;
 		ring->use_doorbell = true;
@@ -1507,9 +1509,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
 	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
 	if (r)
 		return r;
-	/* Initialize guilty flags for GFX and PAGE queues */
-	adev->sdma.gfx_guilty = false;
-	adev->sdma.page_guilty = false;
 
 	return r;
 }
@@ -1686,9 +1685,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
 		return -EINVAL;
 
 	/* Check if this queue is the guilty one */
-	adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
+	adev->sdma.instance[instance_id].gfx_guilty =
+		sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
 	if (adev->sdma.has_page_queue)
-		adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
+		adev->sdma.instance[instance_id].page_guilty =
+			sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
 
 	/* Cache the rptr before reset, after the reset,
 	 * all of the registers will be reset to 0
-- 
2.48.1
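
For readers outside the amdgpu tree: the whole fix is moving the guilty
flags from the shared struct amdgpu_sdma into each struct
amdgpu_sdma_instance, so one instance's reset no longer poisons the
resume path of the others. Below is a minimal userspace sketch of the
data-layout change, not kernel code: the field names gfx_guilty and
page_guilty follow the patch, while NUM_INSTANCES, the slimmed-down
structs, and main() are illustrative stand-ins.

#include <stdbool.h>
#include <stdio.h>

#define NUM_INSTANCES 4	/* hypothetical; the kernel uses adev->sdma.num_instances */

struct sdma_instance {
	bool gfx_guilty;	/* per-instance, as after the patch */
	bool page_guilty;
};

struct sdma {
	/* before the patch, the two bools lived here, shared by all instances */
	struct sdma_instance instance[NUM_INSTANCES];
};

int main(void)
{
	struct sdma sdma = {0};

	/* Only instance 2's gfx queue is found guilty during its reset. */
	sdma.instance[2].gfx_guilty = true;

	/* Each other instance now consults its own (false) flag on resume,
	 * so it restores cached_rptr instead of skipping ahead to wptr.
	 */
	for (int i = 0; i < NUM_INSTANCES; i++)
		printf("instance %d: gfx_guilty=%d\n", i, sdma.instance[i].gfx_guilty);

	return 0;
}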