[Public] Is this applicable for aldebaran also? Thanks, Lijo -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Hawking Zhang Sent: Tuesday, April 16, 2024 11:46 AM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison mode-2 reset is the only reliable method that can get GC/SDMA back when poison is consumed. mmhub requires mode-1 reset. Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> --- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index c368c70df3f4a..c3beb872adf8d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { enum amdgpu_ras_block block = 0; - int old_poison, ret = -EINVAL; + int old_poison; uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SE2SH: case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; - if (ret) - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC15_IH_CLIENTID_VMC: case SOC15_IH_CLIENTID_VMC1: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__MMHUB; - if (ret) - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: @@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; default: - break; + dev_warn(dev->adev->dev, + "client %d does not support poison consumption\n", client_id); + return; } kfd_signal_poison_consumed_event(dev, pasid); - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - else - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); + dev_warn(dev->adev->dev, + "poison is consumed by client %d, kick off gpu reset flow\n", +client_id); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } -- 2.17.1