[AMD Official Use Only - General] Series is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou Sent: Monday, March 18, 2024 15:26 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 3/3] drm/amdgpu: make reset method configurable for RAS poison Each RAS block has different requirement for gpu reset in poison consumption handling. Add support for mmhub RAS poison consumption handling. v2: remove the mmhub poison support for kfd int v10. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 4 ++-- .../gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 13 +++++++----- .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 9 +++++---- .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 20 ++++++++++++++----- 8 files changed, 40 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 0b4910108f61..66753940bb4d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -760,7 +760,7 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev) } void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) + enum amdgpu_ras_block block, uint32_t reset) { amdgpu_umc_poison_handler(adev, block, reset); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 03bf20e0e3da..ad50c7bbc326 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -400,7 +400,7 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset); + enum amdgpu_ras_block block, uint32_t reset); bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev); bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem); void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e32a186c2de1..58fe7bebdf1b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2045,7 +2045,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * } } - amdgpu_umc_poison_handler(adev, obj->head.block, false); + amdgpu_umc_poison_handler(adev, obj->head.block, 0); if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -2698,7 +2698,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_dec(&con->page_retirement_req_cnt); amdgpu_umc_bad_page_polling_timeout(adev, - false, MAX_UMC_POISON_POLLING_TIME_ASYNC); + 0, MAX_UMC_POISON_POLLING_TIME_ASYNC); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20436f81856a..2c02585dcbff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -186,9 +186,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, amdgpu_umc_handle_bad_pages(adev, ras_error_status); if (err_data->ue_count && reset) { - /* use mode-2 reset for poison consumption */ - if (!entry) - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -196,7 +194,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, } int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, - bool reset, uint32_t timeout_ms) + uint32_t reset, uint32_t timeout_ms) { struct ras_err_data err_data; struct ras_common_if head = { @@ -238,8 +236,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, if (reset) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - /* use mode-2 reset for poison consumption */ - con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; + con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -247,7 +244,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, } int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset) + enum amdgpu_ras_block block, uint32_t reset) { int ret = AMDGPU_RAS_SUCCESS; @@ -311,7 +308,8 @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry) { - return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); + return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, + AMDGPU_RAS_GPU_RESET_MODE1_RESET); } int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 26d2ae498daf..4365a20eeb49 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -103,7 +103,7 @@ struct amdgpu_umc { int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev); int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block); int amdgpu_umc_poison_handler(struct amdgpu_device *adev, - enum amdgpu_ras_block block, bool reset); + enum amdgpu_ras_block block, uint32_t reset); int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry); @@ -123,5 +123,5 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev, umc_func func, void *data); int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, - bool reset, uint32_t timeout_ms); + uint32_t reset, uint32_t timeout_ms); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 650da18b0d87..740f89aafbc0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -134,6 +134,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, { enum amdgpu_ras_block block = 0; int old_poison, ret = -EINVAL; + uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); if (!p) @@ -153,6 +154,8 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, case SOC15_IH_CLIENTID_UTCL2: ret = kfd_dqm_evict_pasid(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: @@ -160,6 +163,7 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; default: break; @@ -170,17 +174,16 @@ static void event_interrupt_poison_consumption(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset * resetting queue fails, fallback to gpu reset solution */ - if (!ret) { + if (!ret) dev_warn(dev->adev->dev, "RAS poison consumption, unmap queue flow succeeded: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - } else { + else dev_warn(dev->adev->dev, "RAS poison consumption, fall back to gpu reset flow: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); - } + + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } static bool event_interrupt_isr_v10(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 7e2859736a55..d3d6b5c180b3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -193,6 +193,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, { enum amdgpu_ras_block block = 0; int ret = -EINVAL; + uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); if (!p) @@ -212,10 +213,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, if (dev->dqm->ops.reset_queues) ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC21_INTSRC_SDMA_ECC: default: block = AMDGPU_RAS_BLOCK__GFX; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; } @@ -223,10 +227,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset resetting queue fails, fallback to gpu reset solution */ - if (!ret) - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - else - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } static bool event_interrupt_isr_v11(struct kfd_node *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 11641f4645e6..2a37ab7a7150 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, { enum amdgpu_ras_block block = 0; int old_poison, ret = -EINVAL; + uint32_t reset = 0; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); if (!p) @@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_UTCL2: ret = kfd_dqm_evict_pasid(dev->dqm, pasid); block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; + break; + case SOC15_IH_CLIENTID_VMC: + case SOC15_IH_CLIENTID_VMC1: + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__MMHUB; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; break; case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA1: @@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, case SOC15_IH_CLIENTID_SDMA3: case SOC15_IH_CLIENTID_SDMA4: block = AMDGPU_RAS_BLOCK__SDMA; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; default: break; @@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, /* resetting queue passes, do page retirement without gpu reset * resetting queue fails, fallback to gpu reset solution */ - if (!ret) { + if (!ret) dev_warn(dev->adev->dev, "RAS poison consumption, unmap queue flow succeeded: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false); - } else { + else dev_warn(dev->adev->dev, "RAS poison consumption, fall back to gpu reset flow: client id %d\n", client_id); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true); - } + + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } static bool context_id_expected(struct kfd_dev *dev) -- 2.34.1