[AMD Official Use Only - AMD Internal Distribution Only] Please correct the commit subject before pushing the change: drma -> drm. Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou Sent: Friday, May 31, 2024 18:49 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 4/5] drma/amdgpu: set fatal flag for RAS recovery PMFW needs the flag to know the reason of mode1. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 2 +- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index fb5fc1fe6ad0..f55bff59052f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -940,7 +940,7 @@ int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev, if (adev->gfx.ras && adev->gfx.ras->ras_block.hw_ops && adev->gfx.ras->ras_block.hw_ops->query_ras_error_count) adev->gfx.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ff2d34dc9718..2071e30d7e56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2070,7 +2070,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * if (poison_stat && !con->is_rma) { dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", block_obj->ras_comm.name); - amdgpu_ras_reset_gpu(adev); + 
amdgpu_ras_reset_gpu(adev, false); } if (!poison_stat) @@ -2825,7 +2825,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_fini(&err_data); if (err_cnt && con->is_rma) - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); mutex_lock(&con->umc_ecc_log.lock); if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, @@ -2888,7 +2888,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return 0; @@ -3815,7 +3815,7 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) amdgpu_ras_set_fed(adev, true); ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); } } @@ -3996,7 +3996,7 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ret; } -int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 37e1c93c243d..ed5793458a70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -878,7 +878,7 @@ bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev); int amdgpu_ras_is_supported(struct amdgpu_device *adev, unsigned int block); -int amdgpu_ras_reset_gpu(struct amdgpu_device *adev); +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, bool fatal); struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 151f83ea803b..f976b6deb42d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -129,7 +129,7 @@ int 
amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, if (amdgpu_sriov_vf(adev)) return AMDGPU_RAS_SUCCESS; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 4a72ff8d8d80..2596a1c2a64e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -198,7 +198,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, if ((err_data->ue_count || err_data->de_count) && (reset || (con && con->is_rma))) { con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return AMDGPU_RAS_SUCCESS; @@ -247,7 +247,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, if (reset || (err_data.err_addr_cnt && con && con->is_rma)) { con->gpu_reset_flags |= reset; - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return 0; @@ -266,7 +266,7 @@ int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, * let MCA notifier do page retirement. 
*/ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c index 9cd221ed240c..07c24704c4b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c @@ -98,7 +98,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, } if (con && !con->is_rma) - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, false); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index b8fc9e126e0d..0935ed57a906 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -414,7 +414,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device /* ras_controller_int is dedicated for nbif ras error, * not the global interrupt for sync flood */ - amdgpu_ras_reset_gpu(adev); + amdgpu_ras_reset_gpu(adev, true); } amdgpu_ras_error_data_fini(&err_data); -- 2.34.1