Define page retirement functions for MCA platform. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 112 ++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 + drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 2 +- drivers/gpu/drm/amd/amdgpu/umc_v6_7.h | 2 + 4 files changed, 117 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index aad3c8b4c810..e97b1bd343ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -22,6 +22,118 @@ */ #include "amdgpu.h" +#include "umc_v6_7.h" + +static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst) +{ + switch (adev->ip_versions[UMC_HWIP][0]) { + case IP_VERSION(6, 7, 0): + umc_v6_7_convert_error_address(adev, + err_data, err_addr, ch_inst, umc_inst); + break; + default: + dev_warn(adev->dev, + "UMC address to Physical address translation is not supported\n"); + return AMDGPU_RAS_FAIL; + } + + return AMDGPU_RAS_SUCCESS; +} + +static int amdgpu_umc_ecc_info_query_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data) +{ + switch (adev->ip_versions[UMC_HWIP][0]) { + case IP_VERSION(6, 7, 0): + umc_v6_7_ecc_info_query_ras_error_address(adev, + err_data); + break; + default: + dev_warn(adev->dev, + "UMC error address query is not supported\n"); + return AMDGPU_RAS_FAIL; + } + + return AMDGPU_RAS_SUCCESS; +} + +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) +{ + struct ras_err_data err_data = {0, 0, 0, NULL}; + int ret = AMDGPU_RAS_FAIL; + + err_data.err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if (!err_data.err_addr) { + dev_warn(adev->dev, + "Failed to alloc memory for umc error record in MCA notifier!\n"); + return AMDGPU_RAS_FAIL; + } + + /* + * Translate UMC channel address to Physical address + */ + ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, + ch_inst, umc_inst); + if (ret) + goto out; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + +out: + kfree(err_data.err_addr); + return ret; +} + +static int amdgpu_umc_poison_handler_mca(struct amdgpu_device *adev, + struct ras_err_data *err_data, bool reset) +{ + int ret = AMDGPU_RAS_FAIL; + + err_data->err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + if (!err_data->err_addr) { + dev_warn(adev->dev, + "Failed to alloc memory for MCA RAS poison handler!\n"); + goto out2; + } + + /* + * Translate UMC channel address to Physical address + */ + ret = amdgpu_umc_ecc_info_query_error_address(adev, err_data); + if (ret) + goto out1; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + err_data->err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + +out1: + kfree(err_data->err_addr); +out2: + /* trigger gpu reset even error count is 0 for CPU MCA RAS, + * MCA notifier is responsible for page retirement if error + * count can't be queried in poison handler. + */ + if (reset) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + amdgpu_ras_reset_gpu(adev); + } + + return ret; +} static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 3629d8f292ef..659a10de29c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -98,4 +98,6 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry); +int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c index 72fd963f178b..b0da50d03af6 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c @@ -252,7 +252,7 @@ static void umc_v6_7_ecc_info_query_error_address(struct amdgpu_device *adev, } } -static void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev, +void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h index 105245d5b6e5..24382e9e5814 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h @@ -74,4 +74,6 @@ extern const uint32_t void umc_v6_7_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); +void umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev, + void *ras_error_status); #endif -- 2.35.1