Support saving bad pages after gpu ras reset for umc_v12_0. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 +++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 35 ++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 +++++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 29 ++++++++++++++++++ 5 files changed, 95 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4e4ba2149595..c20c9d6df149 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1026,7 +1026,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, return -EINVAL; if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { - amdgpu_ras_get_ecc_info(adev, &err_data); + if (info->err_data) { + struct ras_err_data *ras_err = (struct ras_err_data *)info->err_data; + + amdgpu_ras_get_ecc_info(adev, ras_err); + err_data.ce_count = ras_err->ce_count; + err_data.ue_count = ras_err->ue_count; + } else { + amdgpu_ras_get_ecc_info(adev, &err_data); + } } else { block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); if (!block_obj || !block_obj->hw_ops) { @@ -1889,6 +1897,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) list_for_each_entry(obj, &con->head, node) { struct ras_query_if info = { .head = obj->head, + .err_data = NULL, }; /* @@ -1906,10 +1915,13 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) * info table failed temporarily. * should be removed until smu fix handle ecc_info table. 
*/ - if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) && - (amdgpu_ip_version(adev, MP1_HWIP, 0) == - IP_VERSION(13, 0, 2))) - continue; + if (info.head.block == AMDGPU_RAS_BLOCK__UMC) { + if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2)) + continue; + + if (adev->umc.err_data.err_addr) + info.err_data = &adev->umc.err_data; + } amdgpu_ras_query_error_status(adev, &info); @@ -2020,6 +2032,18 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, return ret; } +static void amdgpu_ras_data_save(struct amdgpu_device *adev) +{ + if (adev->umc.ras && adev->umc.ras->ecc_data_save) + adev->umc.ras->ecc_data_save(adev); +} + +static void amdgpu_ras_data_restore(struct amdgpu_device *adev) +{ + if (adev->umc.ras && adev->umc.ras->ecc_data_restore) + adev->umc.ras->ecc_data_restore(adev); +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2042,6 +2066,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { + amdgpu_ras_data_save(remote_adev); amdgpu_ras_query_err_status(remote_adev); amdgpu_ras_log_on_err_counter(remote_adev); } @@ -2080,6 +2105,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); } + + if (device_list_handle) + list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) + amdgpu_ras_data_restore(remote_adev); + atomic_set(&ras->in_recovery, 0); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7999d202c9bc..9ee53910a2c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -513,6 +513,7 @@ struct ras_query_if { struct ras_common_if head; unsigned long ue_count; unsigned long ce_count; + void *err_data; }; struct ras_inject_if { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 24fcc9a2e422..7542606e10fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -76,6 +76,27 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, return ret; } +void amdgpu_ras_handle_bad_pages(struct amdgpu_device *adev, + struct ras_err_data *err_data) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!err_data || !err_data->err_addr || !err_data->err_addr_cnt) + return; + + amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + err_data->err_addr_cnt); + + amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); + + amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); + + if (con->update_channel_flag == true) { + amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); + con->update_channel_flag = false; + } +} + static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, void *ras_error_status, struct amdgpu_iv_entry *entry, @@ -144,18 +165,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, err_data->ue_count); if ((amdgpu_bad_page_threshold != 0) && - err_data->err_addr_cnt) { - amdgpu_ras_add_bad_pages(adev, err_data->err_addr, - err_data->err_addr_cnt); - amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count)); - - amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); - - if (con->update_channel_flag == true) { - amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); - con->update_channel_flag = false; - } - } + err_data->err_addr_cnt) + amdgpu_ras_handle_bad_pages(adev, err_data); if (reset) amdgpu_ras_reset_gpu(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 417a6726c71b..447d8785008c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -66,6 +66,9 @@ struct amdgpu_umc_ras { void *ras_error_status); /* support different eeprom table version for different asic */ void 
(*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr); + + void (*ecc_data_save)(struct amdgpu_device *adev); + void (*ecc_data_restore)(struct amdgpu_device *adev); }; struct amdgpu_umc_funcs { @@ -93,6 +96,7 @@ struct amdgpu_umc { const struct amdgpu_umc_funcs *funcs; struct amdgpu_umc_ras *ras; + struct ras_err_data err_data; /* active mask for umc node instance */ unsigned long active_mask; @@ -118,4 +122,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, int amdgpu_umc_loop_channels(struct amdgpu_device *adev, umc_func func, void *data); + +void amdgpu_ras_handle_bad_pages(struct amdgpu_device *adev, + struct ras_err_data *err_data); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index c6742dd863d4..1fb78561f0fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -351,6 +351,33 @@ static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev) return true; } +static void umc_v12_0_ecc_data_save(struct amdgpu_device *adev) +{ + adev->umc.err_data.err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + + adev->umc.err_data.ce_count = 0; + adev->umc.err_data.ue_count = 0; + adev->umc.err_data.err_addr_cnt = 0; +} + +static void umc_v12_0_ecc_data_restore(struct amdgpu_device *adev) +{ + if (adev->umc.err_data.ue_count && + adev->umc.err_data.err_addr_cnt && + adev->umc.err_data.err_addr) { + amdgpu_ras_handle_bad_pages(adev, &adev->umc.err_data); + } + + kfree(adev->umc.err_data.err_addr); + + adev->umc.err_data.err_addr = NULL; + adev->umc.err_data.ce_count = 0; + adev->umc.err_data.ue_count = 0; + adev->umc.err_data.err_addr_cnt = 0; +} + const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = { .query_ras_error_count = umc_v12_0_query_ras_error_count, .query_ras_error_address = umc_v12_0_query_ras_error_address, @@ -362,4 +389,6 @@ struct amdgpu_umc_ras 
umc_v12_0_ras = { }, .err_cnt_init = umc_v12_0_err_cnt_init, .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, + .ecc_data_save = umc_v12_0_ecc_data_save, + .ecc_data_restore = umc_v12_0_ecc_data_restore, }; -- 2.34.1