Once ras recovery is issued by ras sync flood interrupt or ras controller interrupt, add this guard to bypass or execute ras error count register harvest of all IPs. Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 48 +++++++++++++------------ 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 89cb0ae9da9d..e6978b8e2143 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1547,17 +1547,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); - /* Build list of devices to query RAS related errors */ - if (hive && adev->gmc.xgmi.num_physical_nodes > 1) - device_list_handle = &hive->device_list; - else { - INIT_LIST_HEAD(&device_list); - list_add_tail(&adev->gmc.xgmi.head, &device_list); - device_list_handle = &device_list; - } + if (!ras->disable_ras_err_cnt_harvest) { + /* Build list of devices to query RAS related errors */ + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { + device_list_handle = &hive->device_list; + } else { + INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + device_list_handle = &device_list; + } - list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { - amdgpu_ras_log_on_err_counter(remote_adev); + list_for_each_entry(remote_adev, + device_list_handle, gmc.xgmi.head) + amdgpu_ras_log_on_err_counter(remote_adev); } if (amdgpu_device_should_recover_gpu(ras->adev)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 70a6fca73617..6b8d7bb83bb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -343,6 +343,9 @@ struct amdgpu_ras { /* bad page count threshold */ uint32_t bad_page_cnt_threshold; + + /* disable ras error count harvest in recovery */ + bool disable_ras_err_cnt_harvest; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index e629156173d3..eadc9526d33f 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -302,6 +302,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device uint32_t bif_doorbell_intr_cntl; struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); struct ras_err_data err_data = {0, 0, 0, NULL}; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL); if (REG_GET_FIELD(bif_doorbell_intr_cntl, @@ -312,28 +313,31 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device RAS_CNTLR_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); - /* - * clear error status after ras_controller_intr according to - * hw team and count ue number for query - */ - nbio_v7_4_query_ras_error_count(adev, &err_data); - - /* logging on error counter and printing for awareness */ - obj->err_data.ue_count += err_data.ue_count; - obj->err_data.ce_count += err_data.ce_count; - - if (err_data.ce_count) - dev_info(adev->dev, "%ld correctable hardware " - "errors detected in %s block, " - "no user action is needed.\n", - obj->err_data.ce_count, - adev->nbio.ras_if->name); - - if (err_data.ue_count) - dev_info(adev->dev, "%ld uncorrectable hardware " - "errors detected in %s block\n", - obj->err_data.ue_count, - adev->nbio.ras_if->name); + if (!ras->disable_ras_err_cnt_harvest) { + /* + * clear error status after ras_controller_intr + * according to hw team and count ue number + * for query + */ + nbio_v7_4_query_ras_error_count(adev, &err_data); + + /* logging on error cnt and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + dev_info(adev->dev, "%ld correctable hardware " + "errors detected in %s block, " + "no user action is needed.\n", + obj->err_data.ce_count, + adev->nbio.ras_if->name); + + if (err_data.ue_count) + dev_info(adev->dev, "%ld uncorrectable hardware " + "errors detected in %s block\n", + obj->err_data.ue_count, + adev->nbio.ras_if->name); + } dev_info(adev->dev, "RAS controller interrupt triggered " "by NBIF error\n"); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx