When NBIO's RAS error happens, before trigging GPU reset, it's needed to record error counter information, which can correct the error counter value missed issue when reading from debugfs. Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 65eb378fa035..149d386590df 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -318,6 +318,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device { uint32_t bif_doorbell_intr_cntl; struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); + struct ras_err_data err_data = {0, 0, 0, NULL}; bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL); if (REG_GET_FIELD(bif_doorbell_intr_cntl, @@ -332,7 +333,19 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device * clear error status after ras_controller_intr according to * hw team and count ue number for query */ - nbio_v7_4_query_ras_error_count(adev, &obj->err_data); + nbio_v7_4_query_ras_error_count(adev, &err_data); + + /* logging on error counter and printing for awareness */ + obj->err_data.ue_count += err_data.ue_count; + obj->err_data.ce_count += err_data.ce_count; + + if (err_data.ce_count) + DRM_INFO("%ld correctable errors detected in %s block\n", + obj->err_data.ce_count, adev->nbio.ras_if->name); + + if (err_data.ue_count) + DRM_INFO("%ld uncorrectable errors detected in %s block\n", + obj->err_data.ue_count, adev->nbio.ras_if->name); DRM_WARN("RAS controller interrupt triggered by NBIF error\n"); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx