On 2021-07-06 11:25 a.m., Alex Deucher wrote: > On Fri, Jul 2, 2021 at 7:05 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote: >> In amdgpu_ras_query_error_count() return an error >> if the device doesn't support RAS. This prevents >> that function from having to always set the values >> of the integer pointers (if set), and thus >> prevents function side effects--always to have to >> set values of integers if integer pointers set, >> regardless of whether RAS is supported or >> not--with this change this side effect is >> mitigated. >> >> Also, if no pointers are set, don't count, since >> we've no way of reporting the counts. >> >> Also, give this function a kernel-doc. >> >> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> >> Cc: John Clements <john.clements@xxxxxxx> >> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> >> Reported-by: Tom Rix <trix@xxxxxxxxxx> >> Fixes: a46751fbcde505 ("drm/amdgpu: Fix RAS function interface") >> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++++++++++++------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +-- >> 2 files changed, 38 insertions(+), 17 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> index c6ae63893dbdb2..ed698b2be79023 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c >> @@ -813,7 +813,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, >> >> /* query/inject/cure begin */ >> int amdgpu_ras_query_error_status(struct amdgpu_device *adev, >> - struct ras_query_if *info) >> + struct ras_query_if *info) >> { >> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); >> struct ras_err_data err_data = {0, 0, 0, NULL}; >> @@ -1047,17 +1047,32 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, >> return ret; >> } >> >> -/* get the total error counts on all IPs */ >> -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> - unsigned long *ce_count, >> - unsigned long *ue_count) >> +/** >> + * amdgpu_ras_query_error_count -- Get error counts of all IPs >> + * adev: pointer to AMD GPU device >> + * ce_count: pointer to an integer to be set to the count of correctible errors. >> + * ue_count: pointer to an integer to be set to the count of uncorrectible >> + * errors. >> + * >> + * If set, @ce_count or @ue_count, count and return the corresponding >> + * error counts in those integer pointers. Return 0 if the device >> + * supports RAS. Return -EINVAL if the device doesn't support RAS. >> + */ >> +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> + unsigned long *ce_count, >> + unsigned long *ue_count) >> { >> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); >> struct ras_manager *obj; >> unsigned long ce, ue; >> >> if (!adev->ras_enabled || !con) >> - return; >> + return -EINVAL; > Maybe return -ENOTSUPP or -ENODEV here or something like that since > the device doesn't support RAS in that case? Other than that, looks > good to me. > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> Okay, great. I thought about -ENODEV but didn't want to confuse the IOCTL/userspace that the device isn't there, so I'll then return -EOPNOTSUPP, add your R-B and push it. Regards, Luben > > Alex > > >> + >> + /* Don't count since no reporting. >> + */ >> + if (!ce_count && !ue_count) >> + return 0; >> >> ce = 0; >> ue = 0; >> @@ -1065,9 +1080,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> struct ras_query_if info = { >> .head = obj->head, >> }; >> + int res; >> >> - if (amdgpu_ras_query_error_status(adev, &info)) >> - return; >> + res = amdgpu_ras_query_error_status(adev, &info); >> + if (res) >> + return res; >> >> ce += info.ce_count; >> ue += info.ue_count; >> @@ -1078,6 +1095,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> >> if (ue_count) >> *ue_count = ue; >> + >> + return 0; >> } >> /* query/inject/cure end */ >> >> @@ -2145,9 +2164,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) >> >> /* Cache new values. >> */ >> - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); >> - atomic_set(&con->ras_ce_count, ce_count); >> - atomic_set(&con->ras_ue_count, ue_count); >> + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { >> + atomic_set(&con->ras_ce_count, ce_count); >> + atomic_set(&con->ras_ue_count, ue_count); >> + } >> >> pm_runtime_mark_last_busy(dev->dev); >> Out: >> @@ -2320,9 +2340,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, >> >> /* Those are the cached values at init. >> */ >> - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); >> - atomic_set(&con->ras_ce_count, ce_count); >> - atomic_set(&con->ras_ue_count, ue_count); >> + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { >> + atomic_set(&con->ras_ce_count, ce_count); >> + atomic_set(&con->ras_ue_count, ue_count); >> + } >> >> return 0; >> cleanup: >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> index 283afd791db107..4d9c63f2f37718 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h >> @@ -491,9 +491,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, >> void amdgpu_ras_resume(struct amdgpu_device *adev); >> void amdgpu_ras_suspend(struct amdgpu_device *adev); >> >> -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> - unsigned long *ce_count, >> - unsigned long *ue_count); >> +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, >> + unsigned long *ce_count, >> + unsigned long *ue_count); >> >> /* error handling functions */ >> int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, >> -- >> 2.32.0 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7Cluben.tuikov%40amd.com%7C9ad19b88bf0d4f471fed08d940926367%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637611819760889211%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=ldf4psDK2bA9Bvln2yLQ0ycmrJk2KP82v5qfwhjovNo%3D&reserved=0