On Fri, Jul 2, 2021 at 7:05 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote: > > In amdgpu_ras_query_error_count() return an error > if the device doesn't support RAS. This prevents > that function from having to always set the values > of the integer pointers (if set), and thus > prevents function side effects--always to have to > set values of integers if integer pointers set, > regardless of whether RAS is supported or > not--with this change this side effect is > mitigated. > > Also, if no pointers are set, don't count, since > we've no way of reporting the counts. > > Also, give this function a kernel-doc. > > Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> > Cc: John Clements <john.clements@xxxxxxx> > Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Reported-by: Tom Rix <trix@xxxxxxxxxx> > Fixes: a46751fbcde505 ("drm/amdgpu: Fix RAS function interface") > Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++++++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 +-- > 2 files changed, 38 insertions(+), 17 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index c6ae63893dbdb2..ed698b2be79023 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -813,7 +813,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, > > /* query/inject/cure begin */ > int amdgpu_ras_query_error_status(struct amdgpu_device *adev, > - struct ras_query_if *info) > + struct ras_query_if *info) > { > struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); > struct ras_err_data err_data = {0, 0, 0, NULL}; > @@ -1047,17 +1047,32 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, > return ret; > } > > -/* get the total error counts on all IPs */ > -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, > - unsigned long *ce_count, > - unsigned long *ue_count) > +/** > + * amdgpu_ras_query_error_count -- Get error counts of all IPs > + * adev: pointer to AMD GPU device > + * ce_count: pointer to an integer to be set to the count of correctible errors. > + * ue_count: pointer to an integer to be set to the count of uncorrectible > + * errors. > + * > + * If set, @ce_count or @ue_count, count and return the corresponding > + * error counts in those integer pointers. Return 0 if the device > + * supports RAS. Return -EINVAL if the device doesn't support RAS. > + */ > +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, > + unsigned long *ce_count, > + unsigned long *ue_count) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_manager *obj; > unsigned long ce, ue; > > if (!adev->ras_enabled || !con) > - return; > + return -EINVAL; Maybe return -ENOTSUPP or -ENODEV here or something like that since the device doesn't support RAS in that case? Other than that, looks good to me. Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> Alex > + > + /* Don't count since no reporting. > + */ > + if (!ce_count && !ue_count) > + return 0; > > ce = 0; > ue = 0; > @@ -1065,9 +1080,11 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, > struct ras_query_if info = { > .head = obj->head, > }; > + int res; > > - if (amdgpu_ras_query_error_status(adev, &info)) > - return; > + res = amdgpu_ras_query_error_status(adev, &info); > + if (res) > + return res; > > ce += info.ce_count; > ue += info.ue_count; > @@ -1078,6 +1095,8 @@ void amdgpu_ras_query_error_count(struct amdgpu_device *adev, > > if (ue_count) > *ue_count = ue; > + > + return 0; > } > /* query/inject/cure end */ > > @@ -2145,9 +2164,10 @@ static void amdgpu_ras_counte_dw(struct work_struct *work) > > /* Cache new values. > */ > - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); > - atomic_set(&con->ras_ce_count, ce_count); > - atomic_set(&con->ras_ue_count, ue_count); > + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { > + atomic_set(&con->ras_ce_count, ce_count); > + atomic_set(&con->ras_ue_count, ue_count); > + } > > pm_runtime_mark_last_busy(dev->dev); > Out: > @@ -2320,9 +2340,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, > > /* Those are the cached values at init. > */ > - amdgpu_ras_query_error_count(adev, &ce_count, &ue_count); > - atomic_set(&con->ras_ce_count, ce_count); > - atomic_set(&con->ras_ue_count, ue_count); > + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) { > + atomic_set(&con->ras_ce_count, ce_count); > + atomic_set(&con->ras_ue_count, ue_count); > + } > > return 0; > cleanup: > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 283afd791db107..4d9c63f2f37718 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -491,9 +491,9 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, > void amdgpu_ras_resume(struct amdgpu_device *adev); > void amdgpu_ras_suspend(struct amdgpu_device *adev); > > -void amdgpu_ras_query_error_count(struct amdgpu_device *adev, > - unsigned long *ce_count, > - unsigned long *ue_count); > +int amdgpu_ras_query_error_count(struct amdgpu_device *adev, > + unsigned long *ce_count, > + unsigned long *ue_count); > > /* error handling functions */ > int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > -- > 2.32.0 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx