[AMD Official Use Only - General] Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Wednesday, January 4, 2023 12:25 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Yang, > Stanley <Stanley.Yang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Chai, > Thomas <YiPeng.Chai@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: allow query error counters for specific IP block > > amdgpu_ras_block_late_init will be invoked in IP specific ras_late_init call as a > common helper for all the IP blocks. > > However, when amdgpu_ras_block_late_init call > amdgpu_ras_query_error_count to query ras error counters, > amdgpu_ras_query_error_count queries all the IP blocks that support ras query > interface. > > This results to wrong error counters cached in software copies when there are > ras errors detected at time zero or warm reset procedure. i.e., in > sdma_ras_late_init phase, it counts on sdma/mmhub errors, while, in > mmhub_ras_late_init phase, it still counts on sdma/mmhub errors. > > The change updates amdgpu_ras_query_error_count interface to allow query > specific ip error counter. > It introduces a new input parameter: query_info. if query_info is NULL, it means > query all the IP blocks, otherwise, only query the ip block specified by query_info. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 +++++++++++++++++++------ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +- > 2 files changed, 71 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 35b9f2ed2838..7fed63dc09bf 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1130,11 +1130,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device > *adev, } > > /** > - * amdgpu_ras_query_error_count -- Get error counts of all IPs > + * amdgpu_ras_query_error_count_helper -- Get error counter for > +specific IP > + * @adev: pointer to AMD GPU device > + * @ce_count: pointer to an integer to be set to the count of correctible errors. > + * @ue_count: pointer to an integer to be set to the count of uncorrectible > errors. > + * @query_info: pointer to ras_query_if > + * > + * Return 0 for query success or do nothing, otherwise return an error > + * on failures > + */ > +static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, > + unsigned long *ce_count, > + unsigned long *ue_count, > + struct ras_query_if *query_info) { > + int ret; > + > + if (!query_info) > + /* do nothing if query_info is not specified */ > + return 0; > + > + ret = amdgpu_ras_query_error_status(adev, query_info); > + if (ret) > + return ret; > + > + *ce_count += query_info->ce_count; > + *ue_count += query_info->ue_count; > + > + /* some hardware/IP supports read to clear > + * no need to explictly reset the err status after the query call */ > + if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && > + adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { > + if (amdgpu_ras_reset_error_status(adev, query_info- > >head.block)) > + dev_warn(adev->dev, > + "Failed to reset error counter and error > status\n"); > + } > + > + return 0; > +} > + > +/** > + * amdgpu_ras_query_error_count -- Get error counts of all IPs or > +specific IP > * @adev: pointer to AMD GPU device > * @ce_count: pointer to an integer to be set to the count of correctible errors. > * @ue_count: pointer to an integer to be set to the count of uncorrectible > * errors. > + * @query_info: pointer to ras_query_if if the query request is only > + for > + * specific ip block; if info is NULL, then the qurey request is for > + * all the ip blocks that support query ras error counters/status > * > * If set, @ce_count or @ue_count, count and return the corresponding > * error counts in those integer pointers. Return 0 if the device @@ -1142,11 > +1185,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, > */ > int amdgpu_ras_query_error_count(struct amdgpu_device *adev, > unsigned long *ce_count, > - unsigned long *ue_count) > + unsigned long *ue_count, > + struct ras_query_if *query_info) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_manager *obj; > unsigned long ce, ue; > + int ret; > > if (!adev->ras_enabled || !con) > return -EOPNOTSUPP; > @@ -1158,26 +1203,23 @@ int amdgpu_ras_query_error_count(struct > amdgpu_device *adev, > > ce = 0; > ue = 0; > - list_for_each_entry(obj, &con->head, node) { > - struct ras_query_if info = { > - .head = obj->head, > - }; > - int res; > - > - res = amdgpu_ras_query_error_status(adev, &info); > - if (res) > - return res; > + if (!query_info) { > + /* query all the ip blocks that support ras query interface */ > + list_for_each_entry(obj, &con->head, node) { > + struct ras_query_if info = { > + .head = obj->head, > + }; > > - if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) > && > - adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { > - if (amdgpu_ras_reset_error_status(adev, > info.head.block)) > - dev_warn(adev->dev, "Failed to reset error > counter and error status"); > + ret = amdgpu_ras_query_error_count_helper(adev, &ce, > &ue, &info); > } > - > - ce += info.ce_count; > - ue += info.ue_count; > + } else { > + /* query specific ip block */ > + ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, > +query_info); > } > > + if (ret) > + return ret; > + > if (ce_count) > *ce_count = ce; > > @@ -2408,7 +2450,7 @@ static void amdgpu_ras_counte_dw(struct > work_struct *work) > > /* Cache new values. > */ > - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) > { > + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) > == > +0) { > atomic_set(&con->ras_ce_count, ce_count); > atomic_set(&con->ras_ue_count, ue_count); > } > @@ -2589,6 +2631,7 @@ int amdgpu_ras_block_late_init(struct > amdgpu_device *adev, { > struct amdgpu_ras_block_object *ras_obj = NULL; > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct ras_query_if *query_info; > unsigned long ue_count, ce_count; > int r; > > @@ -2630,11 +2673,17 @@ int amdgpu_ras_block_late_init(struct > amdgpu_device *adev, > > /* Those are the cached values at init. > */ > - if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) > { > + query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL); > + if (!query_info) > + return -ENOMEM; > + memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); > + > + if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, > +query_info) == 0) { > atomic_set(&con->ras_ce_count, ce_count); > atomic_set(&con->ras_ue_count, ue_count); > } > > + kfree(query_info); > return 0; > > interrupt: > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index bf5a95104ec1..f2ad999993f6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -540,7 +540,8 @@ void amdgpu_ras_suspend(struct amdgpu_device > *adev); > > int amdgpu_ras_query_error_count(struct amdgpu_device *adev, > unsigned long *ce_count, > - unsigned long *ue_count); > + unsigned long *ue_count, > + struct ras_query_if *query_info); > > /* error handling functions */ > int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > -- > 2.17.1