RE: [PATCH] drm/amdgpu: allow query error counters for specific IP block

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Wed, 4 Jan 2023 03:15:31 +0000



[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx>

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Sent: Wednesday, January 4, 2023 12:25 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Yang,
> Stanley <Stanley.Yang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Chai,
> Thomas <YiPeng.Chai@xxxxxxx>
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Subject: [PATCH] drm/amdgpu: allow query error counters for specific IP block
> 
> amdgpu_ras_block_late_init will be invoked in IP specific ras_late_init call as a
> common helper for all the IP blocks.
> 
> However, when amdgpu_ras_block_late_init calls
> amdgpu_ras_query_error_count to query ras error counters,
> amdgpu_ras_query_error_count queries all the IP blocks that support the ras
> query interface.
> 
> This results in wrong error counters being cached in the software copies when
> ras errors are detected at time zero or during the warm reset procedure. For
> example, in the sdma_ras_late_init phase it counts sdma/mmhub errors, while in
> the mmhub_ras_late_init phase it still counts sdma/mmhub errors.
> 
> The change updates the amdgpu_ras_query_error_count interface to allow
> querying the error counters of a specific IP block.
> It introduces a new input parameter: query_info. If query_info is NULL, all the
> IP blocks are queried; otherwise, only the IP block specified by query_info is
> queried.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 +++++++++++++++++++------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 +-
>  2 files changed, 71 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 35b9f2ed2838..7fed63dc09bf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1130,11 +1130,54 @@ int amdgpu_ras_error_inject(struct amdgpu_device
> *adev,  }
> 
>  /**
> - * amdgpu_ras_query_error_count -- Get error counts of all IPs
> + * amdgpu_ras_query_error_count_helper -- Get error counter for
> +specific IP
> + * @adev: pointer to AMD GPU device
> + * @ce_count: pointer to an integer to be set to the count of correctible errors.
> + * @ue_count: pointer to an integer to be set to the count of uncorrectible
> errors.
> + * @query_info: pointer to ras_query_if
> + *
> + * Return 0 for query success or do nothing, otherwise return an error
> + * on failures
> + */
> +static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,
> +					       unsigned long *ce_count,
> +					       unsigned long *ue_count,
> +					       struct ras_query_if *query_info) {
> +	int ret;
> +
> +	if (!query_info)
> +		/* do nothing if query_info is not specified */
> +		return 0;
> +
> +	ret = amdgpu_ras_query_error_status(adev, query_info);
> +	if (ret)
> +		return ret;
> +
> +	*ce_count += query_info->ce_count;
> +	*ue_count += query_info->ue_count;
> +
> +	/* some hardware/IP supports read to clear
> +	 * no need to explictly reset the err status after the query call */
> +	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
> +	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
> +		if (amdgpu_ras_reset_error_status(adev, query_info-
> >head.block))
> +			dev_warn(adev->dev,
> +				 "Failed to reset error counter and error
> status\n");
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * amdgpu_ras_query_error_count -- Get error counts of all IPs or
> +specific IP
>   * @adev: pointer to AMD GPU device
>   * @ce_count: pointer to an integer to be set to the count of correctible errors.
>   * @ue_count: pointer to an integer to be set to the count of uncorrectible
>   * errors.
> + * @query_info: pointer to ras_query_if if the query request is only
> + for
> + * specific ip block; if info is NULL, then the qurey request is for
> + * all the ip blocks that support query ras error counters/status
>   *
>   * If set, @ce_count or @ue_count, count and return the corresponding
>   * error counts in those integer pointers. Return 0 if the device @@ -1142,11
> +1185,13 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   */
>  int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>  				 unsigned long *ce_count,
> -				 unsigned long *ue_count)
> +				 unsigned long *ue_count,
> +				 struct ras_query_if *query_info)
>  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_manager *obj;
>  	unsigned long ce, ue;
> +	int ret;
> 
>  	if (!adev->ras_enabled || !con)
>  		return -EOPNOTSUPP;
> @@ -1158,26 +1203,23 @@ int amdgpu_ras_query_error_count(struct
> amdgpu_device *adev,
> 
>  	ce = 0;
>  	ue = 0;
> -	list_for_each_entry(obj, &con->head, node) {
> -		struct ras_query_if info = {
> -			.head = obj->head,
> -		};
> -		int res;
> -
> -		res = amdgpu_ras_query_error_status(adev, &info);
> -		if (res)
> -			return res;
> +	if (!query_info) {
> +		/* query all the ip blocks that support ras query interface */
> +		list_for_each_entry(obj, &con->head, node) {
> +			struct ras_query_if info = {
> +				.head = obj->head,
> +			};
> 
> -		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2)
> &&
> -		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
> -			if (amdgpu_ras_reset_error_status(adev,
> info.head.block))
> -				dev_warn(adev->dev, "Failed to reset error
> counter and error status");
> +			ret = amdgpu_ras_query_error_count_helper(adev, &ce,
> &ue, &info);
>  		}
> -
> -		ce += info.ce_count;
> -		ue += info.ue_count;
> +	} else {
> +		/* query specific ip block */
> +		ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue,
> +query_info);
>  	}
> 
> +	if (ret)
> +		return ret;
> +
>  	if (ce_count)
>  		*ce_count = ce;
> 
> @@ -2408,7 +2450,7 @@ static void amdgpu_ras_counte_dw(struct
> work_struct *work)
> 
>  	/* Cache new values.
>  	 */
> -	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0)
> {
> +	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL)
> ==
> +0) {
>  		atomic_set(&con->ras_ce_count, ce_count);
>  		atomic_set(&con->ras_ue_count, ue_count);
>  	}
> @@ -2589,6 +2631,7 @@ int amdgpu_ras_block_late_init(struct
> amdgpu_device *adev,  {
>  	struct amdgpu_ras_block_object *ras_obj = NULL;
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> +	struct ras_query_if *query_info;
>  	unsigned long ue_count, ce_count;
>  	int r;
> 
> @@ -2630,11 +2673,17 @@ int amdgpu_ras_block_late_init(struct
> amdgpu_device *adev,
> 
>  	/* Those are the cached values at init.
>  	 */
> -	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0)
> {
> +	query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL);
> +	if (!query_info)
> +		return -ENOMEM;
> +	memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if));
> +
> +	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count,
> +query_info) == 0) {
>  		atomic_set(&con->ras_ce_count, ce_count);
>  		atomic_set(&con->ras_ue_count, ue_count);
>  	}
> 
> +	kfree(query_info);
>  	return 0;
> 
>  interrupt:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index bf5a95104ec1..f2ad999993f6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -540,7 +540,8 @@ void amdgpu_ras_suspend(struct amdgpu_device
> *adev);
> 
>  int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
>  				 unsigned long *ce_count,
> -				 unsigned long *ue_count);
> +				 unsigned long *ue_count,
> +				 struct ras_query_if *query_info);
> 
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> --
> 2.17.1