RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Mon, 2 Sep 2019 03:14:12 +0000

> -----Original Message-----
> From: Chen, Guchun <Guchun.Chen@xxxxxxx>
> Sent: 2019年9月2日 10:13
> To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx;
> Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Li, Dennis
> <Dennis.Li@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> 
> 
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao
> Zhou
> Sent: Friday, August 30, 2019 8:25 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Grodzovsky, Andrey
> <Andrey.Grodzovsky@xxxxxxx>; Chen, Guchun <Guchun.Chen@xxxxxxx>;
> Li, Dennis <Dennis.Li@xxxxxxx>; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>
> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table
> record structure
> 
> change bps type from retired page to eeprom table record, prepare for
> saving error records to eeprom
> 
> Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
>  2 files changed, 43 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2ca3997d4b3a..24663ec41248 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct
> amdgpu_device *adev,
> 
>  	for (; i < data->count; i++) {
>  		(*bps)[i] = (struct ras_badpage){
> -			.bp = data->bps[i].bp,
> +			.bp = data->bps[i].retired_page,
>  			.size = AMDGPU_GPU_PAGE_SIZE,
>  			.flags = 0,
>  		};
> 
>  		if (data->last_reserved <= i)
>  			(*bps)[i].flags = 1;
> -		else if (data->bps[i].bo == NULL)
> +		else if (data->bps_bo[i] == NULL)
>  			(*bps)[i].flags = 2;
>  	}
> 
> @@ -1288,30 +1288,40 @@ static int
> amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {
>  	unsigned int old_space = data->count + data->space_left;
>  	unsigned int new_space = old_space + pages;
> -	unsigned int align_space = ALIGN(new_space, 1024);
> -	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> -
> -	if (!tmp)
> +	unsigned int align_space = ALIGN(new_space, 512);
> [Guchun] Any special reason to change the alignment from 1024 to 512?

[Tao] Each element of the old "data->bps" array is 16 bytes, while the new "struct eeprom_table_record" is 31 bytes on a 64-bit system, so I'd like to lower the pressure on the memory system. The value can be adjusted according to feedback in the future.

> 
> +	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
> +	struct amdgpu_bo **bps_bo =
> +			kmalloc(align_space * sizeof(*data->bps_bo),
> GFP_KERNEL);
> +
> +	if (!bps || !bps_bo) {
> +		kfree(bps);
> +		kfree(bps_bo);
>  		return -ENOMEM;
> +	}
> 
>  	if (data->bps) {
> -		memcpy(tmp, data->bps,
> +		memcpy(bps, data->bps,
>  				data->count * sizeof(*data->bps));
>  		kfree(data->bps);
>  	}
> +	if (data->bps_bo) {
> +		memcpy(bps_bo, data->bps_bo,
> +				data->count * sizeof(*data->bps_bo));
> +		kfree(data->bps_bo);
> +	}
> 
> -	data->bps = tmp;
> +	data->bps = bps;
> +	data->bps_bo = bps_bo;
>  	data->space_left += align_space - old_space;
>  	return 0;
>  }
> 
>  /* it deal with vram only. */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -		unsigned long *bps, int pages)
> +		struct eeprom_table_record *bps, int pages)
>  {
>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>  	struct ras_err_handler_data *data;
> -	int i = pages;
>  	int ret = 0;
> 
>  	if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10
> +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>  			goto out;
>  		}
> 
> -	while (i--)
> -		data->bps[data->count++].bp = bps[i];
> -
> +	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
> +	data->count += pages;
>  	data->space_left -= pages;
> +
>  out:
>  	mutex_unlock(&con->recovery_lock);
> 
> @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct
> amdgpu_device *adev)
>  		goto out;
>  	/* reserve vram at driver post stage. */
>  	for (i = data->last_reserved; i < data->count; i++) {
> -		bp = data->bps[i].bp;
> +		bp = data->bps[i].retired_page;
> 
>  		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
>  					PAGE_SIZE, &bo))
>  			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n",
> bp);
> 
> -		data->bps[i].bo = bo;
> +		data->bps_bo[i] = bo;
>  		data->last_reserved = i + 1;
>  	}
>  out:
> @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct
> amdgpu_device *adev)
>  		goto out;
> 
>  	for (i = data->last_reserved - 1; i >= 0; i--) {
> -		bo = data->bps[i].bo;
> +		bo = data->bps_bo[i];
> 
>  		amdgpu_ras_release_vram(adev, &bo);
> 
> -		data->bps[i].bo = bo;
> +		data->bps_bo[i] = bo;
>  		data->last_reserved = i;
>  	}
>  out:
> @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct
> amdgpu_device *adev)
>  	return 0;
>  }
> 
> +/*
> + * read error record array in eeprom and reserve enough space for
> + * storing new bad pages
> + */
>  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {
> -	/* TODO
> -	 * read the array to eeprom when SMU disabled.
> -	 */
> -	return 0;
> +	struct eeprom_table_record *bps = NULL;
> +	int ret;
> +
> +	ret = amdgpu_ras_add_bad_pages(adev, bps,
> +				adev->umc.max_ras_err_cnt_per_query);
> +
> +	return ret;
>  }
> 
>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 66b71525446e..b6bac873c588 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -351,11 +351,10 @@ struct ras_err_data {  };
> 
>  struct ras_err_handler_data {
> -	/* point to bad pages array */
> -	struct {
> -		unsigned long bp;
> -		struct amdgpu_bo *bo;
> -	} *bps;
> +	/* point to bad page records array */
> +	struct eeprom_table_record *bps;
> +	/* point to reserved bo array */
> +	struct amdgpu_bo **bps_bo;
>  	/* the count of entries */
>  	int count;
>  	/* the space can place new entries */
> @@ -492,7 +491,7 @@ unsigned long
> amdgpu_ras_query_error_count(struct amdgpu_device *adev,
> 
>  /* error handling functions */
>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
> -		unsigned long *bps, int pages);
> +		struct eeprom_table_record *bps, int pages);
> 
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
> 
> --
> 2.17.1
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx