RE: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Wed, 18 May 2022 11:18:54 +0000

[AMD Official Use Only - General]

> -----Original Message-----
> From: Stanley.Yang <Stanley.Yang@xxxxxxx>
> Sent: Wednesday, May 18, 2022 4:32 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking
> <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Cc: Yang, Stanley <Stanley.Yang@xxxxxxx>
> Subject: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV
> 
> support umc/gfx/sdma ras on guest side
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c    |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 23
> ++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   |  4 ++++
>  drivers/gpu/drm/amd/amdgpu/psp_v13_0.c     |  9 ++++++---
>  5 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 85412e1a04be..e832c5bceb63 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5372,6 +5372,10 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
>  		r = amdgpu_device_reset_sriov(adev, job ? false : true);
>  		if (r)
>  			adev->asic_reset_res = r;
> +
> +		/* Aldebaran support ras in SRIOV, so need resume ras
> during reset */

[Tao]: Typo in the comment: "support" -> "supports".

> +		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
> +			amdgpu_ras_resume(adev);
>  	} else {
>  		r = amdgpu_do_asic_reset(device_list_handle,
> &reset_context);
>  		if (r && r == -EAGAIN)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> index 31e07dfc874b..12a1f2389714 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
> @@ -202,6 +202,10 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg)
>  	if (ret == IRQ_HANDLED)
>  		pm_runtime_mark_last_busy(dev->dev);
> 
> +	/* Fatal error events are handled on host side */
> +	if (amdgpu_sriov_vf(adev))
> +		return ret;

[Tao]: Can we place this check in amdgpu_ras_interrupt_fatal_error_handler instead?

> +
>  	/* For the hardware that cannot enable bif ring for both
> ras_controller_irq
>           * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
>  	 * register to check whether the interrupt is triggered or not, and
> properly diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2b80a3037481..930fa3837ef9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -721,7 +721,9 @@ int amdgpu_ras_feature_enable(struct
> amdgpu_device *adev,
>  	/* Do not enable if it is not allowed. */
>  	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev,
> head));
> 
> -	if (!amdgpu_ras_intr_triggered()) {
> +	/* Enable ras feature operator handle on host side */

[Tao]: "Only enable ras ..." would be better wording for this comment.

> +	if (!amdgpu_sriov_vf(adev) &&
> +			!amdgpu_ras_intr_triggered()) {

[Tao]: It's better to replace the two tabs with four spaces.

>  		ret = psp_ras_enable_features(&adev->psp, info, enable);
>  		if (ret) {
>  			dev_err(adev->dev, "ras %s %s failed poison:%d
> ret:%d\n", @@ -2181,10 +2183,14 @@ static void
> amdgpu_ras_check_supported(struct amdgpu_device *adev)  {
>  	adev->ras_hw_enabled = adev->ras_enabled = 0;
> 
> -	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
> +	if (!adev->is_atom_fw ||
>  	    !amdgpu_ras_asic_supported(adev))
>  		return;
> 
> +	if (!(amdgpu_sriov_vf(adev) &&
> +			(adev->ip_versions[MP1_HWIP][0] ==
> IP_VERSION(13, 0, 2))))

[Tao]: Please replace the tabs with spaces.

> +		return;
> +
>  	if (!adev->gmc.xgmi.connected_to_cpu) {
>  		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
>  			dev_info(adev->dev, "MEM ECC is active.\n"); @@ -
> 2196,8 +2202,13 @@ static void amdgpu_ras_check_supported(struct
> amdgpu_device *adev)
> 
>  		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
>  			dev_info(adev->dev, "SRAM ECC is active.\n");
> -			adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> -						    1 <<
> AMDGPU_RAS_BLOCK__DF);
> +			if (!amdgpu_sriov_vf(adev))
> +				adev->ras_hw_enabled |= ~(1 <<
> AMDGPU_RAS_BLOCK__UMC |
> +							    1 <<
> AMDGPU_RAS_BLOCK__DF);
> +			else
> +				adev->ras_hw_enabled |= (1 <<
> AMDGPU_RAS_BLOCK__PCIE_BIF |
> +								1 <<
> AMDGPU_RAS_BLOCK__SDMA |
> +								1 <<
> AMDGPU_RAS_BLOCK__GFX);
>  		} else {
>  			dev_info(adev->dev, "SRAM ECC is not
> presented.\n");
>  		}
> @@ -2532,6 +2543,10 @@ int amdgpu_ras_late_init(struct amdgpu_device
> *adev)
>  	struct amdgpu_ras_block_object *obj;
>  	int r;
> 
> +	/* Guest side doesn't need init ras feature */
> +	if (amdgpu_sriov_vf(adev))
> +		return 0;
> +
>  	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
>  		if (!node->ras_obj) {
>  			dev_warn(adev->dev, "Warning: abnormal ras list
> node.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 3b5c43575aa3..72bfac9bf9d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -123,6 +123,10 @@ int amdgpu_sdma_process_ras_data_cb(struct
> amdgpu_device *adev,
>  		struct amdgpu_iv_entry *entry)
>  {
>  	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> +
> +	if (amdgpu_sriov_vf(adev))
> +		return AMDGPU_RAS_SUCCESS;
> +
>  	amdgpu_ras_reset_gpu(adev);
> 
>  	return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> index 2c6070b90dcf..165cdc2d7f0b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
> @@ -75,9 +75,12 @@ static int psp_v13_0_init_microcode(struct
> psp_context *psp)
>  		err = psp_init_sos_microcode(psp, chip_name);
>  		if (err)
>  			return err;
> -		err = psp_init_ta_microcode(&adev->psp, chip_name);
> -		if (err)
> -			return err;
> +		/* It's not necessary to load ras ta on Guest side */
> +		if (!amdgpu_sriov_vf(adev)) {
> +			err = psp_init_ta_microcode(&adev->psp,
> chip_name);
> +			if (err)
> +				return err;
> +		}
>  		break;
>  	case IP_VERSION(13, 0, 1):
>  	case IP_VERSION(13, 0, 3):
> --
> 2.17.1