> -----Original Message----- > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > Sent: Wednesday, May 18, 2022 11:44 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV > > support umc/gfx/sdma ras on guest side > > Changed from V1: > move sriov judgment in amdgpu_ras_interrupt_fatal_error_handler > > Change-Id: Ic7dda45d8f8cf2d5f1abc7705abc153d558da8a1 > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 42 ++++++++++++++++------ > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 4 +++ > drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 +++-- > 4 files changed, 45 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index b583026dc893..ba7990d0dc0e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5218,6 +5218,10 @@ int amdgpu_device_gpu_recover_imp(struct > amdgpu_device *adev, > r = amdgpu_device_reset_sriov(adev, job ? false : true); > if (r) > adev->asic_reset_res = r; > + > + /* Aldebaran supports ras in SRIOV, so need resume ras during > reset */ > + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) > + amdgpu_ras_resume(adev); > } else { > r = amdgpu_do_asic_reset(device_list_handle, &reset_context); > if (r && r == -EAGAIN) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index a653cf3b3d13..2b28210c4994 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device > *adev, > /* Do not enable if it is not allowed. 
*/ > WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); > > - if (!amdgpu_ras_intr_triggered()) { > + /* Only enable ras feature operation handle on host side */ > + if (!amdgpu_sriov_vf(adev) && > + !amdgpu_ras_intr_triggered()) { > ret = psp_ras_enable_features(&adev->psp, info, enable); > if (ret) { > dev_err(adev->dev, "ras %s %s failed poison:%d > ret:%d\n", @@ -1523,6 +1525,10 @@ static int amdgpu_ras_fs_fini(struct > amdgpu_device *adev) > */ > void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) { > + /* Fatal error events are handled on host side */ > + if (amdgpu_sriov_vf(adev)) > + return; > + > if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) > return; [Tao] The two conditions above can be merged into a single check; other than that, the patch is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > > @@ -2270,10 +2276,14 @@ static void amdgpu_ras_check_supported(struct > amdgpu_device *adev) { > adev->ras_hw_enabled = adev->ras_enabled = 0; > > - if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || > + if (!adev->is_atom_fw || > !amdgpu_ras_asic_supported(adev)) > return; > > + if (!(amdgpu_sriov_vf(adev) && > + (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))) > + return; > + > if (!adev->gmc.xgmi.connected_to_cpu) { > if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > dev_info(adev->dev, "MEM ECC is active.\n"); @@ - > 2285,15 +2295,21 @@ static void amdgpu_ras_check_supported(struct > amdgpu_device *adev) > > if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > dev_info(adev->dev, "SRAM ECC is active.\n"); > - adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > - 1 << > AMDGPU_RAS_BLOCK__DF); > - > - if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, > 6, 0)) > - adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > - 1 << > AMDGPU_RAS_BLOCK__JPEG); > - else > - adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > - 1 << > AMDGPU_RAS_BLOCK__JPEG); > + if (!amdgpu_sriov_vf(adev)) { > + adev->ras_hw_enabled 
|= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > + 1 << > AMDGPU_RAS_BLOCK__DF); > + > + if (adev->ip_versions[VCN_HWIP][0] == > IP_VERSION(2, 6, 0)) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + else > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + } else { > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > + 1 << > AMDGPU_RAS_BLOCK__SDMA | > + 1 << > AMDGPU_RAS_BLOCK__GFX); > + } > } else { > dev_info(adev->dev, "SRAM ECC is not presented.\n"); > } > @@ -2637,6 +2653,10 @@ int amdgpu_ras_late_init(struct amdgpu_device > *adev) > struct amdgpu_ras_block_object *obj; > int r; > > + /* Guest side doesn't need init ras feature */ > + if (amdgpu_sriov_vf(adev)) > + return 0; > + > list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { > if (!node->ras_obj) { > dev_warn(adev->dev, "Warning: abnormal ras list > node.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > index 8e221a1ba937..42c1f050542f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > @@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct > amdgpu_device *adev, > struct amdgpu_iv_entry *entry) > { > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > + > + if (amdgpu_sriov_vf(adev)) > + return AMDGPU_RAS_SUCCESS; > + > amdgpu_ras_reset_gpu(adev); > > return AMDGPU_RAS_SUCCESS; > diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > index d6d79e97def9..18014ed0e853 100644 > --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > @@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context > *psp) > err = psp_init_sos_microcode(psp, chip_name); > if (err) > return err; > - err = psp_init_ta_microcode(&adev->psp, chip_name); > - if (err) > - return err; > + /* It's not necessary to load ras 
ta on Guest side */ > + if (!amdgpu_sriov_vf(adev)) { > + err = psp_init_ta_microcode(&adev->psp, chip_name); > + if (err) > + return err; > + } > break; > case IP_VERSION(13, 0, 1): > case IP_VERSION(13, 0, 3): > -- > 2.17.1