[AMD Official Use Only - General] > -----Original Message----- > From: Stanley.Yang <Stanley.Yang@xxxxxxx> > Sent: Wednesday, May 18, 2022 4:32 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: [PATCH Review 1/1] drm/amdgpu: support ras on SRIOV > > support umc/gfx/sdma ras on guest side > > Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 > ++++++++++++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 ++++++--- > 5 files changed, 37 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 85412e1a04be..e832c5bceb63 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5372,6 +5372,10 @@ int amdgpu_device_gpu_recover(struct > amdgpu_device *adev, > r = amdgpu_device_reset_sriov(adev, job ? 
false : true); > if (r) > adev->asic_reset_res = r; > + > + /* Aldebaran support ras in SRIOV, so need resume ras > during reset */ [Tao]: support -> supports > + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) > + amdgpu_ras_resume(adev); > } else { > r = amdgpu_do_asic_reset(device_list_handle, > &reset_context); > if (r && r == -EAGAIN) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > index 31e07dfc874b..12a1f2389714 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > @@ -202,6 +202,10 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg) > if (ret == IRQ_HANDLED) > pm_runtime_mark_last_busy(dev->dev); > > + /* Fatal error events are handled on host side */ > + if (amdgpu_sriov_vf(adev)) > + return ret; [Tao]: Can we place this check in amdgpu_ras_interrupt_fatal_error_handler instead? > + > /* For the hardware that cannot enable bif ring for both > ras_controller_irq > * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status > * register to check whether the interrupt is triggered or not, and > properly diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2b80a3037481..930fa3837ef9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -721,7 +721,9 @@ int amdgpu_ras_feature_enable(struct > amdgpu_device *adev, > /* Do not enable if it is not allowed. */ > WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, > head)); > > - if (!amdgpu_ras_intr_triggered()) { > + /* Enable ras feature operator handle on host side */ [Tao]: "Only enable ras ..." would be better. > + if (!amdgpu_sriov_vf(adev) && > + !amdgpu_ras_intr_triggered()) { [Tao]: It's better to replace the two tabs with four spaces.
> ret = psp_ras_enable_features(&adev->psp, info, enable); > if (ret) { > dev_err(adev->dev, "ras %s %s failed poison:%d > ret:%d\n", @@ -2181,10 +2183,14 @@ static void > amdgpu_ras_check_supported(struct amdgpu_device *adev) { > adev->ras_hw_enabled = adev->ras_enabled = 0; > > - if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || > + if (!adev->is_atom_fw || > !amdgpu_ras_asic_supported(adev)) > return; > > + if (!(amdgpu_sriov_vf(adev) && > + (adev->ip_versions[MP1_HWIP][0] == > IP_VERSION(13, 0, 2)))) [Tao]: Replace the tabs with spaces. > + return; > + > if (!adev->gmc.xgmi.connected_to_cpu) { > if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > dev_info(adev->dev, "MEM ECC is active.\n"); @@ -2196,8 +2202,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) > > if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > dev_info(adev->dev, "SRAM ECC is active.\n"); > - adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > - 1 << > AMDGPU_RAS_BLOCK__DF); > + if (!amdgpu_sriov_vf(adev)) > + adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > + 1 << > AMDGPU_RAS_BLOCK__DF); > + else > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > + 1 << > AMDGPU_RAS_BLOCK__SDMA | > + 1 << > AMDGPU_RAS_BLOCK__GFX); > } else { > dev_info(adev->dev, "SRAM ECC is not > presented.\n"); > } > @@ -2532,6 +2543,10 @@ int amdgpu_ras_late_init(struct amdgpu_device > *adev) > struct amdgpu_ras_block_object *obj; > int r; > > + /* Guest side doesn't need init ras feature */ > + if (amdgpu_sriov_vf(adev)) > + return 0; > + > list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { > if (!node->ras_obj) { > dev_warn(adev->dev, "Warning: abnormal ras list > node.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > index 3b5c43575aa3..72bfac9bf9d4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > @@ -123,6 +123,10 @@ int
amdgpu_sdma_process_ras_data_cb(struct > amdgpu_device *adev, > struct amdgpu_iv_entry *entry) > { > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); > + > + if (amdgpu_sriov_vf(adev)) > + return AMDGPU_RAS_SUCCESS; > + > amdgpu_ras_reset_gpu(adev); > > return AMDGPU_RAS_SUCCESS; > diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > index 2c6070b90dcf..165cdc2d7f0b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c > @@ -75,9 +75,12 @@ static int psp_v13_0_init_microcode(struct > psp_context *psp) > err = psp_init_sos_microcode(psp, chip_name); > if (err) > return err; > - err = psp_init_ta_microcode(&adev->psp, chip_name); > - if (err) > - return err; > + /* It's not necessary to load ras ta on Guest side */ > + if (!amdgpu_sriov_vf(adev)) { > + err = psp_init_ta_microcode(&adev->psp, > chip_name); > + if (err) > + return err; > + } > break; > case IP_VERSION(13, 0, 1): > case IP_VERSION(13, 0, 3): > -- > 2.17.1