support umc/gfx/sdma ras on guest side Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 23 ++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 9 ++++++--- 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 85412e1a04be..e832c5bceb63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5372,6 +5372,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, r = amdgpu_device_reset_sriov(adev, job ? false : true); if (r) adev->asic_reset_res = r; + + /* Aldebaran support ras in SRIOV, so need resume ras during reset */ + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2)) + amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, &reset_context); if (r && r == -EAGAIN) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 31e07dfc874b..12a1f2389714 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -202,6 +202,10 @@ irqreturn_t amdgpu_irq_handler(int irq, void *arg) if (ret == IRQ_HANDLED) pm_runtime_mark_last_busy(dev->dev); + /* Fatal error events are handled on host side */ + if (amdgpu_sriov_vf(adev)) + return ret; + /* For the hardware that cannot enable bif ring for both ras_controller_irq * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status * register to check whether the interrupt is triggered or not, and properly diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2b80a3037481..930fa3837ef9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -721,7 +721,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, /* Do not enable if it is not allowed. */ WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); - if (!amdgpu_ras_intr_triggered()) { + /* Enable ras feature operator handle on host side */ + if (!amdgpu_sriov_vf(adev) && + !amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", @@ -2181,10 +2183,14 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) { adev->ras_hw_enabled = adev->ras_enabled = 0; - if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || + if (!adev->is_atom_fw || !amdgpu_ras_asic_supported(adev)) return; + if (!(amdgpu_sriov_vf(adev) && + (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))) + return; + if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); @@ -2196,8 +2202,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); + if (!amdgpu_sriov_vf(adev)) + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + else + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | + 1 << AMDGPU_RAS_BLOCK__SDMA | + 1 << AMDGPU_RAS_BLOCK__GFX); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } @@ -2532,6 +2543,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) struct amdgpu_ras_block_object *obj; int r; + /* Guest side doesn't need init ras feature */ + if (amdgpu_sriov_vf(adev)) + return 0; + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { if (!node->ras_obj) { dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 3b5c43575aa3..72bfac9bf9d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -123,6 +123,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + + if (amdgpu_sriov_vf(adev)) + return AMDGPU_RAS_SUCCESS; + amdgpu_ras_reset_gpu(adev); return AMDGPU_RAS_SUCCESS; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index 2c6070b90dcf..165cdc2d7f0b 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -75,9 +75,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp) err = psp_init_sos_microcode(psp, chip_name); if (err) return err; - err = psp_init_ta_microcode(&adev->psp, chip_name); - if (err) - return err; + /* It's not necessary to load ras ta on Guest side */ + if (!amdgpu_sriov_vf(adev)) { + err = psp_init_ta_microcode(&adev->psp, chip_name); + if (err) + return err; + } break; case IP_VERSION(13, 0, 1): case IP_VERSION(13, 0, 3): -- 2.17.1