[AMD Public Use] V2 looks good to me expect for the following that can be improved + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { The check of reboot could be centralized to the new function amdgpu_ras_need_emergency_restart you added in amdgpu_ras.c. And it would be better you can make amdgpu_ras_need_emergency_restart related as a separated patch so that it helps to isolate ras recovery related issue from general gpu reset issue. Regards, Hawking -----Original Message----- From: Sheng, Wenhui <Wenhui.Sheng@xxxxxxx> Sent: Friday, July 10, 2020 22:17 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Sheng, Wenhui <Wenhui.Sheng@xxxxxxx>; Gao, Likun <Likun.Gao@xxxxxxx> Subject: [PATCH 2/3] drm/amdgpu: enable mode1 reset For sienna cichlid, add mode1 reset path for sGPU. And fix some corner cases about mode1 mode reuse. v2: hiding MP0/MP1 mode1 reset under AMD_RESET_METHOD_MODE1 Signed-off-by: Likun Gao <Likun.Gao@xxxxxxx> Signed-off-by: Wenhui Sheng <Wenhui.Sheng@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + drivers/gpu/drm/amd/amdgpu/nv.c | 19 +++++++++++------- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 +- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fec6cf3f0c8a..6ce6e6bb8b50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4274,18 +4274,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; - bool in_ras_intr = amdgpu_ras_intr_triggered(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; + bool need_emergency_restart = false; bool audio_suspended = false; + /** + * Special case: RAS triggered and full reset isn't supported + */ + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); + /* * Flush RAM to disk so that after reboot * the user can read log and see why the system rebooted. */ - if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { - + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { DRM_WARN("Emergency reboot."); ksys_sync_helper(); @@ -4293,7 +4294,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } dev_info(adev->dev, "GPU %s begin!\n", - (in_ras_intr && !use_baco) ? "jobs stop":"reset"); + need_emergency_restart ? "jobs stop":"reset"); /* * Here we trylock to avoid chain of resets executing from @@ -4365,7 +4366,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_fbdev_set_suspend(tmp_adev, 1); /* disable ras on ALL IPs */ - if (!(in_ras_intr && !use_baco) && + if (!need_emergency_restart && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -4377,12 +4378,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, drm_sched_stop(&ring->sched, job ? &job->base : NULL); - if (in_ras_intr && !use_baco) + if (need_emergency_restart) amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } - if (in_ras_intr && !use_baco) + if (need_emergency_restart) goto skip_sched_resume; /* @@ -4459,7 +4460,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, skip_sched_resume: list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /*unlock kfd: SRIOV would do it separately */ - if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) + if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); if (audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3a3fa8567c94..6f06e1214622 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2135,3 +2135,14 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) amdgpu_ras_reset_gpu(adev); } } + +bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) { + if (adev->asic_type == CHIP_VEGA20 && + adev->pm.fw_version <= 0x283400) { + return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && + amdgpu_ras_intr_triggered(); + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e7df5d8429f8..b2667342cf67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -633,4 +633,5 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); +bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 356849136d1d..9f1240bd0310 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -265,17 +265,21 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev) amdgpu_atombios_scratch_regs_engine_hung(adev, true); - dev_info(adev->dev, "GPU mode1 reset\n"); - /* disable BM */ pci_clear_master(adev->pdev); pci_save_state(adev->pdev); - ret = psp_gpu_reset(adev); + if (amdgpu_dpm_is_mode1_reset_supported(adev)) { + dev_info(adev->dev, "GPU smu mode1 reset\n"); + ret = amdgpu_dpm_mode1_reset(adev); + } else { + dev_info(adev->dev, "GPU psp mode1 reset\n"); + ret = psp_gpu_reset(adev); + } + if (ret) dev_err(adev->dev, "GPU mode1 reset failed\n"); - pci_restore_state(adev->pdev); /* wait for asic to come out of reset */ @@ -307,7 +311,7 @@ nv_asic_reset_method(struct amdgpu_device *adev) { struct smu_context *smu = &adev->smu; - if (!amdgpu_sriov_vf(adev) && smu_baco_is_support(smu)) + if (smu_baco_is_support(smu)) return AMD_RESET_METHOD_BACO; else return AMD_RESET_METHOD_MODE1; @@ -319,15 +323,16 @@ static int nv_asic_reset(struct amdgpu_device *adev) struct smu_context *smu = &adev->smu; if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { + dev_info(adev->dev, "GPU BACO reset\n"); + ret = smu_baco_enter(smu); if (ret) return ret; ret = smu_baco_exit(smu); if (ret) return ret; - } else { + } else ret = nv_asic_mode1_reset(adev); - } return ret; } diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index 4f7d064e16e4..014815bcae93 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -2039,7 +2039,7 @@ static bool navi10_is_baco_supported(struct smu_context *smu) struct amdgpu_device *adev = smu->adev; uint32_t val; - if (!smu_v11_0_baco_is_support(smu)) + if (amdgpu_sriov_vf(adev) || (!smu_v11_0_baco_is_support(smu))) return false; val = RREG32_SOC15(NBIO, 0, mmRCC_BIF_STRAP0); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx