Adjust removal control flow for smu v13_0_2: During amdgpu uninstallation, when removing the first device, the kernel needs to first send a mode1reset message to all gpu devices. Otherwise, smu initialization will fail the next time amdgpu is installed. V2: 1. Update commit comments. 2. Remove the global variable amdgpu_device_remove_cnt and add a variable to the structure amdgpu_hive_info. 3. Use hive to detect the first removed device instead of a global variable. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 35 +++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 6 +++- 7 files changed, 99 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 79bb6fd83094..465295318830 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -997,6 +997,9 @@ struct amdgpu_device { bool in_s4; bool in_s0ix; + /* uninstall */ + bool in_remove; + enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62b26f0e37b0..1402717673f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2999,6 +2999,13 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) DRM_ERROR("suspend of IP block <%s> failed %d\n", adev->ip_blocks[i].version->funcs->name, r); } + + /* If in_remove is true, psp_hw_fini should be executed after + * psp_suspend to free psp shared buffers. + */ + if (adev->in_remove && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) + continue; + adev->ip_blocks[i].status.hw = false; /* handle putting the SMC in the appropriate state */ if(!amdgpu_sriov_vf(adev)){ @@ -4739,6 +4746,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -4758,6 +4766,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4802,6 +4814,16 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } + /* Fixed the problem that BIOS signature errors and psp bootloader + * failure to load kdb on next amdgpu install. + */ + if (gpu_reset_for_dev_remove) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) + amdgpu_device_ip_resume_phase1(tmp_adev); + + goto end; + } + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5124,6 +5146,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; + bool gpu_reset_for_dev_remove = false; + + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5159,8 +5186,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); + if (adev->in_remove) + tmp_adev->in_remove = true; + } if (!list_is_first(&adev->reset_list, &device_list)) list_rotate_to_front(&adev->reset_list, &device_list); device_list_handle = &device_list; @@ -5243,6 +5273,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + if (gpu_reset_for_dev_remove) { + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(tmp_adev); + } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5276,6 +5310,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, adev->asic_reset_res = 0; goto retry; } + + if (!r && gpu_reset_for_dev_remove) + goto recover_end; } skip_hw_reset: @@ -5349,6 +5386,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_device_unset_mp1_state(tmp_adev); } +recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 728a0933ea6f..9271f219d8fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2175,6 +2175,19 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, return ret; } +static void amdgpu_device_gpu_reset(struct amdgpu_device *adev) +{ + struct amdgpu_reset_context reset_context; + + memset(&reset_context, 0, sizeof(reset_context)); + reset_context.method = AMD_RESET_METHOD_NONE; + reset_context.reset_req_dev = adev; + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags); + + amdgpu_device_gpu_recover(adev, NULL, &reset_context); +} + static void amdgpu_pci_remove(struct pci_dev *pdev) { @@ -2186,6 +2199,28 @@ amdgpu_pci_remove(struct pci_dev *pdev) pm_runtime_forbid(dev->dev); } + if (adev->asic_type == CHIP_ALDEBARAN) { + bool need_to_reset_gpu = false; + + adev->in_remove = true; + if (adev->gmc.xgmi.num_physical_nodes > 1) { + struct amdgpu_hive_info *hive; + + hive = amdgpu_get_xgmi_hive(adev); + if (hive->device_remove_count == 0) + need_to_reset_gpu = true; + hive->device_remove_count++; + amdgpu_put_xgmi_hive(hive); + } else + need_to_reset_gpu = true; + + /* Workaround for ASICs need to reset SMU. + * Called only when the first device is removed. + */ + if (need_to_reset_gpu) + amdgpu_device_gpu_reset(adev); + } + amdgpu_driver_unload_kms(dev); drm_dev_unplug(dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 28ca0a94b8a5..1f19f9fa4396 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2647,7 +2647,15 @@ static int psp_hw_fini(void *handle) psp_asd_terminate(psp); psp_tmr_terminate(psp); - psp_ring_destroy(psp, PSP_RING_TYPE__KM); + /* If in_remove is true, psp_suspend is called before + * psp_hw_fini. psp ring has been stopped in psp_suspend. + */ + if (adev->in_remove && psp->km_ring.ring_mem) + amdgpu_bo_free_kernel(&adev->firmware.rbuf, + &psp->km_ring.ring_mem_mc_addr, + (void **)&psp->km_ring.ring_mem); + else + psp_ring_destroy(psp, PSP_RING_TYPE__KM); psp_free_shared_bufs(psp); @@ -2715,6 +2723,12 @@ static int psp_suspend(void *handle) } out: + /* If in_remove is true, psp_hw_fini will be called after + * psp_suspend. Psp shared buffer will be freed in psp_hw_fini. + */ + if (adev->in_remove) + return ret; + psp_free_shared_bufs(psp); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index f71b83c42590..dc43fcb93eac 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_MODE2_RESET = 2, + AMDGPU_RESET_FOR_DEVICE_REMOVE = 3, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 552e6fb55aa8..30dcc1681b4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -43,6 +43,7 @@ struct amdgpu_hive_info { } pstate; struct amdgpu_reset_domain *reset_domain; + uint32_t device_remove_count; }; struct amdgpu_pcs_ras_field { diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 5e318b3f6c0f..6be90076c9f3 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -3405,7 +3405,11 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) void amdgpu_pm_sysfs_fini(struct amdgpu_device *adev) { - if (adev->pm.dpm_enabled == 0) + /* If in_remove is true, the check for pm.dpm_enabled + * needs to be skipped, since smu_suspend is called before + * amdgpu_pm_sysfs_fini in the device removal path. + */ + if ((adev->pm.dpm_enabled == 0) && !adev->in_remove) return; if (adev->pm.int_hwmon_dev) -- 2.25.1