[AMD Official Use Only - AMD Internal Distribution Only] Comment inline. Thanks, Feifei -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lijo Lazar Sent: Monday, September 2, 2024 3:34 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx> Subject: [PATCH 02/10] drm/amdgpu: Use init level for pending_reset flag Drop pending_reset flag in gmc block. Instead use init level to determine which type of init is preferred - in this case MINIMAL. Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 33 ++++--------------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 6 ++-- .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 3 +- 6 files changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 4fb09c4fbf22..db5046e8b10d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1691,7 +1691,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) } /* Don't post if we need to reset whole hive on init */ - if (adev->gmc.xgmi.pending_reset) + if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) return false; if (adev->has_hw_reset) { @@ -2985,7 +2985,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_ttm_set_buffer_funcs_status(adev, true); /* Don't init kfd if whole hive need to be reset during init */ - if (!adev->gmc.xgmi.pending_reset) { + if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) { kgd2kfd_init_zone_device(adev); amdgpu_amdkfd_device_init(adev); } @@ -3499,14 +3499,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) } /* skip unnecessary suspend if we do not initialize them yet */ - if (adev->gmc.xgmi.pending_reset && - !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) { - adev->ip_blocks[i].status.hw = false; + if (!amdgpu_ip_member_of_hwini( + adev, adev->ip_blocks[i].version->type)) continue; - } [Feifei]: AMDGPU_INIT_LEVEL_MINIMAL indicate the minimal necessary blocks which need to do hw_init if SMC need to handle the mode1 reset. Though in newer ASICs it is smc doing the reset, in some old one, it is MP0. Is it more readable if we use naming like AMDGPU_INIT_LEVEL_MINIMAL_SMC to avoid confusion ? /* skip suspend of gfx/mes and psp for S0ix * gfx is in gfxoff state, so on resume it will exit gfxoff just @@ -4320,20 +4315,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) { if (adev->gmc.xgmi.num_physical_nodes) { dev_info(adev->dev, "Pending hive reset.\n"); - adev->gmc.xgmi.pending_reset = true; - /* Only need to init necessary block for SMU to handle the reset */ - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.valid) - continue; - if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) { - DRM_DEBUG("IP %s disabled for hw_init.\n", - adev->ip_blocks[i].version->funcs->name); - adev->ip_blocks[i].status.hw = true; - } - } + amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL); } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && !amdgpu_device_has_display_hardware(adev)) { r = psp_gpu_reset(adev); @@ -4441,7 +4423,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* enable clockgating, etc. after ib tests, etc. since some blocks require * explicit gating rather than handling it automatically. */ - if (!adev->gmc.xgmi.pending_reset) { + if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) { r = amdgpu_device_ip_late_init(adev); if (r) { dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n"); @@ -4518,7 +4500,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (px) vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - if (adev->gmc.xgmi.pending_reset) + if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); @@ -5490,7 +5472,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, list_for_each_entry(tmp_adev, device_list_handle, reset_list) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { - tmp_adev->gmc.xgmi.pending_reset = false; if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work)) r = -EALREADY; } else diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 82bde5132dc6..3dece2e69608 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2495,7 +2495,6 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) for (i = 0; i < mgpu_info.num_dgpu; i++) { adev = mgpu_info.gpu_ins[i].adev; flush_work(&adev->xgmi_reset_work); - adev->gmc.xgmi.pending_reset = false; } /* reset function will rebuild the xgmi hive info , clear it now */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 4d951a1baefa..33b2adffd58b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -182,7 +182,6 @@ struct amdgpu_xgmi { bool supported; struct ras_common_if *ras_if; bool connected_to_cpu; - bool pending_reset; struct amdgpu_xgmi_ras *ras; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 61a2f386d9fb..2076f157cb6a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3185,7 +3185,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) * when the GPU is pending on XGMI reset during probe time * (Mostly after second bus reset), skip it now */ - if (adev->gmc.xgmi.pending_reset) + if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) return 0; ret = amdgpu_ras_eeprom_init(&con->eeprom_control); /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 7de449fae1e3..a7a892512cb9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -860,7 +860,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return 0; - if (!adev->gmc.xgmi.pending_reset && + if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) && amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { ret = psp_xgmi_initialize(&adev->psp, false, true); if (ret) { @@ -907,7 +907,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) task_barrier_add_task(&hive->tb); - if (!adev->gmc.xgmi.pending_reset && + if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) && amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { /* update node list for other device in the hive */ @@ -985,7 +985,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) } } - if (!ret && !adev->gmc.xgmi.pending_reset) + if (!ret && (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL)) ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); exit_unlock: diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 16fcd9dcd202..7225f63c26b4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1616,7 +1616,8 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) break; default: if (!ras || !adev->ras_enabled || - adev->gmc.xgmi.pending_reset) { + (adev->init_lvl->level != + AMDGPU_INIT_LEVEL_MINIMAL)) { [Feifei] Is it a typo? If adev->init_lvl->level taking place of adev->gmc.xgmi.pending_reset, here should be(adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(11, 0, 2)) { data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT); -- 2.25.1