No objection from me for this patch. But I was really shocked at first glance to the subject and thought how amdgpu driver survive with this bug in bare-metal case... It was then proved to be this is SRIOV specific bug because psp was initialized ahead of ih in sriov use case. The status.hw fix in suspend/resume call stack looks reasonable to me. Patch is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Regards, Hawking -----Original Message----- From: Liu, Monk <Monk.Liu@xxxxxxx> Sent: 2019年8月1日 11:43 To: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> Cc: Deng, Emily <Emily.Deng@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3) If no objection I would submit those three patches, thanks _____________________________________ Monk Liu|GPU Virtualization Team |AMD -----Original Message----- From: Deng, Emily <Emily.Deng@xxxxxxx> Sent: Wednesday, July 31, 2019 5:04 PM To: Liu, Monk <Monk.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Liu, Monk <Monk.Liu@xxxxxxx> Subject: RE: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3) All looks good to me. Reviewed-by: Emily Deng <Emily.Deng@xxxxxxx>. >-----Original Message----- >From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Monk >Liu >Sent: Wednesday, July 31, 2019 4:54 PM >To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx >Cc: Liu, Monk <Monk.Liu@xxxxxxx> >Subject: [PATCH] drm/amdgpu: fix double ucode load by PSP(v3) > >previously the ucode loading of PSP was repreated, one executed in >phase_1 init/re-init/resume and the other in fw_loading routine > >Avoid this double loading by clearing ip_blocks.status.hw in suspend or >reset prior to the FW loading and any block's hw_init/resume > >v2: >still do the smu fw loading since it is needed by bare-metal > >v3: >drop the change in reinit_early_sriov, just clear all block's status.hw >in the head place and set the status.hw after hw_init done is enough > >Signed-off-by: Monk Liu <Monk.Liu@xxxxxxx> >--- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 59 >+++++++++++++++++++----------- > 1 file changed, 38 insertions(+), 21 deletions(-) > >diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >index 6cb358c..30436ba 100644 >--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >@@ -1673,28 +1673,34 @@ static int amdgpu_device_fw_loading(struct >amdgpu_device *adev) > > if (adev->asic_type >= CHIP_VEGA10) { > for (i = 0; i < adev->num_ip_blocks; i++) { >- if (adev->ip_blocks[i].version->type == >AMD_IP_BLOCK_TYPE_PSP) { >- if (adev->in_gpu_reset || adev->in_suspend) { >- if (amdgpu_sriov_vf(adev) && adev- >>in_gpu_reset) >- break; /* sriov gpu reset, psp >need to do hw_init before IH because of hw limit */ >- r = adev->ip_blocks[i].version->funcs- >>resume(adev); >- if (r) { >- DRM_ERROR("resume of IP >block <%s> failed %d\n", >+ if (adev->ip_blocks[i].version->type != >AMD_IP_BLOCK_TYPE_PSP) >+ continue; >+ >+ /* no need to do the fw loading again if already >done*/ >+ if (adev->ip_blocks[i].status.hw == true) >+ break; >+ >+ if (adev->in_gpu_reset || adev->in_suspend) { >+ r = adev->ip_blocks[i].version->funcs- >>resume(adev); >+ if (r) { >+ DRM_ERROR("resume of IP block <%s> >failed %d\n", > adev- >>ip_blocks[i].version->funcs->name, r); >- return r; >- } >- } else { >- r = adev->ip_blocks[i].version->funcs- >>hw_init(adev); >- if (r) { >- DRM_ERROR("hw_init of IP >block <%s> failed %d\n", >- adev->ip_blocks[i].version- >>funcs->name, r); >- return r; >- } >+ return r; >+ } >+ } else { >+ r = adev->ip_blocks[i].version->funcs- >>hw_init(adev); >+ if (r) { >+ DRM_ERROR("hw_init of IP block <%s> >failed %d\n", >+ adev- >>ip_blocks[i].version->funcs->name, r); >+ return r; > } >- adev->ip_blocks[i].status.hw = true; > } >+ >+ adev->ip_blocks[i].status.hw = true; >+ break; > } > } >+ > r = amdgpu_pm_load_smu_firmware(adev, &smu_version); > > return r; >@@ -2136,7 +2142,9 @@ static int >amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) > if (r) { > DRM_ERROR("suspend of IP block <%s> failed %d\n", > adev->ip_blocks[i].version->funcs- >>name, r); >+ return r; > } >+ adev->ip_blocks[i].status.hw = false; > } > } > >@@ -2176,14 +2184,16 @@ static int >amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) > if (is_support_sw_smu(adev)) { > /* todo */ > } else if (adev->powerplay.pp_funcs && >- adev->powerplay.pp_funcs->set_mp1_state) >{ >+ adev->powerplay.pp_funcs- >>set_mp1_state) { > r = adev->powerplay.pp_funcs- >>set_mp1_state( > adev->powerplay.pp_handle, > adev->mp1_state); > if (r) { > DRM_ERROR("SMC failed to set mp1 >state %d, %d\n", > adev->mp1_state, r); >+ return r; > } >+ adev->ip_blocks[i].status.hw = false; > } > } > } >@@ -2238,6 +2248,7 @@ static int >amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) > for (j = 0; j < adev->num_ip_blocks; j++) { > block = &adev->ip_blocks[j]; > >+ block->status.hw = false; > if (block->version->type != ip_order[i] || > !block->status.valid) > continue; >@@ -2246,6 +2257,7 @@ static int >amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) > DRM_INFO("RE-INIT-early: %s %s\n", block->version- >>funcs->name, r?"failed":"succeeded"); > if (r) > return r; >+ block->status.hw = true; > } > } > >@@ -2273,13 +2285,15 @@ static int >amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) > block = &adev->ip_blocks[j]; > > if (block->version->type != ip_order[i] || >- !block->status.valid) >+ !block->status.valid || >+ block->status.hw) > continue; > > r = block->version->funcs->hw_init(adev); > DRM_INFO("RE-INIT-late: %s %s\n", block->version- >>funcs->name, r?"failed":"succeeded"); > if (r) > return r; >+ block->status.hw = true; > } > } > >@@ -2303,17 +2317,19 @@ static int >amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) > int i, r; > > for (i = 0; i < adev->num_ip_blocks; i++) { >- if (!adev->ip_blocks[i].status.valid) >+ if (!adev->ip_blocks[i].status.valid || adev- >>ip_blocks[i].status.hw) > continue; > if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || > adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || > adev->ip_blocks[i].version->type == >AMD_IP_BLOCK_TYPE_IH) { >+ > r = adev->ip_blocks[i].version->funcs->resume(adev); > if (r) { > DRM_ERROR("resume of IP block <%s> failed %d\n", > adev->ip_blocks[i].version->funcs- >>name, r); > return r; > } >+ adev->ip_blocks[i].status.hw = true; > } > } > >@@ -2338,7 +2354,7 @@ static int amdgpu_device_ip_resume_phase2(struct >amdgpu_device *adev) > int i, r; > > for (i = 0; i < adev->num_ip_blocks; i++) { >- if (!adev->ip_blocks[i].status.valid) >+ if (!adev->ip_blocks[i].status.valid || adev- >>ip_blocks[i].status.hw) > continue; > if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || > adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || @@ >-2351,6 +2367,7 @@ static int amdgpu_device_ip_resume_phase2(struct >amdgpu_device *adev) > adev->ip_blocks[i].version->funcs->name, r); > return r; > } >+ adev->ip_blocks[i].status.hw = true; > } > > return 0; >-- >2.7.4 > >_______________________________________________ >amd-gfx mailing list >amd-gfx@xxxxxxxxxxxxxxxxxxxxx >https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx