On 4/26/2024 9:27 AM, Yunxiang Li wrote: > Sometimes a hung GPU causes multiple reset sources to schedule resets. > The second source will be able to trigger an unnecessary reset if it > schedules after we call amdgpu_device_stop_pending_resets. > > Move amdgpu_device_stop_pending_resets to after the reset is done. Since > at this point the GPU is supposedly in a good state, any reset scheduled > after this point would be a legitimate reset. > > Remove unnecessary and incorrect checks for amdgpu_in_reset that were > kind of serving this purpose. > > Signed-off-by: Yunxiang Li <Yunxiang.Li@xxxxxxx> Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> Thanks, Lijo > --- > v2: instead of adding amdgpu_in_reset check, move when we cancel pending > resets > v3: no changes from v2, collect all the patches in one series for easier review > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 ++++++++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- > 5 files changed, 14 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 861ccff78af9..8befd10bf007 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5070,8 +5070,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, > retry: > amdgpu_amdkfd_pre_reset(adev); > > - amdgpu_device_stop_pending_resets(adev); > - > if (from_hypervisor) > r = amdgpu_virt_request_full_gpu(adev, true); > else > @@ -5823,13 +5821,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > r, adev_to_drm(tmp_adev)->unique); > tmp_adev->asic_reset_res = r; > } > - > - if (!amdgpu_sriov_vf(tmp_adev)) > - /* > - * Drop all pending non scheduler resets. 
Scheduler resets > - * were already dropped during drm_sched_stop > - */ > - amdgpu_device_stop_pending_resets(tmp_adev); > } > > /* Actual ASIC resets if needed.*/ > @@ -5851,6 +5842,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > goto retry; > } > > + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { > + /* > + * Drop any pending non scheduler resets queued before reset is done. > + * Any reset scheduled after this point would be valid. Scheduler resets > + * were already dropped during drm_sched_stop and no new ones can come > + * in before drm_sched_start. > + */ > + amdgpu_device_stop_pending_resets(tmp_adev); > + } > + > skip_hw_reset: > > /* Post ASIC reset for all devs .*/ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index 54ab51a4ada7..c2385178d6b3 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -597,7 +597,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) > if (ret) { > adev->virt.vf2pf_update_retry_cnt++; > if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && > - amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { > + amdgpu_sriov_runtime(adev)) { > amdgpu_ras_set_fed(adev, true); > if (amdgpu_reset_domain_schedule(adev->reset_domain, > &adev->virt.flr_work)) > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 0c7275bca8f7..c5ba9c4757a8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, > > switch (event) { > case IDH_FLR_NOTIFICATION: > - if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) > + if (amdgpu_sriov_runtime(adev)) > WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, > &adev->virt.flr_work), > "Failed to queue work! 
at %s", > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index aba00d961627..fa9d1b02f391 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -358,7 +358,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, > > switch (event) { > case IDH_FLR_NOTIFICATION: > - if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) > + if (amdgpu_sriov_runtime(adev)) > WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, > &adev->virt.flr_work), > "Failed to queue work! at %s", > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > index 59f53c743362..14a065516ae4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > @@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, > r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); > > /* only handle FLR_NOTIFY now */ > - if (!r && !amdgpu_in_reset(adev)) > + if (!r) > WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, > &adev->virt.flr_work), > "Failed to queue work! at %s",