On Mon, Sep 2, 2024 at 3:34 AM Lijo Lazar <lijo.lazar@xxxxxxx> wrote:
>
> Move the reinitialization part after a reset to another function. No
> functional changes.
>
> Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>

Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 150 ++++++++++++---------
>  2 files changed, 89 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index fefdace22894..e1ae898b42eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1276,6 +1276,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                           struct amdgpu_reset_context *reset_context);
>
> +int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context);
> +
>  int emu_soc_asic_init(struct amdgpu_device *adev);
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index db5046e8b10d..e28227869307 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5440,75 +5440,25 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>          return r;
>  }
>
> -int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> -                         struct amdgpu_reset_context *reset_context)
> +int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>  {
> -        struct amdgpu_device *tmp_adev = NULL;
> -        bool need_full_reset, skip_hw_reset, vram_lost = false;
> -        int r = 0;
> -
> -        /* Try reset handler method first */
> -        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> -                                    reset_list);
> -
> -        reset_context->reset_device_list = device_list_handle;
> -        r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
> -        /* If reset handler not implemented, continue; otherwise return */
> -        if (r == -EOPNOTSUPP)
> -                r = 0;
> -        else
> -                return r;
> -
> -        /* Reset handler not implemented, use the default method */
> -        need_full_reset =
> -                test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> -        skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
> -
> -        /*
> -         * ASIC reset has to be done on all XGMI hive nodes ASAP
> -         * to allow proper links negotiation in FW (within 1 sec)
> -         */
> -        if (!skip_hw_reset && need_full_reset) {
> -                list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> -                        /* For XGMI run all resets in parallel to speed up the process */
> -                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> -                                if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
> -                                        r = -EALREADY;
> -                        } else
> -                                r = amdgpu_asic_reset(tmp_adev);
> -
> -                        if (r) {
> -                                dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
> -                                        r, adev_to_drm(tmp_adev)->unique);
> -                                goto out;
> -                        }
> -                }
> +        struct list_head *device_list_handle;
> +        bool full_reset, vram_lost = false;
> +        struct amdgpu_device *tmp_adev;
> +        int r;
>
> -                /* For XGMI wait for all resets to complete before proceed */
> -                if (!r) {
> -                        list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> -                                if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> -                                        flush_work(&tmp_adev->xgmi_reset_work);
> -                                        r = tmp_adev->asic_reset_res;
> -                                        if (r)
> -                                                break;
> -                                }
> -                        }
> -                }
> -        }
> +        device_list_handle = reset_context->reset_device_list;
>
> -        if (!r && amdgpu_ras_intr_triggered()) {
> -                list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> -                        amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
> -                }
> +        if (!device_list_handle)
> +                return -EINVAL;
>
> -                amdgpu_ras_intr_cleared();
> -        }
> +        full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>
> +        r = 0;
>          list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>                  /* After reset, it's default init level */
>                  amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
> -                if (need_full_reset) {
> +                if (full_reset) {
>                          /* post card */
>                          amdgpu_ras_set_fed(tmp_adev, false);
>                          r = amdgpu_device_asic_init(tmp_adev);
> @@ -5598,7 +5548,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                          r = amdgpu_ib_ring_tests(tmp_adev);
>                          if (r) {
>                                  dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
> -                                need_full_reset = true;
>                                  r = -EAGAIN;
>                                  goto end;
>                          }
> @@ -5611,10 +5560,85 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>          }
>
>  end:
> -        if (need_full_reset)
> +        return r;
> +}
> +
> +int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> +                         struct amdgpu_reset_context *reset_context)
> +{
> +        struct amdgpu_device *tmp_adev = NULL;
> +        bool need_full_reset, skip_hw_reset;
> +        int r = 0;
> +
> +        /* Try reset handler method first */
> +        tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
> +                                    reset_list);
> +
> +        reset_context->reset_device_list = device_list_handle;
> +        r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
> +        /* If reset handler not implemented, continue; otherwise return */
> +        if (r == -EOPNOTSUPP)
> +                r = 0;
> +        else
> +                return r;
> +
> +        /* Reset handler not implemented, use the default method */
> +        need_full_reset =
> +                test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> +        skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
> +
> +        /*
> +         * ASIC reset has to be done on all XGMI hive nodes ASAP
> +         * to allow proper links negotiation in FW (within 1 sec)
> +         */
> +        if (!skip_hw_reset && need_full_reset) {
> +                list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> +                        /* For XGMI run all resets in parallel to speed up the process */
> +                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> +                                if (!queue_work(system_unbound_wq,
> +                                                &tmp_adev->xgmi_reset_work))
> +                                        r = -EALREADY;
> +                        } else
> +                                r = amdgpu_asic_reset(tmp_adev);
> +
> +                        if (r) {
> +                                dev_err(tmp_adev->dev,
> +                                        "ASIC reset failed with error, %d for drm dev, %s",
> +                                        r, adev_to_drm(tmp_adev)->unique);
> +                                goto out;
> +                        }
> +                }
> +
> +                /* For XGMI wait for all resets to complete before proceed */
> +                if (!r) {
> +                        list_for_each_entry(tmp_adev, device_list_handle,
> +                                            reset_list) {
> +                                if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
> +                                        flush_work(&tmp_adev->xgmi_reset_work);
> +                                        r = tmp_adev->asic_reset_res;
> +                                        if (r)
> +                                                break;
> +                                }
> +                        }
> +                }
> +        }
> +
> +        if (!r && amdgpu_ras_intr_triggered()) {
> +                list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> +                        amdgpu_ras_reset_error_count(tmp_adev,
> +                                                     AMDGPU_RAS_BLOCK__MMHUB);
> +                }
> +
> +                amdgpu_ras_intr_cleared();
> +        }
> +
> +        r = amdgpu_device_reinit_after_reset(reset_context);
> +        if (r == -EAGAIN)
>                  set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>          else
>                  clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
> +
> +out:
>          return r;
>  }
>
> --
> 2.25.1
>
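
One note for readers following the archive: since the helper is now exported through amdgpu.h, a dedicated reset-handler path could reuse it roughly as sketched below. This is purely an illustrative sketch, not part of the patch; the function name is made up, and the only real precondition (which the new -EINVAL check makes explicit) is that reset_device_list has been populated first.

/* Illustrative sketch only, not part of this patch. A hypothetical
 * reset handler that has already completed its HW reset could reuse
 * the newly exported helper instead of open-coding the reinit:
 */
static int example_restore_hwcontext(struct amdgpu_reset_context *reset_context)
{
        int r;

        /* reset_device_list must already be populated, exactly as
         * amdgpu_do_asic_reset() sets it up before trying the reset
         * handlers; otherwise the helper returns -EINVAL.
         */
        r = amdgpu_device_reinit_after_reset(reset_context);

        /* -EAGAIN asks for another full reset attempt, mirroring how
         * amdgpu_do_asic_reset() translates it into NEED_FULL_RESET.
         */
        if (r == -EAGAIN)
                set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
        else
                clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

        return r;
}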