On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote: > > Problem: > The IP dump is currently done after all IPs have been > suspended. For some IPs, suspend changes the power > state and the software state, which we do not want > reflected in the dump because it may no longer match > the state at the time of the hang. > > Solution: > Dump the IP state as close as possible to the HW state > at the time the GPU hung, without trying to reinitialize > any resource. > > Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++----------- > 1 file changed, 30 insertions(+), 30 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 730dae77570c..74f6f15e73b5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) > return ret; > } > > +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) > +{ > + int i; > + > + lockdep_assert_held(&adev->reset_domain->sem); > + > + for (i = 0; i < adev->reset_info.num_regs; i++) { > + adev->reset_info.reset_dump_reg_value[i] = > + RREG32(adev->reset_info.reset_dump_reg_list[i]); > + > + trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], > + adev->reset_info.reset_dump_reg_value[i]); > + } > + > + return 0; > +} > + > int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > struct amdgpu_reset_context *reset_context) > { > int i, r = 0; > struct amdgpu_job *job = NULL; > + struct amdgpu_device *tmp_adev = reset_context->reset_req_dev; > bool need_full_reset = > test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); > > @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > } > } > > + if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { > + 
amdgpu_reset_reg_dumps(tmp_adev); > + > + dev_info(tmp_adev->dev, "Dumping IP State\n"); > + /* Trigger ip dump before we reset the asic */ > + for (i = 0; i < tmp_adev->num_ip_blocks; i++) > + if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) > + tmp_adev->ip_blocks[i].version->funcs->dump_ip_state( > + (void *)tmp_adev); > + dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); > + } > + > if (need_full_reset) > r = amdgpu_device_ip_suspend(adev); > if (need_full_reset) > @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, > return r; > } > > -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) > -{ > - int i; > - > - lockdep_assert_held(&adev->reset_domain->sem); > - > - for (i = 0; i < adev->reset_info.num_regs; i++) { > - adev->reset_info.reset_dump_reg_value[i] = > - RREG32(adev->reset_info.reset_dump_reg_list[i]); > - > - trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], > - adev->reset_info.reset_dump_reg_value[i]); > - } > - > - return 0; > -} > - > int amdgpu_do_asic_reset(struct list_head *device_list_handle, > struct amdgpu_reset_context *reset_context) > { > struct amdgpu_device *tmp_adev = NULL; > bool need_full_reset, skip_hw_reset, vram_lost = false; > int r = 0; > - uint32_t i; > > /* Try reset handler method first */ > tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, > reset_list); > > - if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { > - amdgpu_reset_reg_dumps(tmp_adev); > - > - dev_info(tmp_adev->dev, "Dumping IP State\n"); > - /* Trigger ip dump before we reset the asic */ > - for (i = 0; i < tmp_adev->num_ip_blocks; i++) > - if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) > - tmp_adev->ip_blocks[i].version->funcs > - ->dump_ip_state((void *)tmp_adev); > - dev_info(tmp_adev->dev, "Dumping IP State Completed\n"); > - } > - > reset_context->reset_device_list = device_list_handle; > r = amdgpu_reset_perform_reset(tmp_adev, 
reset_context); > /* If reset handler not implemented, continue; otherwise return */ > -- > 2.34.1 >