Re: [PATCH 2/2] drm/amdgpu: trigger ip dump before suspend of IP's

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jul 26, 2024 at 8:48 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote:
>
> Problem:
> IP dump right now is done post suspend of
> all IP's which for some IP's could change power
> state and software state too which we do not want
> to reflect in the dump as it might not be same at
> the time of hang.
>
> Solution:
> IP should be dumped as close to the HW state when
> the GPU was in hung state without trying to reinitialize
> any resource.
>
> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx>

Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++-----------
>  1 file changed, 30 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 730dae77570c..74f6f15e73b5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5277,11 +5277,29 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
>         return ret;
>  }
>
> +static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> +{
> +       int i;
> +
> +       lockdep_assert_held(&adev->reset_domain->sem);
> +
> +       for (i = 0; i < adev->reset_info.num_regs; i++) {
> +               adev->reset_info.reset_dump_reg_value[i] =
> +                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
> +
> +               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> +                                            adev->reset_info.reset_dump_reg_value[i]);
> +       }
> +
> +       return 0;
> +}
> +
>  int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>                                  struct amdgpu_reset_context *reset_context)
>  {
>         int i, r = 0;
>         struct amdgpu_job *job = NULL;
> +       struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
>         bool need_full_reset =
>                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
>
> @@ -5340,6 +5358,18 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>                         }
>                 }
>
> +               if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> +                       amdgpu_reset_reg_dumps(tmp_adev);
> +
> +                       dev_info(tmp_adev->dev, "Dumping IP State\n");
> +                       /* Trigger ip dump before we reset the asic */
> +                       for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> +                               if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> +                                       tmp_adev->ip_blocks[i].version->funcs->dump_ip_state(
> +                                                       (void *)tmp_adev);
> +                       dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> +               }
> +
>                 if (need_full_reset)
>                         r = amdgpu_device_ip_suspend(adev);
>                 if (need_full_reset)
> @@ -5352,47 +5382,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>         return r;
>  }
>
> -static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> -{
> -       int i;
> -
> -       lockdep_assert_held(&adev->reset_domain->sem);
> -
> -       for (i = 0; i < adev->reset_info.num_regs; i++) {
> -               adev->reset_info.reset_dump_reg_value[i] =
> -                       RREG32(adev->reset_info.reset_dump_reg_list[i]);
> -
> -               trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
> -                                            adev->reset_info.reset_dump_reg_value[i]);
> -       }
> -
> -       return 0;
> -}
> -
>  int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>                          struct amdgpu_reset_context *reset_context)
>  {
>         struct amdgpu_device *tmp_adev = NULL;
>         bool need_full_reset, skip_hw_reset, vram_lost = false;
>         int r = 0;
> -       uint32_t i;
>
>         /* Try reset handler method first */
>         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
>                                     reset_list);
>
> -       if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
> -               amdgpu_reset_reg_dumps(tmp_adev);
> -
> -               dev_info(tmp_adev->dev, "Dumping IP State\n");
> -               /* Trigger ip dump before we reset the asic */
> -               for (i = 0; i < tmp_adev->num_ip_blocks; i++)
> -                       if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
> -                               tmp_adev->ip_blocks[i].version->funcs
> -                               ->dump_ip_state((void *)tmp_adev);
> -               dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
> -       }
> -
>         reset_context->reset_device_list = device_list_handle;
>         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
>         /* If reset handler not implemented, continue; otherwise return */
> --
> 2.34.1
>




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux