RE: [PATCH 2/3] drm/amdgpu: avoid sending csib command when system resumes from S3

"Yuan, Perry" <Perry.Yuan@xxxxxxx> · Tue, 24 Oct 2023 14:45:52 +0000

[AMD Official Use Only - General]

Hi Kevin,

> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>
> Sent: Tuesday, October 24, 2023 1:24 PM
> To: Yuan, Perry <Perry.Yuan@xxxxxxx>; Zhang, Yifan
> <Yifan1.Zhang@xxxxxxx>; Feng, Kenneth <Kenneth.Feng@xxxxxxx>;
> Limonciello, Mario <Mario.Limonciello@xxxxxxx>
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; amd-
> gfx@xxxxxxxxxxxxxxxxxxxxx
> Subject: RE: [PATCH 2/3] drm/amdgpu: avoid sending csib command when
> system resumes from S3
>
> [AMD Official Use Only - General]
>
> -----Original Message-----
> From: Yuan, Perry <Perry.Yuan@xxxxxxx>
> Sent: Tuesday, October 24, 2023 10:33 AM
> To: Zhang, Yifan <Yifan1.Zhang@xxxxxxx>; Feng, Kenneth
> <Kenneth.Feng@xxxxxxx>; Limonciello, Mario
> <Mario.Limonciello@xxxxxxx>
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Wang, Yang(Kevin)
> <KevinYang.Wang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Subject: [PATCH 2/3] drm/amdgpu: avoid sending csib command when
> system resumes from S3
>
> Previously the CSIB command pocket was sent to GFX block while amdgpu
> driver loading or S3 resuming time all the time.
> As the CP protocol required, the CSIB is not needed to send again while GC is
> not powered down while resuming from aborted S3 suspend sequence.
>
> PREAMBLE_CNTL packet coming in the ring after PG event where the RLC
> already sent its copy of CSIB, send another CSIB pocket will cause Gfx IB
> testing timeout when system resume from S3.
>
> Add flag `csib_initialized` to make sure normal S3 suspend/resume will
> initialize csib normally, when system abort to S3 suspend and resume
> immediately because of some failed suspend callback, GPU is not power
> down at that time, so csib command is not needed to send again.
>
> Error dmesg log:
> amdgpu 0000:04:00.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB
> test failed on gfx_0.0.0 (-110).
> [drm:amdgpu_device_delayed_init_work_handler [amdgpu]] *ERROR* ib
> ring test failed (-110).
> PM: resume of devices complete after 2373.995 msecs
> PM: Finishing wakeup.
>
> Signed-off-by: Perry Yuan <perry.yuan@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  5 +++++
> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 29 ++++++++++++++++++----
> ---
>  3 files changed, 27 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 44df1a5bce7f..e5d85ea26a5e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1114,6 +1114,7 @@ struct amdgpu_device {
>         bool                            debug_vm;
>         bool                            debug_largebar;
>         bool                            debug_disable_soft_recovery;
> +       bool                            csib_initialized;
> [Kevin]:
> you'd better use space to instead of "tab" , to align with other field.

Cool, I didn`t notice that, changed in v2.
Thanks !

>
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 420196a17e22..a47c9f840754 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2468,6 +2468,11 @@ static int amdgpu_pmops_suspend_noirq(struct
> device *dev)
>         if (amdgpu_acpi_should_gpu_reset(adev))
>                 return amdgpu_asic_reset(adev);
>
> +       /* update flag to make sure csib will be sent when system
> +        * resume from normal S3
> +        */
> +       adev->csib_initialized = false;
> +
>         return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 6399bc71c56d..ab2e3e592dfc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -3481,6 +3481,7 @@ static uint64_t
> gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev);  static
> void gfx_v10_0_select_se_sh(struct amdgpu_device *adev, u32 se_num,
>                                    u32 sh_num, u32 instance, int xcc_id);  static u32
> gfx_v10_0_get_wgp_active_bitmap_per_sh(struct amdgpu_device *adev);
> +static int gfx_v10_0_wait_for_idle(void *handle);
>
>  static int gfx_v10_0_rlc_backdoor_autoload_buffer_init(struct
> amdgpu_device *adev);  static void
> gfx_v10_0_rlc_backdoor_autoload_buffer_fini(struct amdgpu_device
> *adev); @@ -5958,7 +5959,7 @@ static int
> gfx_v10_0_cp_gfx_load_microcode(struct amdgpu_device *adev)
>         return 0;
>  }
>
> -static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
> +static int gfx_v10_csib_submit(struct amdgpu_device *adev)
>  {
>         struct amdgpu_ring *ring;
>         const struct cs_section_def *sect = NULL; @@ -5966,13 +5967,6 @@
> static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev)
>         int r, i;
>         int ctx_reg_offset;
>
> -       /* init the CP */
> -       WREG32_SOC15(GC, 0, mmCP_MAX_CONTEXT,
> -                    adev->gfx.config.max_hw_contexts - 1);
> -       WREG32_SOC15(GC, 0, mmCP_DEVICE_ID, 1);
> -
> -       gfx_v10_0_cp_gfx_enable(adev, true);
> -
>         ring = &adev->gfx.gfx_ring[0];
>         r = amdgpu_ring_alloc(ring, gfx_v10_0_get_csb_size(adev) + 4);
>         if (r) {
> @@ -6035,6 +6029,25 @@ static int gfx_v10_0_cp_gfx_start(struct
> amdgpu_device *adev)
>
>                 amdgpu_ring_commit(ring);
>         }
> +
> +       gfx_v10_0_wait_for_idle(adev);
> [kevin]:
> Do you forgot to check return value here?  If you want to ignore the return
> result, you'd better put some comments here.
> Thanks.
>
> Best Regards,
> Kevin

It is better to add check, changed in v2.
Thanks.

>
> +       adev->csib_initialized = true;
> +
> +       return 0;
> +};
> +
> +static int gfx_v10_0_cp_gfx_start(struct amdgpu_device *adev) {
> +       /* init the CP */
> +       WREG32_SOC15(GC, 0, mmCP_MAX_CONTEXT,
> +                    adev->gfx.config.max_hw_contexts - 1);
> +       WREG32_SOC15(GC, 0, mmCP_DEVICE_ID, 1);
> +
> +       gfx_v10_0_cp_gfx_enable(adev, true);
> +
> +       if (!adev->csib_initialized)
> +               gfx_v10_csib_submit(adev);
> +
>         return 0;
>  }
>
> --
> 2.34.1
>