Re: [PATCH] drm/amd/pm: add delay to avoid unintended shutdown due to hotspot temperature spark

Alex Deucher <alexdeucher@xxxxxxxxx> · Tue, 16 May 2023 09:07:56 -0400

On Mon, May 15, 2023 at 10:52 PM Evan Quan <evan.quan@xxxxxxx> wrote:
>
> The hotspot temperature is checked a second time once the delay has
> expired. This avoids an unintended shutdown caused by a momentary
> hotspot temperature spike.
>
> Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
> --
> v1->v2:
>   - add the proper millidegree Celsius to degree Celsius transform
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 14 ++++++++
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 34 +++++++++++++++++++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 ++
>  .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c    |  9 ++---
>  .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  9 ++---

Can you extend this to the older powerplay code as well?

>  6 files changed, 55 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 39192eba3ff8..4cd873659365 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -243,6 +243,7 @@ extern int amdgpu_num_kcq;
>  #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024)
>  extern int amdgpu_vcnfw_log;
>  extern int amdgpu_sg_display;
> +extern uint amdgpu_ctf_delay;
>
>  #define AMDGPU_VM_MAX_NUM_CTX                  4096
>  #define AMDGPU_SG_THRESHOLD                    (256*1024*1024)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 749eeb9a2976..6c699fefdf92 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -198,6 +198,7 @@ int amdgpu_smartshift_bias;
>  int amdgpu_use_xgmi_p2p = 1;
>  int amdgpu_vcnfw_log;
>  int amdgpu_sg_display = -1; /* auto */
> +uint amdgpu_ctf_delay = 50; /* in ms */
>
>  static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
>
> @@ -973,6 +974,19 @@ MODULE_PARM_DESC(smu_pptable_id,
>         "specify pptable id to be used (-1 = auto(default) value, 0 = use pptable from vbios, > 0 = soft pptable id)");
>  module_param_named(smu_pptable_id, amdgpu_smu_pptable_id, int, 0444);
>
> +/**
> + * DOC: ctf_delay (uint)
> + * When SW CTF is triggered, we usually trigger a system shutdown to protect the chip
> + * from overheating and possible damage. However, since a hotspot temperature spike may
> + * momentarily hit the SW CTF setting point, a delay is added to avoid an unintended shutdown.
> + * When the delay expires, the shutdown is performed unless the hotspot temperature has
> + * dropped below the SW CTF setting. Otherwise, nothing is done.
> + * The default setting for the delay is 50ms.
> + */
> +MODULE_PARM_DESC(ctf_delay,
> +               "the delay(default 50ms) enforced before real action taken on ctf triggerred");
> +module_param_named(ctf_delay, amdgpu_ctf_delay, uint, 0444);

I think we can probably drop this.  I don't see a need for users to
adjust this and it could be bad for the hardware if it gets set too
long.

> +
>  /* These devices are not supported by amdgpu.
>   * They are supported by the mach64, r128, radeon drivers
>   */
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 3c860939031e..71153b335ad9 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -24,6 +24,7 @@
>
>  #include <linux/firmware.h>
>  #include <linux/pci.h>
> +#include <linux/reboot.h>
>
>  #include "amdgpu.h"
>  #include "amdgpu_smu.h"
> @@ -1070,6 +1071,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
>                 smu->ppt_funcs->interrupt_work(smu);
>  }
>
> +static void smu_swctf_delayed_work_handler(struct work_struct *work)
> +{
> +       struct smu_context *smu =
> +               container_of(work, struct smu_context, swctf_delayed_work.work);
> +       struct smu_temperature_range *range =
> +                               &smu->thermal_range;
> +       struct amdgpu_device *adev = smu->adev;
> +       uint32_t hotspot_tmp, size;
> +
> +       /*
> +        * If, after the enforced delay, the hotspot temperature is confirmed to be
> +        * below the SW CTF setting point, nothing is done.
> +        * Otherwise, a graceful shutdown is performed to prevent further damage.
> +        */
> +       if (smu->ppt_funcs->read_sensor &&
> +           !smu->ppt_funcs->read_sensor(smu,
> +                                        AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> +                                        &hotspot_tmp,
> +                                        &size) &&
> +           range->software_shutdown_temp &&
> +           hotspot_tmp / 1000 < range->software_shutdown_temp)
> +               return;
> +
> +       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> +       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> +       orderly_poweroff(true);
> +}
> +
>  static int smu_sw_init(void *handle)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -1358,6 +1387,9 @@ static int smu_smc_hw_setup(struct smu_context *smu)
>                 return ret;
>         }
>
> +       INIT_DELAYED_WORK(&smu->swctf_delayed_work,
> +                         smu_swctf_delayed_work_handler);
> +
>         ret = smu_enable_thermal_alert(smu);
>         if (ret) {
>           dev_err(adev->dev, "Failed to enable thermal alert!\n");
> @@ -1592,6 +1624,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
>                 return ret;
>         }
>
> +       cancel_delayed_work_sync(&smu->swctf_delayed_work);
> +
>         ret = smu_disable_dpms(smu);
>         if (ret) {
>                 dev_err(adev->dev, "Fail to disable dpm features!\n");
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index 4ce394903c07..18101ec0ae6e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -573,6 +573,8 @@ struct smu_context
>         u32 debug_param_reg;
>         u32 debug_msg_reg;
>         u32 debug_resp_reg;
> +
> +       struct delayed_work             swctf_delayed_work;
>  };
>
>  struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> index e1ef88ee1ed3..4c3c682bf7a0 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> @@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>         if (client_id == SOC15_IH_CLIENTID_THM) {
>                 switch (src_id) {
>                 case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> -                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -                       /*
> -                        * SW CTF just occurred.
> -                        * Try to do a graceful shutdown to prevent further damage.
> -                        */
> -                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -                       orderly_poweroff(true);
> +                       schedule_delayed_work(&smu->swctf_delayed_work,
> +                                             msecs_to_jiffies(amdgpu_ctf_delay));
>                 break;
>                 case THM_11_0__SRCID__THM_DIG_THERM_H2L:
>                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 0bc0a6e97b5a..a5447119d5f5 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -1377,13 +1377,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
>         if (client_id == SOC15_IH_CLIENTID_THM) {
>                 switch (src_id) {
>                 case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> -                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -                       /*
> -                        * SW CTF just occurred.
> -                        * Try to do a graceful shutdown to prevent further damage.
> -                        */
> -                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -                       orderly_poweroff(true);
> +                       schedule_delayed_work(&smu->swctf_delayed_work,
> +                                             msecs_to_jiffies(amdgpu_ctf_delay));
>                         break;
>                 case THM_11_0__SRCID__THM_DIG_THERM_H2L:
>                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
> --
> 2.34.1
>