RE: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

"Quan, Evan" <Evan.Quan@xxxxxxx> · Tue, 27 Jun 2023 03:32:54 +0000



[AMD Official Use Only - General]

> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> Sent: Monday, June 26, 2023 7:54 PM
> To: Quan, Evan <Evan.Quan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>
> Subject: Re: [PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to
> temperature momentary fluctuation
>
>
>
> On 6/26/2023 1:17 PM, Evan Quan wrote:
> > An intentional delay is added on soft ctf triggered. Then there will
> > be a double check for the GPU temperature before taking further
> > action. This can avoid unintended shutdown due to temperature
> > momentary fluctuation.
> >
> > Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
> >   .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48
> +++++++++++++++++++
> >   .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ++++-------
> >   drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
> >   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 34
> +++++++++++++
> >   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
> >   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c    |  9 +---
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  9 +---
> >   8 files changed, 102 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index e459381dc759..5ef1f31e703c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
> >   #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
> >   #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
> >
> > +/* Extra time delay(in ms) to eliminate the influence of temperature
> momentary fluctuation */
> > +#define AMDGPU_SWCTF_EXTRA_DELAY           50
>
> I think a delay of 10-15ms is good enough to filter out any spike.
50ms is required by our CE team for supporting the customer. It is also aligned with Windows side.
Considering we cannot guard that(10-15ms is good), I think it's better to stick to the 50ms recommended setting.
How do you think?

Evan
>
> With that change, the series is
>       Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx>
>
> Thanks,
> Lijo
>
> > +
> >   struct amdgpu_xcp_mgr;
> >   struct amdgpu_device;
> >   struct amdgpu_irq_src;
> > diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > index 11b7b4cffaae..ff360c699171 100644
> > --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> > @@ -26,6 +26,7 @@
> >   #include <linux/gfp.h>
> >   #include <linux/slab.h>
> >   #include <linux/firmware.h>
> > +#include <linux/reboot.h>
> >   #include "amd_shared.h"
> >   #include "amd_powerplay.h"
> >   #include "power_state.h"
> > @@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
> >     return 0;
> >   }
> >
> > +static void pp_swctf_delayed_work_handler(struct work_struct *work) {
> > +   struct pp_hwmgr *hwmgr =
> > +           container_of(work, struct pp_hwmgr,
> swctf_delayed_work.work);
> > +   struct amdgpu_device *adev = hwmgr->adev;
> > +   struct amdgpu_dpm_thermal *range =
> > +                           &adev->pm.dpm.thermal;
> > +   uint32_t gpu_temperature, size;
> > +   int ret;
> > +
> > +   /*
> > +    * If the hotspot/edge temperature is confirmed as below SW CTF
> setting point
> > +    * after the delay enforced, nothing will be done.
> > +    * Otherwise, a graceful shutdown will be performed to prevent
> further damage.
> > +    */
> > +   if (range->sw_ctf_threshold &&
> > +       hwmgr->hwmgr_func->read_sensor) {
> > +           ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> > +
> AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> > +                                                &gpu_temperature,
> > +                                                &size);
> > +           /*
> > +            * For some legacy ASICs, hotspot temperature retrieving
> might be not
> > +            * supported. Check the edge temperature instead then.
> > +            */
> > +           if (ret == -EOPNOTSUPP)
> > +                   ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> > +
> AMDGPU_PP_SENSOR_EDGE_TEMP,
> > +
> &gpu_temperature,
> > +                                                        &size);
> > +           if (!ret && gpu_temperature / 1000 < range-
> >sw_ctf_threshold)
> > +                   return;
> > +   }
> > +
> > +   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW
> CTF) detected!\n");
> > +   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to
> GPU SW CTF!\n");
> > +   orderly_poweroff(true);
> > +}
> > +
> >   static int pp_sw_init(void *handle)
> >   {
> >     struct amdgpu_device *adev = handle; @@ -101,6 +141,10 @@ static
> > int pp_sw_init(void *handle)
> >
> >     pr_debug("powerplay sw init %s\n", ret ? "failed" :
> > "successfully");
> >
> > +   if (!ret)
> > +           INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
> > +                             pp_swctf_delayed_work_handler);
> > +
> >     return ret;
> >   }
> >
> > @@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
> >     struct amdgpu_device *adev = handle;
> >     struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
> >
> > +   cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
> > +
> >     hwmgr_hw_fini(hwmgr);
> >
> >     return 0;
> > @@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
> >     struct amdgpu_device *adev = handle;
> >     struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
> >
> > +   cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
> > +
> >     return hwmgr_suspend(hwmgr);
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> > b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> > index bfe80ac0ad8c..d0b1ab6c4523 100644
> > --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> > +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> > @@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device
> *adev,
> >                        struct amdgpu_irq_src *source,
> >                        struct amdgpu_iv_entry *entry)
> >   {
> > +   struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
> >     uint32_t client_id = entry->client_id;
> >     uint32_t src_id = entry->src_id;
> >
> >     if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
> >             if (src_id ==
> VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
> > -                   dev_emerg(adev->dev, "ERROR: GPU over
> temperature range(SW CTF) detected!\n");
> > -                   /*
> > -                    * SW CTF just occurred.
> > -                    * Try to do a graceful shutdown to prevent further
> damage.
> > -                    */
> > -                   dev_emerg(adev->dev, "ERROR: System is going to
> shutdown due to GPU SW CTF!\n");
> > -                   orderly_poweroff(true);
> > -           } else if (src_id ==
> VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
> > +                   schedule_delayed_work(&hwmgr-
> >swctf_delayed_work,
> > +
> msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> > +           } else if (src_id ==
> > +VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
> >                     dev_emerg(adev->dev, "ERROR: GPU under
> temperature range detected!\n");
> > -           else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
> > +           } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
> >                     dev_emerg(adev->dev, "ERROR: GPU HW Critical
> Temperature Fault(aka CTF) detected!\n");
> >                     /*
> >                      * HW CTF just occurred. Shutdown to prevent further
> damage.
> > @@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device
> *adev,
> >                     orderly_poweroff(true);
> >             }
> >     } else if (client_id == SOC15_IH_CLIENTID_THM) {
> > -           if (src_id == 0) {
> > -                   dev_emerg(adev->dev, "ERROR: GPU over
> temperature range(SW CTF) detected!\n");
> > -                   /*
> > -                    * SW CTF just occurred.
> > -                    * Try to do a graceful shutdown to prevent further
> damage.
> > -                    */
> > -                   dev_emerg(adev->dev, "ERROR: System is going to
> shutdown due to GPU SW CTF!\n");
> > -                   orderly_poweroff(true);
> > -           } else
> > +           if (src_id == 0)
> > +                   schedule_delayed_work(&hwmgr-
> >swctf_delayed_work,
> > +
> msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> > +           else
> >                     dev_emerg(adev->dev, "ERROR: GPU under
> temperature range detected!\n");
> >     } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
> >             dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature
> Fault(aka
> > CTF) detected!\n"); diff --git
> > a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> > b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> > index f1580a26a850..612d66aeaab9 100644
> > --- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> > +++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> > @@ -811,6 +811,8 @@ struct pp_hwmgr {
> >     bool gfxoff_state_changed_by_workload;
> >     uint32_t pstate_sclk_peak;
> >     uint32_t pstate_mclk_peak;
> > +
> > +   struct delayed_work swctf_delayed_work;
> >   };
> >
> >   int hwmgr_early_init(struct pp_hwmgr *hwmgr); diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > index 2aecb8faf407..4062b9b46986 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > @@ -24,6 +24,7 @@
> >
> >   #include <linux/firmware.h>
> >   #include <linux/pci.h>
> > +#include <linux/reboot.h>
> >
> >   #include "amdgpu.h"
> >   #include "amdgpu_smu.h"
> > @@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct
> work_struct *work)
> >             smu->ppt_funcs->interrupt_work(smu);
> >   }
> >
> > +static void smu_swctf_delayed_work_handler(struct work_struct *work)
> > +{
> > +   struct smu_context *smu =
> > +           container_of(work, struct smu_context,
> swctf_delayed_work.work);
> > +   struct smu_temperature_range *range =
> > +                           &smu->thermal_range;
> > +   struct amdgpu_device *adev = smu->adev;
> > +   uint32_t hotspot_tmp, size;
> > +
> > +   /*
> > +    * If the hotspot temperature is confirmed as below SW CTF setting
> point
> > +    * after the delay enforced, nothing will be done.
> > +    * Otherwise, a graceful shutdown will be performed to prevent
> further damage.
> > +    */
> > +   if (range->software_shutdown_temp &&
> > +       smu->ppt_funcs->read_sensor &&
> > +       !smu->ppt_funcs->read_sensor(smu,
> > +
> AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> > +                                    &hotspot_tmp,
> > +                                    &size) &&
> > +       hotspot_tmp / 1000 < range->software_shutdown_temp)
> > +           return;
> > +
> > +   dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW
> CTF) detected!\n");
> > +   dev_emerg(adev->dev, "ERROR: System is going to shutdown due to
> GPU SW CTF!\n");
> > +   orderly_poweroff(true);
> > +}
> > +
> >   static int smu_sw_init(void *handle)
> >   {
> >     struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@
> > -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
> >     smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
> >     smu->smu_dpm.requested_dpm_level =
> AMD_DPM_FORCED_LEVEL_AUTO;
> >
> > +   INIT_DELAYED_WORK(&smu->swctf_delayed_work,
> > +                     smu_swctf_delayed_work_handler);
> > +
> >     ret = smu_smc_table_sw_init(smu);
> >     if (ret) {
> >             dev_err(adev->dev, "Failed to sw init smc table!\n"); @@ -
> 1600,6
> > +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
> >             return ret;
> >     }
> >
> > +   cancel_delayed_work_sync(&smu->swctf_delayed_work);
> > +
> >     ret = smu_disable_dpms(smu);
> >     if (ret) {
> >             dev_err(adev->dev, "Fail to disable dpm features!\n"); diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > index 4ce394903c07..18101ec0ae6e 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> > @@ -573,6 +573,8 @@ struct smu_context
> >     u32 debug_param_reg;
> >     u32 debug_msg_reg;
> >     u32 debug_resp_reg;
> > +
> > +   struct delayed_work             swctf_delayed_work;
> >   };
> >
> >   struct i2c_adapter;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> > index e1ef88ee1ed3..aa4a5498a12f 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> > @@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct
> amdgpu_device *adev,
> >     if (client_id == SOC15_IH_CLIENTID_THM) {
> >             switch (src_id) {
> >             case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> > -                   dev_emerg(adev->dev, "ERROR: GPU over
> temperature range(SW CTF) detected!\n");
> > -                   /*
> > -                    * SW CTF just occurred.
> > -                    * Try to do a graceful shutdown to prevent further
> damage.
> > -                    */
> > -                   dev_emerg(adev->dev, "ERROR: System is going to
> shutdown due to GPU SW CTF!\n");
> > -                   orderly_poweroff(true);
> > +                   schedule_delayed_work(&smu->swctf_delayed_work,
> > +
> msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> >             break;
> >             case THM_11_0__SRCID__THM_DIG_THERM_H2L:
> >                     dev_emerg(adev->dev, "ERROR: GPU under
> temperature range
> > detected\n"); diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index 9e8414037638..4071bfa38727 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct
> amdgpu_device *adev,
> >     if (client_id == SOC15_IH_CLIENTID_THM) {
> >             switch (src_id) {
> >             case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> > -                   dev_emerg(adev->dev, "ERROR: GPU over
> temperature range(SW CTF) detected!\n");
> > -                   /*
> > -                    * SW CTF just occurred.
> > -                    * Try to do a graceful shutdown to prevent further
> damage.
> > -                    */
> > -                   dev_emerg(adev->dev, "ERROR: System is going to
> shutdown due to GPU SW CTF!\n");
> > -                   orderly_poweroff(true);
> > +                   schedule_delayed_work(&smu->swctf_delayed_work,
> > +
> msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> >                     break;
> >             case THM_11_0__SRCID__THM_DIG_THERM_H2L:
> >                     dev_emerg(adev->dev, "ERROR: GPU under
> temperature range
> > detected\n");