On Mon, May 15, 2023 at 10:52 PM Evan Quan <evan.quan@xxxxxxx> wrote: > > There will be a double check for the hotspot temperature on delay > expired. This can avoid unintended shutdown due to hotspot temperature > spark. > > Signed-off-by: Evan Quan <evan.quan@xxxxxxx> > -- > v1->v2: > - add the proper millidegree Celsius to degree Celsius transform > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 14 ++++++++ > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +++++++++++++++++++ > drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 ++ > .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 9 ++--- > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 9 ++--- Can you extend this to the older powerplay code as well? > 6 files changed, 55 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 39192eba3ff8..4cd873659365 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -243,6 +243,7 @@ extern int amdgpu_num_kcq; > #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024) > extern int amdgpu_vcnfw_log; > extern int amdgpu_sg_display; > +extern uint amdgpu_ctf_delay; > > #define AMDGPU_VM_MAX_NUM_CTX 4096 > #define AMDGPU_SG_THRESHOLD (256*1024*1024) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 749eeb9a2976..6c699fefdf92 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -198,6 +198,7 @@ int amdgpu_smartshift_bias; > int amdgpu_use_xgmi_p2p = 1; > int amdgpu_vcnfw_log; > int amdgpu_sg_display = -1; /* auto */ > +uint amdgpu_ctf_delay = 50; /* in ms */ > > static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work); > > @@ -973,6 +974,19 @@ MODULE_PARM_DESC(smu_pptable_id, > "specify pptable id to be used (-1 = auto(default) value, 0 = use pptable from vbios, > 0 = soft pptable id)"); > 
module_param_named(smu_pptable_id, amdgpu_smu_pptable_id, int, 0444); > > +/** > + * DOC: ctf_delay (uint) > + * On SW CTF triggered, to protect the chip from over-heated and possible damage, we usually > + * trigger a system shutdown. However, considering there may be a hotspot temperature spark > + * momentarily hitting the SW CTF setting point, a delay is added to avoid unintended shutdown. > + * On the delay expired, the shutdown will be performed if the hotspot temp is still > + * bigger than the SW CTF setting. Otherwise, nothing will be done. > + * The default setting for the delay is 50ms. > + */ > +MODULE_PARM_DESC(ctf_delay, > + "the delay(default 50ms) enforced before real action taken on ctf triggerred"); > +module_param_named(ctf_delay, amdgpu_ctf_delay, uint, 0444); I think we can probably drop this. I don't see a need for users to adjust this, and it could be bad for the hardware if the delay is set too long. > + > /* These devices are not supported by amdgpu. > * They are supported by the mach64, r128, radeon drivers > */ > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > index 3c860939031e..71153b335ad9 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > @@ -24,6 +24,7 @@ > > #include <linux/firmware.h> > #include <linux/pci.h> > +#include <linux/reboot.h> > > #include "amdgpu.h" > #include "amdgpu_smu.h" > @@ -1070,6 +1071,34 @@ static void smu_interrupt_work_fn(struct work_struct *work) > smu->ppt_funcs->interrupt_work(smu); > } > > +static void smu_swctf_delayed_work_handler(struct work_struct *work) > +{ > + struct smu_context *smu = > + container_of(work, struct smu_context, swctf_delayed_work.work); > + struct smu_temperature_range *range = > + &smu->thermal_range; > + struct amdgpu_device *adev = smu->adev; > + uint32_t hotspot_tmp, size; > + > + /* > + * If the hotspot temperature is confirmed as below SW CTF setting point > + * after the 
delay enforced, nothing will be done. > + * Otherwise, a graceful shutdown will be performed to prevent further damage. > + */ > + if (smu->ppt_funcs->read_sensor && > + !smu->ppt_funcs->read_sensor(smu, > + AMDGPU_PP_SENSOR_HOTSPOT_TEMP, > + &hotspot_tmp, > + &size) && > + range->software_shutdown_temp && > + hotspot_tmp / 1000 < range->software_shutdown_temp) > + return; > + > + dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); > + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); > + orderly_poweroff(true); > +} > + > static int smu_sw_init(void *handle) > { > struct amdgpu_device *adev = (struct amdgpu_device *)handle; > @@ -1358,6 +1387,9 @@ static int smu_smc_hw_setup(struct smu_context *smu) > return ret; > } > > + INIT_DELAYED_WORK(&smu->swctf_delayed_work, > + smu_swctf_delayed_work_handler); > + > ret = smu_enable_thermal_alert(smu); > if (ret) { > dev_err(adev->dev, "Failed to enable thermal alert!\n"); > @@ -1592,6 +1624,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) > return ret; > } > > + cancel_delayed_work_sync(&smu->swctf_delayed_work); > + > ret = smu_disable_dpms(smu); > if (ret) { > dev_err(adev->dev, "Fail to disable dpm features!\n"); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > index 4ce394903c07..18101ec0ae6e 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h > @@ -573,6 +573,8 @@ struct smu_context > u32 debug_param_reg; > u32 debug_msg_reg; > u32 debug_resp_reg; > + > + struct delayed_work swctf_delayed_work; > }; > > struct i2c_adapter; > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c > index e1ef88ee1ed3..4c3c682bf7a0 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c > @@ -1412,13 +1412,8 @@ static int 
smu_v11_0_irq_process(struct amdgpu_device *adev, > if (client_id == SOC15_IH_CLIENTID_THM) { > switch (src_id) { > case THM_11_0__SRCID__THM_DIG_THERM_L2H: > - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); > - /* > - * SW CTF just occurred. > - * Try to do a graceful shutdown to prevent further damage. > - */ > - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); > - orderly_poweroff(true); > + schedule_delayed_work(&smu->swctf_delayed_work, > + msecs_to_jiffies(amdgpu_ctf_delay)); > break; > case THM_11_0__SRCID__THM_DIG_THERM_H2L: > dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > index 0bc0a6e97b5a..a5447119d5f5 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c > @@ -1377,13 +1377,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, > if (client_id == SOC15_IH_CLIENTID_THM) { > switch (src_id) { > case THM_11_0__SRCID__THM_DIG_THERM_L2H: > - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); > - /* > - * SW CTF just occurred. > - * Try to do a graceful shutdown to prevent further damage. > - */ > - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); > - orderly_poweroff(true); > + schedule_delayed_work(&smu->swctf_delayed_work, > + msecs_to_jiffies(amdgpu_ctf_delay)); > break; > case THM_11_0__SRCID__THM_DIG_THERM_H2L: > dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); > -- > 2.34.1 >