Hi Kevin,
smu11_thermal_policy provides the default thermal policy if the ASIC does not provide its own(e.g. Navi10). It’s not ASIC specific.
Btw I think we should have enough information to provide Navi10’s own settings(for temp2/3) in navi10_get_thermal_temperature_range().
Can you help to add them?
Please refer to the following descriptions for those thermal interfaces.
* - temp[1-3]_input: the on die GPU temperature in millidegrees Celsius
* - temp2_input and temp3_input are supported on SOC15 dGPUs only
*
* - temp[1-3]_label: temperature channel label
* - temp2_label and temp3_label are supported on SOC15 dGPUs only
*
* - temp[1-3]_crit: temperature critical max value in millidegrees Celsius
* - temp2_crit and temp3_crit are supported on SOC15 dGPUs only
*
* - temp[1-3]_crit_hyst: temperature hysteresis for critical limit in millidegrees Celsius
* - temp2_crit_hyst and temp3_crit_hyst are supported on SOC15 dGPUs only
*
* - temp[1-3]_emergency: temperature emergency max value(asic shutdown) in millidegrees Celsius
* - these are supported on SOC15 dGPUs only
Regards,
Evan
From: Wang, Kevin(Yang) <Kevin1.Wang@xxxxxxx>
Sent: Tuesday, August 20, 2019 1:44 PM
To: Feng, Kenneth <Kenneth.Feng@xxxxxxx>; Quan, Evan <Evan.Quan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: Re: [PATCH] drm/amd/powerplay: correct SW smu11 thermal range settings
each asic maybe has different thermal policy, you can custom this value in asic file <arcturus_ppt.c> .
and your patch define a new array in smu_v11_0.h header file.
it's never done that before, and the code looks is not clearly.
Reviewed-by: Kenneth Feng <kenneth.feng@xxxxxxx>
Ping..
Yes, the lowest settings for thermal controller is 0.
Regards
Evan
Hi Evan,
So due to the below code, we don’t get a chance to set -273.15, right?
+ low = max(SMU_THERMAL_MINIMUM_ALERT_TEMP,
+ range.min / SMU_TEMPERATURE_UNITS_PER_CENTIGRADES);
+ high = min(SMU_THERMAL_MAXIMUM_ALERT_TEMP,
+ range.max / SMU_TEMPERATURE_UNITS_PER_CENTIGRADES);
[CAUTION: External Email]
Comment inline
the temperature min value should be 0, not -273 on smu11.
you can refrence window driver code or register spec.
output_ptr->operating_temperature_min_Limit = 0;
output_ptr->operating_temperature_max_Limit = ppt_info->software_shutdown_temp;
[Quan, Evan] There was a discussion over the min value(0 or -273.15) and we decided to use the later considering the OD case.
All the existing and coming ASICs should follow this design.
and in smu11, the thermal control has a 8bit register to set min and max value, and the unit is temperature.
[Quan, Evan] That is still honored, no violation here.
so there is something wrong with this patch.
Problems with current settings:
1. The min value was overrided to 0 on Vega20 & Navi10. While
the expected should be -273.15 C.
2. The thermal min/max threshold was output in wrong unit on
Navi10 & Arcturus. As TEMP_RANGE_MIN/MAX is already in
millicelsius. And "*1000" in smu_v11_0_start_thermal_control
makes the output wrongly.
Change-Id: I2f1866edd1baf264f521310343f492eaede26c33
Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
---
drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 10 ----
drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h | 6 +++
drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 5 +-
drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 51 +++++++------------
drivers/gpu/drm/amd/powerplay/vega20_ppt.c | 20 +++++---
5 files changed, 38 insertions(+), 54 deletions(-)
diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index 4060607fbb35..1a1f64a9e1e0 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -880,23 +880,14 @@ static int arcturus_force_clk_levels(struct smu_context *smu,
return ret;
}
-static const struct smu_temperature_range arcturus_thermal_policy[] =
-{
- {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000},
- { 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000},
-};
-
static int arcturus_get_thermal_temperature_range(struct smu_context *smu,
struct smu_temperature_range *range)
{
-
PPTable_t *pptable = smu->smu_table.driver_pptable;
if (!range)
return -EINVAL;
- memcpy(range, &arcturus_thermal_policy[0], sizeof(struct smu_temperature_range));
-
range->max = pptable->TedgeLimit *
SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
range->edge_emergency_max = (pptable->TedgeLimit + CTF_OFFSET_EDGE) *
@@ -910,7 +901,6 @@ static int arcturus_get_thermal_temperature_range(struct smu_context *smu,
range->mem_emergency_max = (pptable->TmemLimit + CTF_OFFSET_HBM)*
SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
index 0a22fa48ff5a..59b2045e37e4 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
@@ -64,6 +64,12 @@
#define WORKLOAD_MAP(profile, workload) \
[profile] = {1, (workload)}
+static const struct smu_temperature_range smu11_thermal_policy[] =
+{
+ {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000},
+ { 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000},
+};
+
struct smu_11_0_cmn2aisc_mapping {
int valid_mapping;
int map_to;
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index d7d4186b762f..e804d18f61d0 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -1505,9 +1505,8 @@ static int navi10_get_thermal_temperature_range(struct smu_context *smu,
if (!range || !powerplay_table)
return -EINVAL;
- /* The unit is temperature */
- range->min = 0;
- range->max = powerplay_table->software_shutdown_temp;
+ range->max = powerplay_table->software_shutdown_temp *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
return 0;
}
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index df7b65360ac7..5f5fd3a88e48 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1125,23 +1125,17 @@ static int smu_v11_0_get_current_clk_freq(struct smu_context *smu,
}
static int smu_v11_0_set_thermal_range(struct smu_context *smu,
- struct smu_temperature_range *range)
+ struct smu_temperature_range range)
{
struct amdgpu_device *adev = smu->adev;
int low = SMU_THERMAL_MINIMUM_ALERT_TEMP;
int high = SMU_THERMAL_MAXIMUM_ALERT_TEMP;
uint32_t val;
- if (!range)
- return -EINVAL;
-
- if (low < range->min)
- low = range->min;
- if (high > range->max)
- high = range->max;
-
- low = max(SMU_THERMAL_MINIMUM_ALERT_TEMP, range->min);
- high = min(SMU_THERMAL_MAXIMUM_ALERT_TEMP, range->max);
+ low = max(SMU_THERMAL_MINIMUM_ALERT_TEMP,
+ range.min / SMU_TEMPERATURE_UNITS_PER_CENTIGRADES);
+ high = min(SMU_THERMAL_MAXIMUM_ALERT_TEMP,
+ range.max / SMU_TEMPERATURE_UNITS_PER_CENTIGRADES);
if (low > high)
return -EINVAL;
@@ -1177,27 +1171,20 @@ static int smu_v11_0_enable_thermal_alert(struct smu_context *smu)
static int smu_v11_0_start_thermal_control(struct smu_context *smu)
{
int ret = 0;
- struct smu_temperature_range range = {
- TEMP_RANGE_MIN,
- TEMP_RANGE_MAX,
- TEMP_RANGE_MAX,
- TEMP_RANGE_MIN,
- TEMP_RANGE_MAX,
- TEMP_RANGE_MAX,
- TEMP_RANGE_MIN,
- TEMP_RANGE_MAX,
- TEMP_RANGE_MAX};
+ struct smu_temperature_range range;
struct amdgpu_device *adev = smu->adev;
if (!smu->pm_enabled)
return ret;
+ memcpy(&range, &smu11_thermal_policy[0], sizeof(struct smu_temperature_range));
+
ret = smu_get_thermal_temperature_range(smu, &range);
if (ret)
return ret;
if (smu->smu_table.thermal_controller_type) {
- ret = smu_v11_0_set_thermal_range(smu, &range);
+ ret = smu_v11_0_set_thermal_range(smu, range);
if (ret)
return ret;
@@ -1210,17 +1197,15 @@ static int smu_v11_0_start_thermal_control(struct smu_context *smu)
return ret;
}
- adev->pm.dpm.thermal.min_temp = range.min * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_temp = range.max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_edge_emergency_temp = range.edge_emergency_max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.min_hotspot_temp = range.hotspot_min * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_hotspot_crit_temp = range.hotspot_crit_max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_hotspot_emergency_temp = range.hotspot_emergency_max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.min_mem_temp = range.mem_min * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_mem_crit_temp = range.mem_crit_max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_mem_emergency_temp = range.mem_emergency_max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.min_temp = range.min * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
- adev->pm.dpm.thermal.max_temp = range.max * SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ adev->pm.dpm.thermal.min_temp = range.min;
+ adev->pm.dpm.thermal.max_temp = range.max;
+ adev->pm.dpm.thermal.max_edge_emergency_temp = range.edge_emergency_max;
+ adev->pm.dpm.thermal.min_hotspot_temp = range.hotspot_min;
+ adev->pm.dpm.thermal.max_hotspot_crit_temp = range.hotspot_crit_max;
+ adev->pm.dpm.thermal.max_hotspot_emergency_temp = range.hotspot_emergency_max;
+ adev->pm.dpm.thermal.min_mem_temp = range.mem_min;
+ adev->pm.dpm.thermal.max_mem_crit_temp = range.mem_crit_max;
+ adev->pm.dpm.thermal.max_mem_emergency_temp = range.mem_emergency_max;
return ret;
}
diff --git a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
index acf075393c13..e14363182691 100644
--- a/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/vega20_ppt.c
@@ -3113,14 +3113,18 @@ static int vega20_get_thermal_temperature_range(struct smu_context *smu,
if (!range || !powerplay_table)
return -EINVAL;
- /* The unit is temperature */
- range->min = 0;
- range->max = powerplay_table->usSoftwareShutdownTemp;
- range->edge_emergency_max = (pptable->TedgeLimit + CTF_OFFSET_EDGE);
- range->hotspot_crit_max = pptable->ThotspotLimit;
- range->hotspot_emergency_max = (pptable->ThotspotLimit + CTF_OFFSET_HOTSPOT);
- range->mem_crit_max = pptable->ThbmLimit;
- range->mem_emergency_max = (pptable->ThbmLimit + CTF_OFFSET_HBM);
+ range->max = powerplay_table->usSoftwareShutdownTemp *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ range->edge_emergency_max = (pptable->TedgeLimit + CTF_OFFSET_EDGE) *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ range->hotspot_crit_max = pptable->ThotspotLimit *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ range->hotspot_emergency_max = (pptable->ThotspotLimit + CTF_OFFSET_HOTSPOT) *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ range->mem_crit_max = pptable->ThbmLimit *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
+ range->mem_emergency_max = (pptable->ThbmLimit + CTF_OFFSET_HBM) *
+ SMU_TEMPERATURE_UNITS_PER_CENTIGRADES;
return 0;
--
2.22.0
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
|