-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Sent: Monday, February 14, 2022 12:04 PM
To: Quan, Evan <Evan.Quan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>;
rui.huang@xxxxxxx
Subject: Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu
reset APIs
On 2/11/2022 1:22 PM, Evan Quan wrote:
Those gpu reset APIs can be granted when:
- System is up and dpm features are enabled.
- System is under resuming and dpm features are not yet enabled.
Under such scenario, the PMFW is already alive and can support
those gpu reset functionalities.
Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
Change-Id: I8c2f07138921eb53a2bd7fb94f9b3622af0eacf8
---
.../gpu/drm/amd/include/kgd_pp_interface.h | 1 +
drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 34 +++++++++++++++
.../gpu/drm/amd/pm/powerplay/amd_powerplay.c | 42
+++++++++++++++----
.../drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 1 +
.../drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c | 17 ++++++++
drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h | 1 +
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 32 +++++++-------
7 files changed, 101 insertions(+), 27 deletions(-)
diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index a4c267f15959..892648a4a353 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -409,6 +409,7 @@ struct amd_pm_funcs {
struct dpm_clocks *clock_table);
int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t
*size);
void (*pm_compute_clocks)(void *handle);
+ bool (*is_smc_alive)(void *handle);
};
struct metrics_table_header {
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index b46ae0063047..5f1d3342f87b 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -120,12 +120,25 @@ int
amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev,
uint32_t block
return ret;
}
+static bool amdgpu_dpm_is_smc_alive(struct amdgpu_device *adev) {
+ const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+ if (!pp_funcs || !pp_funcs->is_smc_alive)
+ return false;
+
+ return pp_funcs->is_smc_alive;
+}
+
int amdgpu_dpm_baco_enter(struct amdgpu_device *adev)
{
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
void *pp_handle = adev->powerplay.pp_handle;
int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (!pp_funcs || !pp_funcs->set_asic_baco_state)
return -ENOENT;
@@ -145,6 +158,9 @@ int amdgpu_dpm_baco_exit(struct amdgpu_device
*adev)
void *pp_handle = adev->powerplay.pp_handle;
int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (!pp_funcs || !pp_funcs->set_asic_baco_state)
return -ENOENT;
@@ -164,6 +180,9 @@ int amdgpu_dpm_set_mp1_state(struct
amdgpu_device *adev,
int ret = 0;
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (pp_funcs && pp_funcs->set_mp1_state) {
mutex_lock(&adev->pm.mutex);
@@ -184,6 +203,9 @@ bool amdgpu_dpm_is_baco_supported(struct
amdgpu_device *adev)
bool baco_cap;
int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return false;
+
if (!pp_funcs || !pp_funcs->get_asic_baco_capability)
return false;
@@ -203,6 +225,9 @@ int amdgpu_dpm_mode2_reset(struct
amdgpu_device *adev)
void *pp_handle = adev->powerplay.pp_handle;
int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (!pp_funcs || !pp_funcs->asic_reset_mode_2)
return -ENOENT;
@@ -221,6 +246,9 @@ int amdgpu_dpm_baco_reset(struct
amdgpu_device *adev)
void *pp_handle = adev->powerplay.pp_handle;
int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (!pp_funcs || !pp_funcs->set_asic_baco_state)
return -ENOENT;
@@ -244,6 +272,9 @@ bool
amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev)
struct smu_context *smu = adev->powerplay.pp_handle;
bool support_mode1_reset = false;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return false;
+
if (is_support_sw_smu(adev)) {
mutex_lock(&adev->pm.mutex);
support_mode1_reset =
smu_mode1_reset_is_support(smu); @@ -258,6
+289,9 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev)
struct smu_context *smu = adev->powerplay.pp_handle;
int ret = -EOPNOTSUPP;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+ return -EOPNOTSUPP;
+
if (is_support_sw_smu(adev)) {
mutex_lock(&adev->pm.mutex);
ret = smu_mode1_reset(smu);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index bba923cfe08c..4c709f7bcd51 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -844,9 +844,6 @@ static int pp_dpm_set_mp1_state(void *handle,
enum pp_mp1_state mp1_state)
if (!hwmgr)
return -EINVAL;
- if (!hwmgr->pm_en)
- return 0;
-
if (hwmgr->hwmgr_func->set_mp1_state)
return hwmgr->hwmgr_func->set_mp1_state(hwmgr,
mp1_state);
@@ -1305,8 +1302,7 @@ static int pp_get_asic_baco_capability(void
*handle, bool *cap)
if (!hwmgr)
return -EINVAL;
- if (!(hwmgr->not_vf && amdgpu_dpm) ||
- !hwmgr->hwmgr_func->get_asic_baco_capability)
+ if (!hwmgr->hwmgr_func->get_asic_baco_capability)
return 0;
hwmgr->hwmgr_func->get_asic_baco_capability(hwmgr, cap); @@ -
1321,7
+1317,7 @@ static int pp_get_asic_baco_state(void *handle, int *state)
if (!hwmgr)
return -EINVAL;
- if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+ if (!hwmgr->hwmgr_func->get_asic_baco_state)
return 0;
hwmgr->hwmgr_func->get_asic_baco_state(hwmgr, (enum
BACO_STATE
*)state); @@ -1336,8 +1332,7 @@ static int pp_set_asic_baco_state(void
*handle, int state)
if (!hwmgr)
return -EINVAL;
- if (!(hwmgr->not_vf && amdgpu_dpm) ||
- !hwmgr->hwmgr_func->set_asic_baco_state)
+ if (!hwmgr->hwmgr_func->set_asic_baco_state)
return 0;
hwmgr->hwmgr_func->set_asic_baco_state(hwmgr, (enum
BACO_STATE)state); @@ -1379,7 +1374,7 @@ static int
pp_asic_reset_mode_2(void *handle)
{
struct pp_hwmgr *hwmgr = handle;
- if (!hwmgr || !hwmgr->pm_en)
+ if (!hwmgr)
return -EINVAL;
if (hwmgr->hwmgr_func->asic_reset == NULL) { @@ -1517,6
+1512,34 @@
static void pp_pm_compute_clocks(void *handle)
NULL);
}
+/* MP Apertures */
+#define MP1_Public 0x03b00000
+#define smnMP1_FIRMWARE_FLAGS
0x3010028
+#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK
0x00000001L
+
+static bool pp_is_smc_alive(void *handle) {
+ struct pp_hwmgr *hwmgr = handle;
+ struct amdgpu_device *adev = hwmgr->adev;
+ uint32_t mp1_fw_flags;
+
+ /*
+ * If some ASIC(e.g. smu7/smu8) needs special handling for
+ * checking smc alive, it should have its own implementation
+ * for ->is_smc_alive.
+ */
+ if (hwmgr->hwmgr_func->is_smc_alive)
+ return hwmgr->hwmgr_func->is_smc_alive(hwmgr);
+
+ mp1_fw_flags = RREG32_PCIE(MP1_Public |
+ (smnMP1_FIRMWARE_FLAGS & 0xffffffff));
+
The flags check doesn't tell whether PMFW is hung or not. It is a minimal
thing that is set after PMFW boot. To call the API this condition is necessary in
an implicit way. Driver always check this on boot, if not driver aborts smu init.
So better thing is to go ahead and send the message without any check, it will
tell the result whether PMFW is really working or not.
In short this API is not needed.