Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu reset APIs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 2/17/2022 8:18 AM, Quan, Evan wrote:
[AMD Official Use Only]



-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Sent: Monday, February 14, 2022 12:04 PM
To: Quan, Evan <Evan.Quan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>;
rui.huang@xxxxxxx
Subject: Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu
reset APIs



On 2/11/2022 1:22 PM, Evan Quan wrote:
Those gpu reset APIs can be granted when:
    - System is up and dpm features are enabled.
    - System is under resuming and dpm features are not yet enabled.
      Under such scenario, the PMFW is already alive and can support
      those gpu reset functionalities.

Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
Change-Id: I8c2f07138921eb53a2bd7fb94f9b3622af0eacf8
---
   .../gpu/drm/amd/include/kgd_pp_interface.h    |  1 +
   drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 34 +++++++++++++++
   .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 42
+++++++++++++++----
   .../drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c   |  1 +
   .../drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c   | 17 ++++++++
   drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  1 +
   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 32 +++++++-------
   7 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index a4c267f15959..892648a4a353 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -409,6 +409,7 @@ struct amd_pm_funcs {
   				   struct dpm_clocks *clock_table);
   	int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t
*size);
   	void (*pm_compute_clocks)(void *handle);
+	bool (*is_smc_alive)(void *handle);
   };

   struct metrics_table_header {
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index b46ae0063047..5f1d3342f87b 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -120,12 +120,25 @@ int
amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev,
uint32_t block
   	return ret;
   }

+static bool amdgpu_dpm_is_smc_alive(struct amdgpu_device *adev) {
+	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+	if (!pp_funcs || !pp_funcs->is_smc_alive)
+		return false;
+
+	return pp_funcs->is_smc_alive;
+}
+
   int amdgpu_dpm_baco_enter(struct amdgpu_device *adev)
   {
   	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
   	void *pp_handle = adev->powerplay.pp_handle;
   	int ret = 0;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
   		return -ENOENT;

@@ -145,6 +158,9 @@ int amdgpu_dpm_baco_exit(struct amdgpu_device
*adev)
   	void *pp_handle = adev->powerplay.pp_handle;
   	int ret = 0;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
   		return -ENOENT;

@@ -164,6 +180,9 @@ int amdgpu_dpm_set_mp1_state(struct
amdgpu_device *adev,
   	int ret = 0;
   	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (pp_funcs && pp_funcs->set_mp1_state) {
   		mutex_lock(&adev->pm.mutex);

@@ -184,6 +203,9 @@ bool amdgpu_dpm_is_baco_supported(struct
amdgpu_device *adev)
   	bool baco_cap;
   	int ret = 0;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return false;
+
   	if (!pp_funcs || !pp_funcs->get_asic_baco_capability)
   		return false;

@@ -203,6 +225,9 @@ int amdgpu_dpm_mode2_reset(struct
amdgpu_device *adev)
   	void *pp_handle = adev->powerplay.pp_handle;
   	int ret = 0;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (!pp_funcs || !pp_funcs->asic_reset_mode_2)
   		return -ENOENT;

@@ -221,6 +246,9 @@ int amdgpu_dpm_baco_reset(struct
amdgpu_device *adev)
   	void *pp_handle = adev->powerplay.pp_handle;
   	int ret = 0;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
   		return -ENOENT;

@@ -244,6 +272,9 @@ bool
amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev)
   	struct smu_context *smu = adev->powerplay.pp_handle;
   	bool support_mode1_reset = false;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return false;
+
   	if (is_support_sw_smu(adev)) {
   		mutex_lock(&adev->pm.mutex);
   		support_mode1_reset =
smu_mode1_reset_is_support(smu); @@ -258,6
+289,9 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev)
   	struct smu_context *smu = adev->powerplay.pp_handle;
   	int ret = -EOPNOTSUPP;

+	if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
   	if (is_support_sw_smu(adev)) {
   		mutex_lock(&adev->pm.mutex);
   		ret = smu_mode1_reset(smu);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index bba923cfe08c..4c709f7bcd51 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -844,9 +844,6 @@ static int pp_dpm_set_mp1_state(void *handle,
enum pp_mp1_state mp1_state)
   	if (!hwmgr)
   		return -EINVAL;

-	if (!hwmgr->pm_en)
-		return 0;
-
   	if (hwmgr->hwmgr_func->set_mp1_state)
   		return hwmgr->hwmgr_func->set_mp1_state(hwmgr,
mp1_state);

@@ -1305,8 +1302,7 @@ static int pp_get_asic_baco_capability(void
*handle, bool *cap)
   	if (!hwmgr)
   		return -EINVAL;

-	if (!(hwmgr->not_vf && amdgpu_dpm) ||
-		!hwmgr->hwmgr_func->get_asic_baco_capability)
+	if (!hwmgr->hwmgr_func->get_asic_baco_capability)
   		return 0;

   	hwmgr->hwmgr_func->get_asic_baco_capability(hwmgr, cap); @@ -
1321,7
+1317,7 @@ static int pp_get_asic_baco_state(void *handle, int *state)
   	if (!hwmgr)
   		return -EINVAL;

-	if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+	if (!hwmgr->hwmgr_func->get_asic_baco_state)
   		return 0;

   	hwmgr->hwmgr_func->get_asic_baco_state(hwmgr, (enum
BACO_STATE
*)state); @@ -1336,8 +1332,7 @@ static int pp_set_asic_baco_state(void
*handle, int state)
   	if (!hwmgr)
   		return -EINVAL;

-	if (!(hwmgr->not_vf && amdgpu_dpm) ||
-		!hwmgr->hwmgr_func->set_asic_baco_state)
+	if (!hwmgr->hwmgr_func->set_asic_baco_state)
   		return 0;

   	hwmgr->hwmgr_func->set_asic_baco_state(hwmgr, (enum
BACO_STATE)state); @@ -1379,7 +1374,7 @@ static int
pp_asic_reset_mode_2(void *handle)
   {
   	struct pp_hwmgr *hwmgr = handle;

-	if (!hwmgr || !hwmgr->pm_en)
+	if (!hwmgr)
   		return -EINVAL;

   	if (hwmgr->hwmgr_func->asic_reset == NULL) { @@ -1517,6
+1512,34 @@
static void pp_pm_compute_clocks(void *handle)
   			      NULL);
   }

+/* MP Apertures */
+#define MP1_Public					0x03b00000
+#define smnMP1_FIRMWARE_FLAGS
	0x3010028
+#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK
	0x00000001L
+
+static bool pp_is_smc_alive(void *handle) {
+	struct pp_hwmgr *hwmgr = handle;
+	struct amdgpu_device *adev = hwmgr->adev;
+	uint32_t mp1_fw_flags;
+
+	/*
+	 * If some ASIC(e.g. smu7/smu8) needs special handling for
+	 * checking smc alive, it should have its own implementation
+	 * for ->is_smc_alive.
+	 */
+	if (hwmgr->hwmgr_func->is_smc_alive)
+		return hwmgr->hwmgr_func->is_smc_alive(hwmgr);
+
+	mp1_fw_flags = RREG32_PCIE(MP1_Public |
+				   (smnMP1_FIRMWARE_FLAGS & 0xffffffff));
+

The flags check doesn't tell whether PMFW is hung or not. It is a minimal
thing that is set after PMFW boot. To call the API this condition is necessary in
an implicit way. Driver always check this on boot, if not driver aborts smu init.

So better thing is to go ahead and send the message without any check, it will
tell the result whether PMFW is really working or not.

In short this API is not needed.
[Quan, Evan] It was not designed to cover "PMFW hung". Instead, it was designed to be support early phase of post-silicon bringup.
At that time, the SMU may be not enabled/up. We need to prevent this API from wrongly called.


One of the first things done, atleast in swsmu, is hw_init/resume -> smu_start_smc_engine -> check_fw_status.

If smu is not up/enabled, this call shouldn't even happen as init itself will fail.

Thanks,
Lijo

BR
Evan

Thanks,
Lijo

+	if (mp1_fw_flags &
MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
+		return true;
+
+	return false;
+}
+
   static const struct amd_pm_funcs pp_dpm_funcs = {
   	.load_firmware = pp_dpm_load_fw,
   	.wait_for_fw_loading_complete = pp_dpm_fw_loading_complete,
@@
-1582,4 +1605,5 @@ static const struct amd_pm_funcs pp_dpm_funcs = {
   	.gfx_state_change_set = pp_gfx_state_change_set,
   	.get_smu_prv_buf_details = pp_get_prv_buffer_details,
   	.pm_compute_clocks = pp_pm_compute_clocks,
+	.is_smc_alive = pp_is_smc_alive,
   };
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index a1e11037831a..118039b96524 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -5735,6 +5735,7 @@ static const struct pp_hwmgr_func
smu7_hwmgr_funcs = {
   	.get_asic_baco_state = smu7_baco_get_state,
   	.set_asic_baco_state = smu7_baco_set_state,
   	.power_off_asic = smu7_power_off_asic,
+	.is_smc_alive = smu7_is_smc_ram_running,
   };

   uint8_t smu7_get_sleep_divider_id_from_clock(uint32_t clock, diff
--git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
index b50fd4a4a3d1..fc4d58329f6d 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
@@ -2015,6 +2015,22 @@ static void smu8_dpm_powergate_vce(struct
pp_hwmgr *hwmgr, bool bgate)
   	}
   }

+#define ixMP1_FIRMWARE_FLAGS
	0x3008210
+#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK
	0x00000001L
+
+static bool smu8_is_smc_running(struct pp_hwmgr *hwmgr) {
+	struct amdgpu_device *adev = hwmgr->adev;
+	uint32_t mp1_fw_flags;
+
+	mp1_fw_flags = RREG32_SMC(ixMP1_FIRMWARE_FLAGS);
+
+	if (mp1_fw_flags &
MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
+		return true;
+
+	return false;
+}
+
   static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
   	.backend_init = smu8_hwmgr_backend_init,
   	.backend_fini = smu8_hwmgr_backend_fini, @@ -2047,6 +2063,7
@@
static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
   	.dynamic_state_management_disable = smu8_disable_dpm_tasks,
   	.notify_cac_buffer_info = smu8_notify_cac_buffer_info,
   	.get_thermal_temperature_range =
smu8_get_thermal_temperature_range,
+	.is_smc_alive = smu8_is_smc_running,
   };

   int smu8_init_function_pointers(struct pp_hwmgr *hwmgr) diff --git
a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
index 4f7f2f455301..790fc387752c 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
+++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
@@ -364,6 +364,7 @@ struct pp_hwmgr_func {
   					bool disable);
   	ssize_t (*get_gpu_metrics)(struct pp_hwmgr *hwmgr, void **table);
   	int (*gfx_state_change)(struct pp_hwmgr *hwmgr, uint32_t state);
+	bool (*is_smc_alive)(struct pp_hwmgr *hwmgr);
   };

   struct pp_table_func {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 8b8feaf7aa0e..27a453fb4db7 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1845,9 +1845,6 @@ static int smu_set_mp1_state(void *handle,
   	struct smu_context *smu = handle;
   	int ret = 0;

-	if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
   	if (smu->ppt_funcs &&
   	    smu->ppt_funcs->set_mp1_state)
   		ret = smu->ppt_funcs->set_mp1_state(smu, mp1_state);
@@ -2513,9
+2510,6 @@ static int smu_get_baco_capability(void *handle, bool *cap)

   	*cap = false;

-	if (!smu->pm_enabled)
-		return 0;
-
   	if (smu->ppt_funcs && smu->ppt_funcs->baco_is_support)
   		*cap = smu->ppt_funcs->baco_is_support(smu);

@@ -2527,9 +2521,6 @@ static int smu_baco_set_state(void *handle, int
state)
   	struct smu_context *smu = handle;
   	int ret = 0;

-	if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
   	if (state == 0) {
   		if (smu->ppt_funcs->baco_exit)
   			ret = smu->ppt_funcs->baco_exit(smu); @@ -2551,9
+2542,6 @@ bool
smu_mode1_reset_is_support(struct smu_context *smu)
   {
   	bool ret = false;

-	if (!smu->pm_enabled)
-		return false;
-
   	if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
   		ret = smu->ppt_funcs->mode1_reset_is_support(smu);

@@ -2564,9 +2552,6 @@ int smu_mode1_reset(struct smu_context *smu)
   {
   	int ret = 0;

-	if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
   	if (smu->ppt_funcs->mode1_reset)
   		ret = smu->ppt_funcs->mode1_reset(smu);

@@ -2578,9 +2563,6 @@ static int smu_mode2_reset(void *handle)
   	struct smu_context *smu = handle;
   	int ret = 0;

-	if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
   	if (smu->ppt_funcs->mode2_reset)
   		ret = smu->ppt_funcs->mode2_reset(smu);

@@ -2712,6 +2694,19 @@ static int smu_get_prv_buffer_details(void
*handle, void **addr, size_t *size)
   	return 0;
   }

+static bool smu_is_smc_alive(void *handle) {
+	struct smu_context *smu = handle;
+
+	if (!smu->ppt_funcs->check_fw_status)
+		return false;
+
+	if (!smu->ppt_funcs->check_fw_status(smu))
+		return true;
+
+	return false;
+}
+
   static const struct amd_pm_funcs swsmu_pm_funcs = {
   	/* export for sysfs */
   	.set_fan_control_mode    = smu_set_fan_control_mode,
@@ -2765,6 +2760,7 @@ static const struct amd_pm_funcs
swsmu_pm_funcs = {
   	.get_uclk_dpm_states              = smu_get_uclk_dpm_states,
   	.get_dpm_clock_table              = smu_get_dpm_clock_table,
   	.get_smu_prv_buf_details = smu_get_prv_buffer_details,
+	.is_smc_alive = smu_is_smc_alive,
   };

   int smu_wait_for_event(struct smu_context *smu, enum
smu_event_type
event,




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux