Re: [PATCH 07/12] drm/amd/pm: correct the checks for granting gpu reset APIs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 2/11/2022 1:22 PM, Evan Quan wrote:
Those gpu reset APIs can be granted when:
   - System is up and dpm features are enabled.
   - System is under resuming and dpm features are not yet enabled.
     Under such scenario, the PMFW is already alive and can support
     those gpu reset functionalities.

Signed-off-by: Evan Quan <evan.quan@xxxxxxx>
Change-Id: I8c2f07138921eb53a2bd7fb94f9b3622af0eacf8
---
  .../gpu/drm/amd/include/kgd_pp_interface.h    |  1 +
  drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 34 +++++++++++++++
  .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 42 +++++++++++++++----
  .../drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c   |  1 +
  .../drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c   | 17 ++++++++
  drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  1 +
  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 32 +++++++-------
  7 files changed, 101 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
index a4c267f15959..892648a4a353 100644
--- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
@@ -409,6 +409,7 @@ struct amd_pm_funcs {
  				   struct dpm_clocks *clock_table);
  	int (*get_smu_prv_buf_details)(void *handle, void **addr, size_t *size);
  	void (*pm_compute_clocks)(void *handle);
+	bool (*is_smc_alive)(void *handle);
  };
struct metrics_table_header {
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index b46ae0063047..5f1d3342f87b 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -120,12 +120,25 @@ int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev, uint32_t block
  	return ret;
  }
+static bool amdgpu_dpm_is_smc_alive(struct amdgpu_device *adev)
+{
+	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+
+	if (!pp_funcs || !pp_funcs->is_smc_alive)
+		return false;
+
+	return pp_funcs->is_smc_alive;
+}
+
  int amdgpu_dpm_baco_enter(struct amdgpu_device *adev)
  {
  	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
  	void *pp_handle = adev->powerplay.pp_handle;
  	int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
  		return -ENOENT;
@@ -145,6 +158,9 @@ int amdgpu_dpm_baco_exit(struct amdgpu_device *adev)
  	void *pp_handle = adev->powerplay.pp_handle;
  	int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
  		return -ENOENT;
@@ -164,6 +180,9 @@ int amdgpu_dpm_set_mp1_state(struct amdgpu_device *adev,
  	int ret = 0;
  	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (pp_funcs && pp_funcs->set_mp1_state) {
  		mutex_lock(&adev->pm.mutex);
@@ -184,6 +203,9 @@ bool amdgpu_dpm_is_baco_supported(struct amdgpu_device *adev)
  	bool baco_cap;
  	int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return false;
+
  	if (!pp_funcs || !pp_funcs->get_asic_baco_capability)
  		return false;
@@ -203,6 +225,9 @@ int amdgpu_dpm_mode2_reset(struct amdgpu_device *adev)
  	void *pp_handle = adev->powerplay.pp_handle;
  	int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (!pp_funcs || !pp_funcs->asic_reset_mode_2)
  		return -ENOENT;
@@ -221,6 +246,9 @@ int amdgpu_dpm_baco_reset(struct amdgpu_device *adev)
  	void *pp_handle = adev->powerplay.pp_handle;
  	int ret = 0;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (!pp_funcs || !pp_funcs->set_asic_baco_state)
  		return -ENOENT;
@@ -244,6 +272,9 @@ bool amdgpu_dpm_is_mode1_reset_supported(struct amdgpu_device *adev)
  	struct smu_context *smu = adev->powerplay.pp_handle;
  	bool support_mode1_reset = false;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return false;
+
  	if (is_support_sw_smu(adev)) {
  		mutex_lock(&adev->pm.mutex);
  		support_mode1_reset = smu_mode1_reset_is_support(smu);
@@ -258,6 +289,9 @@ int amdgpu_dpm_mode1_reset(struct amdgpu_device *adev)
  	struct smu_context *smu = adev->powerplay.pp_handle;
  	int ret = -EOPNOTSUPP;
+ if (!amdgpu_dpm_is_smc_alive(adev))
+		return -EOPNOTSUPP;
+
  	if (is_support_sw_smu(adev)) {
  		mutex_lock(&adev->pm.mutex);
  		ret = smu_mode1_reset(smu);
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
index bba923cfe08c..4c709f7bcd51 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -844,9 +844,6 @@ static int pp_dpm_set_mp1_state(void *handle, enum pp_mp1_state mp1_state)
  	if (!hwmgr)
  		return -EINVAL;
- if (!hwmgr->pm_en)
-		return 0;
-
  	if (hwmgr->hwmgr_func->set_mp1_state)
  		return hwmgr->hwmgr_func->set_mp1_state(hwmgr, mp1_state);
@@ -1305,8 +1302,7 @@ static int pp_get_asic_baco_capability(void *handle, bool *cap)
  	if (!hwmgr)
  		return -EINVAL;
- if (!(hwmgr->not_vf && amdgpu_dpm) ||
-		!hwmgr->hwmgr_func->get_asic_baco_capability)
+	if (!hwmgr->hwmgr_func->get_asic_baco_capability)
  		return 0;
hwmgr->hwmgr_func->get_asic_baco_capability(hwmgr, cap);
@@ -1321,7 +1317,7 @@ static int pp_get_asic_baco_state(void *handle, int *state)
  	if (!hwmgr)
  		return -EINVAL;
- if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+	if (!hwmgr->hwmgr_func->get_asic_baco_state)
  		return 0;
hwmgr->hwmgr_func->get_asic_baco_state(hwmgr, (enum BACO_STATE *)state);
@@ -1336,8 +1332,7 @@ static int pp_set_asic_baco_state(void *handle, int state)
  	if (!hwmgr)
  		return -EINVAL;
- if (!(hwmgr->not_vf && amdgpu_dpm) ||
-		!hwmgr->hwmgr_func->set_asic_baco_state)
+	if (!hwmgr->hwmgr_func->set_asic_baco_state)
  		return 0;
hwmgr->hwmgr_func->set_asic_baco_state(hwmgr, (enum BACO_STATE)state);
@@ -1379,7 +1374,7 @@ static int pp_asic_reset_mode_2(void *handle)
  {
  	struct pp_hwmgr *hwmgr = handle;
- if (!hwmgr || !hwmgr->pm_en)
+	if (!hwmgr)
  		return -EINVAL;
if (hwmgr->hwmgr_func->asic_reset == NULL) {
@@ -1517,6 +1512,34 @@ static void pp_pm_compute_clocks(void *handle)
  			      NULL);
  }
+/* MP Apertures */
+#define MP1_Public					0x03b00000
+#define smnMP1_FIRMWARE_FLAGS				0x3010028
+#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK	0x00000001L
+
+static bool pp_is_smc_alive(void *handle)
+{
+	struct pp_hwmgr *hwmgr = handle;
+	struct amdgpu_device *adev = hwmgr->adev;
+	uint32_t mp1_fw_flags;
+
+	/*
+	 * If some ASIC(e.g. smu7/smu8) needs special handling for
+	 * checking smc alive, it should have its own implementation
+	 * for ->is_smc_alive.
+	 */
+	if (hwmgr->hwmgr_func->is_smc_alive)
+		return hwmgr->hwmgr_func->is_smc_alive(hwmgr);
+
+	mp1_fw_flags = RREG32_PCIE(MP1_Public |
+				   (smnMP1_FIRMWARE_FLAGS & 0xffffffff));
+

The flags check doesn't tell whether PMFW is hung or not. It is a minimal thing that is set after PMFW boot. To call the API this condition is necessary in an implicit way. Driver always check this on boot, if not driver aborts smu init.

So better thing is to go ahead and send the message without any check, it will tell the result whether PMFW is really working or not.

In short this API is not needed.

Thanks,
Lijo

+	if (mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
+		return true;
+
+	return false;
+}
+
  static const struct amd_pm_funcs pp_dpm_funcs = {
  	.load_firmware = pp_dpm_load_fw,
  	.wait_for_fw_loading_complete = pp_dpm_fw_loading_complete,
@@ -1582,4 +1605,5 @@ static const struct amd_pm_funcs pp_dpm_funcs = {
  	.gfx_state_change_set = pp_gfx_state_change_set,
  	.get_smu_prv_buf_details = pp_get_prv_buffer_details,
  	.pm_compute_clocks = pp_pm_compute_clocks,
+	.is_smc_alive = pp_is_smc_alive,
  };
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index a1e11037831a..118039b96524 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -5735,6 +5735,7 @@ static const struct pp_hwmgr_func smu7_hwmgr_funcs = {
  	.get_asic_baco_state = smu7_baco_get_state,
  	.set_asic_baco_state = smu7_baco_set_state,
  	.power_off_asic = smu7_power_off_asic,
+	.is_smc_alive = smu7_is_smc_ram_running,
  };
uint8_t smu7_get_sleep_divider_id_from_clock(uint32_t clock,
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
index b50fd4a4a3d1..fc4d58329f6d 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu8_hwmgr.c
@@ -2015,6 +2015,22 @@ static void smu8_dpm_powergate_vce(struct pp_hwmgr *hwmgr, bool bgate)
  	}
  }
+#define ixMP1_FIRMWARE_FLAGS 0x3008210
+#define MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK		0x00000001L
+
+static bool smu8_is_smc_running(struct pp_hwmgr *hwmgr)
+{
+	struct amdgpu_device *adev = hwmgr->adev;
+	uint32_t mp1_fw_flags;
+
+	mp1_fw_flags = RREG32_SMC(ixMP1_FIRMWARE_FLAGS);
+
+	if (mp1_fw_flags & MP1_FIRMWARE_FLAGS__INTERRUPTS_ENABLED_MASK)
+		return true;
+
+	return false;
+}
+
  static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
  	.backend_init = smu8_hwmgr_backend_init,
  	.backend_fini = smu8_hwmgr_backend_fini,
@@ -2047,6 +2063,7 @@ static const struct pp_hwmgr_func smu8_hwmgr_funcs = {
  	.dynamic_state_management_disable = smu8_disable_dpm_tasks,
  	.notify_cac_buffer_info = smu8_notify_cac_buffer_info,
  	.get_thermal_temperature_range = smu8_get_thermal_temperature_range,
+	.is_smc_alive = smu8_is_smc_running,
  };
int smu8_init_function_pointers(struct pp_hwmgr *hwmgr)
diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
index 4f7f2f455301..790fc387752c 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
+++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
@@ -364,6 +364,7 @@ struct pp_hwmgr_func {
  					bool disable);
  	ssize_t (*get_gpu_metrics)(struct pp_hwmgr *hwmgr, void **table);
  	int (*gfx_state_change)(struct pp_hwmgr *hwmgr, uint32_t state);
+	bool (*is_smc_alive)(struct pp_hwmgr *hwmgr);
  };
struct pp_table_func {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 8b8feaf7aa0e..27a453fb4db7 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1845,9 +1845,6 @@ static int smu_set_mp1_state(void *handle,
  	struct smu_context *smu = handle;
  	int ret = 0;
- if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
  	if (smu->ppt_funcs &&
  	    smu->ppt_funcs->set_mp1_state)
  		ret = smu->ppt_funcs->set_mp1_state(smu, mp1_state);
@@ -2513,9 +2510,6 @@ static int smu_get_baco_capability(void *handle, bool *cap)
*cap = false; - if (!smu->pm_enabled)
-		return 0;
-
  	if (smu->ppt_funcs && smu->ppt_funcs->baco_is_support)
  		*cap = smu->ppt_funcs->baco_is_support(smu);
@@ -2527,9 +2521,6 @@ static int smu_baco_set_state(void *handle, int state)
  	struct smu_context *smu = handle;
  	int ret = 0;
- if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
  	if (state == 0) {
  		if (smu->ppt_funcs->baco_exit)
  			ret = smu->ppt_funcs->baco_exit(smu);
@@ -2551,9 +2542,6 @@ bool smu_mode1_reset_is_support(struct smu_context *smu)
  {
  	bool ret = false;
- if (!smu->pm_enabled)
-		return false;
-
  	if (smu->ppt_funcs && smu->ppt_funcs->mode1_reset_is_support)
  		ret = smu->ppt_funcs->mode1_reset_is_support(smu);
@@ -2564,9 +2552,6 @@ int smu_mode1_reset(struct smu_context *smu)
  {
  	int ret = 0;
- if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
  	if (smu->ppt_funcs->mode1_reset)
  		ret = smu->ppt_funcs->mode1_reset(smu);
@@ -2578,9 +2563,6 @@ static int smu_mode2_reset(void *handle)
  	struct smu_context *smu = handle;
  	int ret = 0;
- if (!smu->pm_enabled)
-		return -EOPNOTSUPP;
-
  	if (smu->ppt_funcs->mode2_reset)
  		ret = smu->ppt_funcs->mode2_reset(smu);
@@ -2712,6 +2694,19 @@ static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
  	return 0;
  }
+static bool smu_is_smc_alive(void *handle)
+{
+	struct smu_context *smu = handle;
+
+	if (!smu->ppt_funcs->check_fw_status)
+		return false;
+
+	if (!smu->ppt_funcs->check_fw_status(smu))
+		return true;
+
+	return false;
+}
+
  static const struct amd_pm_funcs swsmu_pm_funcs = {
  	/* export for sysfs */
  	.set_fan_control_mode    = smu_set_fan_control_mode,
@@ -2765,6 +2760,7 @@ static const struct amd_pm_funcs swsmu_pm_funcs = {
  	.get_uclk_dpm_states              = smu_get_uclk_dpm_states,
  	.get_dpm_clock_table              = smu_get_dpm_clock_table,
  	.get_smu_prv_buf_details = smu_get_prv_buffer_details,
+	.is_smc_alive = smu_is_smc_alive,
  };
int smu_wait_for_event(struct smu_context *smu, enum smu_event_type event,




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux