Re: [PATCH] drm/amd: Don't wake dGPUs while reading sensors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 8/23/2024 09:31, Alex Deucher wrote:
On Fri, Aug 23, 2024 at 10:13 AM Mario Limonciello
<mario.limonciello@xxxxxxx> wrote:

On 8/23/2024 09:09, Alex Deucher wrote:
On Mon, Aug 19, 2024 at 10:30 PM Mario Limonciello <superm1@xxxxxxxxxx> wrote:

From: Mario Limonciello <mario.limonciello@xxxxxxx>

If the dGPU is off, then reading the sysfs files with a sensor monitoring
application will wake it. Change the behavior to return an error when the
dGPU is in D3cold.

I'm a little concerned that this will generate a flurry of bug reports
if this now reports an error.  One more comment below.


Do you have a particular app you're worried about, or just a general
worry?  I've had a lot of people reach out to me complaining about
battery life on A+A systems, and it comes down to the use of sensor
monitoring software waking the dGPU which people don't seem to expect.

Nothing in particular.  Just expecting reports of "I checked my GPU
temperature and it returned an error.  It was working with the last
kernel."


I did double check that software like 'sensors', 'mission center' and
'nvtop' don't freak out from this change.

Do we know what other devices do?

I'm not sure.  Let me CC Luke Jones, he might know.

I wonder if it would make more
sense to have the userspace tools check the runpm state before trying
to access the device.  Of course there are a lot of them and fixing
them all is non-trivial...

Yes that's another way to go about it. But I've raised a bug with mission center folks 8 months ago [1] to tell them to stop querying dGPUs in runtime PM, and Luke Jones also raised it with them 4 months earlier [2] but it's still not sorted in their project.

[1] https://gitlab.com/mission-center-devs/mission-center/-/issues/143
[2] https://gitlab.com/mission-center-devs/mission-center/-/issues/30

I suspect we'll hit similar road blocks with the dozens of other softwares that do this. So that's why I was thinking cut off the snake's head.


Alex


Here is what 'sensors' shows on my local workstation with this change.

amdgpu-pci-6100
Adapter: PCI adapter
vddgfx:           N/A
ERROR: Can't get value of subfeature fan1_min: Can't read
ERROR: Can't get value of subfeature fan1_max: Can't read
fan1:             N/A  (min =    0 RPM, max =    0 RPM)
edge:             N/A  (crit = +97.0°C, hyst = -273.1°C)
ERROR: Can't get value of subfeature power1_cap: Can't read
PPT:              N/A  (cap =   0.00 W)


Signed-off-by: Mario Limonciello <mario.limonciello@xxxxxxx>
---
   drivers/gpu/drm/amd/pm/amdgpu_pm.c | 90 +++++++++++++++---------------
   1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index c11952a4389bc..d6e38466fbb82 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -142,7 +142,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -173,7 +173,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (strncmp("battery", buf, strlen("battery")) == 0)
@@ -270,7 +270,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -309,7 +309,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (strncmp("low", buf, strlen("low")) == 0) {
@@ -371,7 +371,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -409,7 +409,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -448,7 +448,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (adev->pm.pp_force_state_enabled)
@@ -471,7 +471,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          adev->pm.pp_force_state_enabled = false;
@@ -541,7 +541,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -577,7 +577,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -760,7 +760,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (count > 127 || count == 0)
@@ -862,7 +862,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -922,7 +922,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = kstrtou64(buf, 0, &featuremask);
@@ -957,7 +957,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1026,7 +1026,7 @@ static ssize_t amdgpu_get_pp_dpm_clock(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1095,7 +1095,7 @@ static ssize_t amdgpu_set_pp_dpm_clock(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = amdgpu_read_mask(buf, count, &mask);
@@ -1280,7 +1280,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1309,7 +1309,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = kstrtol(buf, 0, &value);
@@ -1342,7 +1342,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1371,7 +1371,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = kstrtol(buf, 0, &value);
@@ -1424,7 +1424,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1463,7 +1463,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          tmp[0] = *(buf);
@@ -1517,7 +1517,7 @@ static int amdgpu_hwmon_get_sensor_generic(struct amdgpu_device *adev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -1630,7 +1630,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (adev->flags & AMD_IS_APU)
@@ -1673,7 +1673,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (adev->unique_id)
@@ -1846,7 +1846,7 @@ static ssize_t amdgpu_get_pm_metrics(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -1887,7 +1887,7 @@ static ssize_t amdgpu_get_gpu_metrics(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(ddev->dev);
@@ -2005,7 +2005,7 @@ static ssize_t amdgpu_set_smartshift_bias(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          r = pm_runtime_get_sync(ddev->dev);
@@ -2227,7 +2227,7 @@ static ssize_t amdgpu_get_xgmi_plpd_policy(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          mode = amdgpu_dpm_get_xgmi_plpd_mode(adev, &mode_desc);
@@ -2250,7 +2250,7 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = kstrtos32(buf, 0, &mode);
@@ -2652,7 +2652,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -2684,7 +2684,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = kstrtoint(buf, 10, &value);
@@ -2742,7 +2742,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = kstrtou32(buf, 10, &value);
@@ -2787,7 +2787,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -2817,7 +2817,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -2881,7 +2881,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -2912,7 +2912,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = kstrtou32(buf, 10, &value);
@@ -2956,7 +2956,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -2988,7 +2988,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          err = kstrtoint(buf, 10, &value);
@@ -3128,7 +3128,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_generic(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
@@ -3209,7 +3209,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          if (amdgpu_sriov_vf(adev))
@@ -3663,7 +3663,7 @@ static int amdgpu_retrieve_od_settings(struct amdgpu_device *adev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = pm_runtime_get_sync(adev->dev);
@@ -3747,7 +3747,7 @@ amdgpu_distribute_custom_od_settings(struct amdgpu_device *adev,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = parse_input_od_command_lines(in_buf,
@@ -4626,7 +4626,7 @@ static int amdgpu_debugfs_pm_info_show(struct seq_file *m, void *unused)

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

I'd prefer to keep the current behavior for debugfs.

OK.  I'll exclude it for debugfs in the next spin.


Alex


          r = pm_runtime_get_sync(dev->dev);
@@ -4671,7 +4671,7 @@ static ssize_t amdgpu_pm_prv_buffer_read(struct file *f, char __user *buf,

          if (amdgpu_in_reset(adev))
                  return -EPERM;
-       if (adev->in_suspend && !adev->in_runpm)
+       if (adev->in_suspend || adev->in_runpm)
                  return -EPERM;

          ret = amdgpu_dpm_get_smu_prv_buf_details(adev, &smu_prv_buf, &smu_prv_buf_size);
--
2.43.0






[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux