On Sat, Feb 19, 2022 at 4:01 AM Limonciello, Mario <Mario.Limonciello@xxxxxxx> wrote: > > [Public] > > > it: > > > > > > > > diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c > > > > index 04ea92cbd9cf..f5bf46782043 100644 > > > > --- a/drivers/base/power/main.c > > > > +++ b/drivers/base/power/main.c > > > > @@ -32,6 +32,7 @@ > > > > #include <linux/suspend.h> > > > > #include <trace/events/power.h> > > > > #include <linux/cpufreq.h> > > > > +#include <linux/cpuidle.h> > > > > #include <linux/devfreq.h> > > > > #include <linux/timer.h> > > > > > > > > @@ -1350,6 +1351,8 @@ int dpm_suspend_noirq(pm_message_t state) > > > > { > > > > int ret; > > > > > > > > + cpuidle_pause(); > > > > > > Can you replace this with wake_up_all_idle_cpus() and remove the > > > cpuidle_resume()/cpuidle_pause() calls from s2idle_enter() and see > > > what happens? > > > > > > > + > > > > device_wakeup_arm_wake_irqs(); > > > > suspend_device_irqs(); > > > > > > > > diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c > > > > index 6fcdee7e87a5..1708a643ba5d 100644 > > > > --- a/kernel/power/suspend.c > > > > +++ b/kernel/power/suspend.c > > > > @@ -97,6 +97,7 @@ static void s2idle_enter(void) > > > > raw_spin_unlock_irq(&s2idle_lock); > > > > > > > > cpus_read_lock(); > > > > + cpuidle_resume(); > > > > > > > > /* Push all the CPUs into the idle loop. */ > > > > wake_up_all_idle_cpus(); > > > > @@ -104,6 +105,7 @@ static void s2idle_enter(void) > > > > swait_event_exclusive(s2idle_wait_head, > > > > s2idle_state == S2IDLE_STATE_WAKE); > > > > > > > > + cpuidle_pause(); > > > > cpus_read_unlock(); > > > > > > > > raw_spin_lock_irq(&s2idle_lock); > > > > > > > > > > > > * Removing the cpuidle_pause call from s2idle_enter will fix the crash, > > > > but the system doesn't enter the deepest sleep state. > > > > > > I think you mean that removing it doesn't make a difference except > > > that it prevents the deepest state from being entered. > > > > > > Pausing cpuidle here is kind of redundant, because it just flips the > > > "initialized" flag that's going to be flipped again by > > > cpuidle_resume() in the next iteration. [BTW, your patch is missing > > > one an additional cpuidle_resume() in the resume path to match this > > > cpuidle_pause().] Also calling wake_up_all_idle_cpus() is not likely > > > to make a difference, so I'm guessing that synchronize_rcu() does the > > > trick. > > > > > > > * removing the cpuidle_pause call from dpm_suspend_noirq the > > firmware continues to > > > > crash. > > > > > > So the crash occurs while running dpm_suspend_noirq(). > > > > > > > I also confirmed that reverting both of these commits together on top of > > 5.17-rc4 fixes it: > > > > > > > > 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to- > > idle path") > > > > 23f62d7ab25b ("PM: sleep: Pause cpuidle later and resume it earlier > > during system transitions") > > > > > > > > The commit messages at least make it sound like it was just a rework for > > unification of the codepaths, > > > > not supposed to be for anything to be actually fixed, so I would think it > > should be safe to revert. > > > > > > There is a deeper issue related to these commits and I'm not inclined > > > to go back to the old code if there is only one system affected by > > > this and the problem is related to the platform firmware on that > > > system. > > > > > > > So please advise which way you want to go and I'll send up a patch (or if > > you want to write one > > > > I'm happy to take it and test it since I can readily reproduce it). > > > > > > To start with, please test the attached debug patch on top of -rc4. > > > > Attached is another patch to try, testing the hypothesis that the > > observed crash is related to CPUs being in idle state that are too > > deep for some reason during late suspend and early resume. > > I tried 3 test kernels: > * 5.17-rc4 + Your second debugging patch > * 5.17-rc4+ Your first debugging patch > * 5.17-rc4 + A hack I wrote that pushed amd-pmc into "later" in the suspend > using a global symbol called after LPS0 instead of letting it run in noirq stage > > It works properly on all of those, tried about 5x time in each. > > Then I confirmed I could still crash it on 5.17-rc4 with my control kernel. I would do something like the attached patch, then (provided that it works).
--- drivers/platform/x86/amd-pmc.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) Index: linux-pm/drivers/platform/x86/amd-pmc.c =================================================================== --- linux-pm.orig/drivers/platform/x86/amd-pmc.c +++ linux-pm/drivers/platform/x86/amd-pmc.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/pci.h> #include <linux/platform_device.h> +#include <linux/pm_qos.h> #include <linux/rtc.h> #include <linux/suspend.h> #include <linux/seq_file.h> @@ -144,6 +145,7 @@ static struct amd_pmc_dev pmc; static int amd_pmc_send_cmd(struct amd_pmc_dev *dev, u32 arg, u32 *data, u8 msg, bool ret); static int amd_pmc_write_stb(struct amd_pmc_dev *dev, u32 data); static int amd_pmc_read_stb(struct amd_pmc_dev *dev, u32 *buf); +static struct pm_qos_request amd_pmc_pm_qos_req; static inline u32 amd_pmc_reg_read(struct amd_pmc_dev *dev, int reg_offset) { @@ -531,6 +533,14 @@ static int __maybe_unused amd_pmc_suspen u8 msg; u32 arg = 1; + /* + * Prevent CPUs from getting into idle states while running the code + * below which is generally safe to run when none of the CPUs are in + * idle states. + */ + cpu_latency_qos_update_request(&amd_pmc_pm_qos_req, 0); + wake_up_all_idle_cpus(); + /* Reset and Start SMU logging - to monitor the s0i3 stats */ amd_pmc_send_cmd(pdev, 0, NULL, SMU_MSG_LOG_RESET, 0); amd_pmc_send_cmd(pdev, 0, NULL, SMU_MSG_LOG_START, 0); @@ -539,7 +549,7 @@ static int __maybe_unused amd_pmc_suspen if (pdev->cpu_id == AMD_CPU_ID_CZN) { rc = amd_pmc_verify_czn_rtc(pdev, &arg); if (rc < 0) - return rc; + goto out; } /* Dump the IdleMask before we send hint to SMU */ @@ -551,10 +561,11 @@ static int __maybe_unused amd_pmc_suspen if (enable_stb) rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF); - if (rc) { + if (rc) dev_err(pdev->dev, "error writing to STB\n"); - return rc; - } + +out: + cpu_latency_qos_update_request(&amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE); return rc; } @@ -565,6 +576,14 @@ static int __maybe_unused amd_pmc_resume int rc; u8 msg; + /* + * Prevent CPUs from getting into idle states while running the code + * below which is generally safe to run when none of the CPUs are in + * idle states. + */ + cpu_latency_qos_update_request(&amd_pmc_pm_qos_req, 0); + wake_up_all_idle_cpus(); + msg = amd_pmc_get_os_hint(pdev); rc = amd_pmc_send_cmd(pdev, 0, NULL, msg, 0); if (rc) @@ -579,12 +598,12 @@ static int __maybe_unused amd_pmc_resume /* Write data incremented by 1 to distinguish in stb_read */ if (enable_stb) rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF + 1); - if (rc) { + if (rc) dev_err(pdev->dev, "error writing to STB\n"); - return rc; - } - return 0; + cpu_latency_qos_update_request(&amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE); + + return rc; } static const struct dev_pm_ops amd_pmc_pm_ops = { @@ -722,6 +741,7 @@ static int amd_pmc_probe(struct platform amd_pmc_get_smu_version(dev); platform_set_drvdata(pdev, dev); amd_pmc_dbgfs_register(dev); + cpu_latency_qos_add_request(&amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE); return 0; err_pci_dev_put: