On Thu, Sep 30, 2021 at 1:15 PM Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote: > > On recent Intel systems the HPET stops working when the system reaches PC10 > idle state. > > The approach of adding PCI ids to the early quirks to disable HPET on > these systems is a whack a mole game which makes no sense. > > Check for PC10 instead and force disable HPET if supported. The check is > overbroad as it does not take ACPI, intel_idle enablement and command > line parameters into account. That's fine as long as there is at least > PMTIMER available to calibrate the TSC frequency. The decision can be > overruled by adding "hpet=force" on the kernel command line. > > Remove the related early PCI quirks for affected Ice and Coffin Lake > systems as they are not longer required. > > Fixes: Yet another hardware trainwreck > Reported-by: Jakub Kicinski <kuba@xxxxxxxxxx> > Not-yet-signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > --- > Notes: Completely untested. Use at your own peril. > --- > arch/x86/kernel/early-quirks.c | 6 -- > arch/x86/kernel/hpet.c | 88 +++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 88 insertions(+), 6 deletions(-) > > --- a/arch/x86/kernel/early-quirks.c > +++ b/arch/x86/kernel/early-quirks.c > @@ -714,12 +714,6 @@ static struct chipset early_qrk[] __init > */ > { PCI_VENDOR_ID_INTEL, 0x0f00, > PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, > - { PCI_VENDOR_ID_INTEL, 0x3e20, > - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, > - { PCI_VENDOR_ID_INTEL, 0x3ec4, > - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, > - { PCI_VENDOR_ID_INTEL, 0x8a12, > - PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, > { PCI_VENDOR_ID_BROADCOM, 0x4331, > PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, > {} > --- a/arch/x86/kernel/hpet.c > +++ b/arch/x86/kernel/hpet.c > @@ -10,6 +10,7 @@ > #include <asm/irq_remapping.h> > #include <asm/hpet.h> > #include <asm/time.h> > +#include <asm/mwait.h> > > #undef pr_fmt > #define pr_fmt(fmt) "hpet: " fmt > @@ -916,6 +917,90 @@ static bool __init hpet_counting(void) > return false; > } > > +static bool __init get_mwait_substates(unsigned int *mwait_substates) > +{ > + unsigned int eax, ebx, ecx; > + > + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) > + return false; > + > + if (!boot_cpu_has(X86_FEATURE_MWAIT)) > + return false; > + > + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) > + return false; > + > + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, mwait_substates); > + > + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || > + !(ecx & CPUID5_ECX_INTERRUPT_BREAK) || > + !*mwait_substates) > + return false; I would do return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx & CPUID5_ECX_INTERRUPT_BREAK) && *mwait_substates; And this function could just return the mwait_substates value proper, because returning 0 then would be equivalent to returning 'false' from it as is. LGTM otherwise. > + > + return true; > +} > + > +/* > + * Check whether the system supports PC10. If so force disable HPET as that > + * stops counting in PC10. This check is overbroad as it does not take any > + * of the following into account: > + * > + * - ACPI tables > + * - Enablement of intel_idle > + * - Command line arguments which limit intel_idle C-state support > + * > + * That's perfectly fine. HPET is a piece of hardware designed by committee > + * and the only reasons why it is still in use on modern systems is the > + * fact that it is impossible to reliably query the TSC frequency via > + * CPUID or firmware. > + * > + * If HPET is functional it is useful for calibrating TSC, but this can be > + * done via PMTIMER as well which seems to be the last remaining timer on > + * X86/INTEL platforms that has not been completely wreckaged by feature > + * creep. > + * > + * In theory HPET support should be removed altogether, but there are older > + * systems out there which depend on it because TSC and APIC timer are > + * dysfunctional in deeper C-states. > + * > + * It's only 20 years now that hardware people have been asked to provide > + * reliable and discoverable facilities which can be used for timekeeping > + * and per CPU timer interrupts. > + * > + * The probability that this problem is going to be solved in the > + * forseeable future is close to zero, so the kernel has to be cluttered > + * with heuristics to keep up with the ever growing amount of hardware and > + * firmware trainwrecks. Hopefully some day hardware people will understand > + * that the approach of "This can be fixed in software" is not sustainable. > + * Hope dies last... > + */ > +static bool __init hpet_is_pc10_damaged(void) > +{ > + unsigned int mwait_substates; > + unsigned long long pcfg; > + > + if (!get_mwait_substates(&mwait_substates)) > + return false; > + > + /* Check whether PC10 substates are supported */ > + if (!(mwait_substates & (0xF << 28))) > + return false; > + > + /* Check whether PC10 is enabled in PKG C-state limit */ > + rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); > + if ((pcfg & 0xF) < 8) > + return false; > + > + if (hpet_force_user) { > + pr_warn("HPET force enabled via command line, but dysfunctional in PC10.\n"); > + return false; > + } > + > + pr_info("HPET dysfunctional in PC10. Force disabled.\n"); > + boot_hpet_disable = true; > + return true; > +} > + > /** > * hpet_enable - Try to setup the HPET timer. Returns 1 on success. > */ > @@ -929,6 +1014,9 @@ int __init hpet_enable(void) > if (!is_hpet_capable()) > return 0; > > + if (hpet_is_pc10_damaged()) > + return 0; > + > hpet_set_mapping(); > if (!hpet_virt_address) > return 0;