On Sat, Jan 8, 2022 at 4:46 AM Ricardo Neri <ricardo.neri-calderon@xxxxxxxxxxxxxxx> wrote: > > When hardware wants to inform the operating system about updates in the HFI > table, it issues a package-level thermal event interrupt. For this, > hardware has new interrupt and status bits in the IA32_PACKAGE_THERM_ > INTERRUPT and IA32_PACKAGE_THERM_STATUS registers. The existing thermal > throttle driver already handles thermal event interrupts: it initializes > the thermal vector of the local APIC as well as per-CPU and package-level > interrupt reporting. It also provides routines to service such interrupts. > Extend its functionality to also handle HFI interrupts. > > The frequency of the thermal HFI interrupt is specific to each processor > model. On some processors, a single interrupt happens as soon as the HFI is > enabled and hardware will never update HFI capabilities afterwards. On > other processors, thermal and power constraints may cause thermal HFI > interrupts every tens of milliseconds. > > To not overwhelm consumers of the HFI data, use delayed work to throttle > the rate at which HFI updates are processed. > > Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx> > Cc: Aubrey Li <aubrey.li@xxxxxxxxxxxxxxx> > Cc: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx> > Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx> > Cc: "Ravi V. Shankar" <ravi.v.shankar@xxxxxxxxx> > Reviewed-by: Len Brown <len.brown@xxxxxxxxx> > Signed-off-by: Ricardo Neri <ricardo.neri-calderon@xxxxxxxxxxxxxxx> > --- > Changes since v3: > * None > > Changes since v2: > * Added missing #include files. > > Changes since v1: > * Renamed X86_FEATURE_INTEL_HFI as X86_FEATURE_HFI. (Boris) > * Repurposed hfi_instance::event_lock to handle HFI interrupt > events to avoid keeping CPUs spinning needlessly. Added a new > hfi_instance::table_lock to serialize access to an HFI table. > (PeterZ) > * Replaced raw_spin_[un]lock[restore|save]() with raw_spin_[un]lock(). 
> intel_hfi_process_event() runs in interrupt context and hence there > is no need to save interrupt flags. > * Renamed HFI_CONFIG_ENABLE_BIT as HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT > for clarity. > * Reworked timestamp updates for readability. Removed redundant > hfi_instance::timestamp. > * Relaxed timestamp check to allow time wrap-around. > --- > drivers/thermal/intel/intel_hfi.c | 103 ++++++++++++++++++++++++++++ > drivers/thermal/intel/intel_hfi.h | 2 + > drivers/thermal/intel/therm_throt.c | 10 +++ > 3 files changed, 115 insertions(+) > > diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c > index 37f979a9e833..1a08c58f26f6 100644 > --- a/drivers/thermal/intel/intel_hfi.c > +++ b/drivers/thermal/intel/intel_hfi.c > @@ -26,20 +26,28 @@ > #include <linux/cpumask.h> > #include <linux/gfp.h> > #include <linux/io.h> > +#include <linux/kernel.h> > #include <linux/math.h> > #include <linux/mutex.h> > #include <linux/percpu-defs.h> > #include <linux/printk.h> > #include <linux/processor.h> > #include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/string.h> > #include <linux/topology.h> > +#include <linux/workqueue.h> > > #include <asm/msr.h> > > #include "intel_hfi.h" > > +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \ > + BIT(9) | BIT(11) | BIT(26)) > + > /* Hardware Feedback Interface MSR configuration bits */ > #define HW_FEEDBACK_PTR_VALID_BIT BIT(0) > +#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) > > /* CPUID detection and enumeration definitions for HFI */ > > @@ -98,6 +106,9 @@ struct hfi_hdr { > * @data: Base address of the local table data > * @cpus: CPUs represented in this HFI table instance > * @hw_table: Pointer to the HFI table of this instance > + * @update_work: Delayed work to process HFI updates > + * @table_lock: Lock to protect accesses to the table of this instance > + * @event_lock: Lock to process HFI interrupts > * > * A set of parameters to parse and navigate a 
specific HFI table. > */ > @@ -110,6 +121,9 @@ struct hfi_instance { > void *data; > cpumask_var_t cpus; > void *hw_table; > + struct delayed_work update_work; > + raw_spinlock_t table_lock; > + raw_spinlock_t event_lock; > }; > > /** > @@ -147,6 +161,83 @@ static struct hfi_instance *hfi_instances; > static struct hfi_features hfi_features; > static DEFINE_MUTEX(hfi_instance_lock); > > +#define HFI_UPDATE_INTERVAL HZ > + > +static void hfi_update_work_fn(struct work_struct *work) > +{ > + struct hfi_instance *hfi_instance; > + > + hfi_instance = container_of(to_delayed_work(work), struct hfi_instance, > + update_work); > + if (!hfi_instance) > + return; > + > + /* TODO: Consume update here. */ > +} > + > +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) > +{ > + struct hfi_instance *hfi_instance; > + int cpu = smp_processor_id(); > + struct hfi_cpu_info *info; > + u64 new_timestamp; > + > + if (!pkg_therm_status_msr_val) > + return; > + > + info = &per_cpu(hfi_cpu_info, cpu); > + if (!info) > + return; > + > + /* > + * It is possible that we get an HFI thermal interrupt on this CPU > + * before its HFI instance is initialized. This is not a problem. The > + * CPU that enabled the interrupt for this package will also get the > + * interrupt and is fully initialized. > + */ > + hfi_instance = info->hfi_instance; > + if (!hfi_instance) > + return; Generally, a CPU whose info has been initialized can be offline, so this code may run on an offline CPU. I'm not actually sure if this is a concern, but just mentioning it in case it is. > + > + /* > + * On most systems, all CPUs in the package receive a package-level > + * thermal interrupt when there is an HFI update. It is sufficient to > + * let a single CPU acknowledge the update and schedule work to > + * process it. The remaining CPUs can resume their work. > + */ > + if (!raw_spin_trylock(&hfi_instance->event_lock)) > + return; > + > + /* Skip duplicated updates. 
*/ > + new_timestamp = *(u64 *)hfi_instance->hw_table; > + if (*hfi_instance->timestamp == new_timestamp) { > + raw_spin_unlock(&hfi_instance->event_lock); > + return; > + } > + > + raw_spin_lock(&hfi_instance->table_lock); > + > + /* > + * Copy the updated table into our local copy. This includes the new > + * timestamp. > + */ > + memcpy(hfi_instance->local_table, hfi_instance->hw_table, > + hfi_features.nr_table_pages << PAGE_SHIFT); > + > + raw_spin_unlock(&hfi_instance->table_lock); > + raw_spin_unlock(&hfi_instance->event_lock); > + > + /* > + * Let hardware know that we are done reading the HFI table and it is > + * free to update it again. > + */ > + pkg_therm_status_msr_val &= THERM_STATUS_CLEAR_PKG_MASK & > + ~PACKAGE_THERM_STATUS_HFI_UPDATED; > + wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, pkg_therm_status_msr_val); > + > + schedule_delayed_work(&hfi_instance->update_work, HFI_UPDATE_INTERVAL); AFAICS, if update_work has been scheduled already, but is not pending yet, the delay will be set to the current time plus HFI_UPDATE_INTERVAL, but shouldn't it actually run earlier in that case? Also it looks like the processing introduced in the next patch can take quite a bit of time if there is a sufficiently large number of CPUs in the package, so is it suitable for system_wq in all cases? > +} > + > static void init_hfi_cpu_index(struct hfi_cpu_info *info) > { > union cpuid6_edx edx; > @@ -259,8 +350,20 @@ void intel_hfi_online(unsigned int cpu) > > init_hfi_instance(hfi_instance); > > + INIT_DELAYED_WORK(&hfi_instance->update_work, hfi_update_work_fn); > + raw_spin_lock_init(&hfi_instance->table_lock); > + raw_spin_lock_init(&hfi_instance->event_lock); > + > cpumask_set_cpu(cpu, hfi_instance->cpus); > > + /* > + * Enable the hardware feedback interface and never disable it. See > + * comment on programming the address of the table. 
> + */ > + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); > + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; > + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); > + > unlock: > mutex_unlock(&hfi_instance_lock); > return; > diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h > index 56c6b2d75202..325aa78b745c 100644 > --- a/drivers/thermal/intel/intel_hfi.h > +++ b/drivers/thermal/intel/intel_hfi.h > @@ -6,10 +6,12 @@ > void __init intel_hfi_init(void); > void intel_hfi_online(unsigned int cpu); > void intel_hfi_offline(unsigned int cpu); > +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val); > #else > static inline void intel_hfi_init(void) { } > static inline void intel_hfi_online(unsigned int cpu) { } > static inline void intel_hfi_offline(unsigned int cpu) { } > +static inline void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) { } > #endif /* CONFIG_INTEL_HFI_THERMAL */ > > #endif /* _INTEL_HFI_H */ > diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c > index 2a79598a7f7a..930e19eeeac6 100644 > --- a/drivers/thermal/intel/therm_throt.c > +++ b/drivers/thermal/intel/therm_throt.c > @@ -619,6 +619,10 @@ void intel_thermal_interrupt(void) > PACKAGE_THERM_STATUS_POWER_LIMIT, > POWER_LIMIT_EVENT, > PACKAGE_LEVEL); > + > + if (this_cpu_has(X86_FEATURE_HFI)) > + intel_hfi_process_event(msr_val & > + PACKAGE_THERM_STATUS_HFI_UPDATED); > } > } > > @@ -728,6 +732,12 @@ void intel_init_thermal(struct cpuinfo_x86 *c) > wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, > l | (PACKAGE_THERM_INT_LOW_ENABLE > | PACKAGE_THERM_INT_HIGH_ENABLE), h); > + > + if (cpu_has(c, X86_FEATURE_HFI)) { > + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); > + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, > + l | PACKAGE_THERM_INT_HFI_ENABLE, h); > + } > } > > rdmsr(MSR_IA32_MISC_ENABLE, l, h); > -- > 2.17.1 >