On Sat, Jan 8, 2022 at 4:46 AM Ricardo Neri <ricardo.neri-calderon@xxxxxxxxxxxxxxx> wrote: > > When hardware wants to inform the operating system about updates in the HFI > table, it issues a package-level thermal event interrupt. For this, > hardware has new interrupt and status bits in the IA32_PACKAGE_THERM_ > INTERRUPT and IA32_PACKAGE_THERM_STATUS registers. The existing thermal > throttle driver already handles thermal event interrupts: it initializes > the thermal vector of the local APIC as well as per-CPU and package-level > interrupt reporting. It also provides routines to service such interrupts. > Extend its functionality to also handle HFI interrupts. > > The frequency of the thermal HFI interrupt is specific to each processor > model. On some processors, a single interrupt happens as soon as the HFI is > enabled and hardware will never update HFI capabilities afterwards. On > other processors, thermal and power constraints may cause thermal HFI > interrupts every tens of milliseconds. > > To not overwhelm consumers of the HFI data, use delayed work to throttle > the rate at which HFI updates are processed. > > Cc: Andi Kleen <ak@xxxxxxxxxxxxxxx> > Cc: Aubrey Li <aubrey.li@xxxxxxxxxxxxxxx> > Cc: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx> > Cc: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx> > Cc: "Ravi V. Shankar" <ravi.v.shankar@xxxxxxxxx> > Reviewed-by: Len Brown <len.brown@xxxxxxxxx> > Signed-off-by: Ricardo Neri <ricardo.neri-calderon@xxxxxxxxxxxxxxx> > --- > Changes since v3: > * None > > Changes since v2: > * Added missing #include files. > > Changes since v1: > * Renamed X86_FEATURE_INTEL_HFI as X86_FEATURE_HFI. (Boris) > * Repurposed hfi_instance::event_lock to handle HFI interrupt > events to avoid keeping CPUs spinning needlessly. Added a new > hfi_instance::table_lock to serialize access to an HFI table. > (PeterZ) > * Replaced raw_spin_[un]lock[restore|save]() with raw_spin_[un]lock(). 
> intel_hfi_process_event() runs in interrupt context and hence there > is no need to save interrupt flags. > * Renamed HFI_CONFIG_ENABLE_BIT as HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT > for clarity. > * Reworked timestamp updates for readability. Removed redundant > hfi_instance::timestamp. > * Relaxed timestamp check to allow time wrap-around. > --- > drivers/thermal/intel/intel_hfi.c | 103 ++++++++++++++++++++++++++++ > drivers/thermal/intel/intel_hfi.h | 2 + > drivers/thermal/intel/therm_throt.c | 10 +++ > 3 files changed, 115 insertions(+) > > diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c > index 37f979a9e833..1a08c58f26f6 100644 > --- a/drivers/thermal/intel/intel_hfi.c > +++ b/drivers/thermal/intel/intel_hfi.c > @@ -26,20 +26,28 @@ > #include <linux/cpumask.h> > #include <linux/gfp.h> > #include <linux/io.h> > +#include <linux/kernel.h> > #include <linux/math.h> > #include <linux/mutex.h> > #include <linux/percpu-defs.h> > #include <linux/printk.h> > #include <linux/processor.h> > #include <linux/slab.h> > +#include <linux/spinlock.h> > +#include <linux/string.h> > #include <linux/topology.h> > +#include <linux/workqueue.h> > > #include <asm/msr.h> > > #include "intel_hfi.h" > > +#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | \ > + BIT(9) | BIT(11) | BIT(26)) > + > /* Hardware Feedback Interface MSR configuration bits */ > #define HW_FEEDBACK_PTR_VALID_BIT BIT(0) > +#define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT BIT(0) > > /* CPUID detection and enumeration definitions for HFI */ > > @@ -98,6 +106,9 @@ struct hfi_hdr { > * @data: Base address of the local table data > * @cpus: CPUs represented in this HFI table instance > * @hw_table: Pointer to the HFI table of this instance > + * @update_work: Delayed work to process HFI updates > + * @table_lock: Lock to protect accesses to the table of this instance > + * @event_lock: Lock to process HFI interrupts > * > * A set of parameters to parse and navigate a 
specific HFI table. > */ > @@ -110,6 +121,9 @@ struct hfi_instance { > void *data; > cpumask_var_t cpus; > void *hw_table; > + struct delayed_work update_work; > + raw_spinlock_t table_lock; > + raw_spinlock_t event_lock; > }; > > /** > @@ -147,6 +161,83 @@ static struct hfi_instance *hfi_instances; > static struct hfi_features hfi_features; > static DEFINE_MUTEX(hfi_instance_lock); > > +#define HFI_UPDATE_INTERVAL HZ > + > +static void hfi_update_work_fn(struct work_struct *work) > +{ > + struct hfi_instance *hfi_instance; > + > + hfi_instance = container_of(to_delayed_work(work), struct hfi_instance, > + update_work); > + if (!hfi_instance) > + return; > + > + /* TODO: Consume update here. */ > +} > + > +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) > +{ > + struct hfi_instance *hfi_instance; > + int cpu = smp_processor_id(); > + struct hfi_cpu_info *info; > + u64 new_timestamp; > + > + if (!pkg_therm_status_msr_val) > + return; > + > + info = &per_cpu(hfi_cpu_info, cpu); > + if (!info) > + return; > + > + /* > + * It is possible that we get an HFI thermal interrupt on this CPU > + * before its HFI instance is initialized. This is not a problem. The > + * CPU that enabled the interrupt for this package will also get the > + * interrupt and is fully initialized. > + */ > + hfi_instance = info->hfi_instance; > + if (!hfi_instance) > + return; Generally, a CPU whose info has been initialized can be offline, so this code may run on an offline CPU. I'm not actually sure if this is a concern, but just mentioning it in case it is. > + > + /* > + * On most systems, all CPUs in the package receive a package-level > + * thermal interrupt when there is an HFI update. It is sufficient to > + * let a single CPU acknowledge the update and schedule work to > + * process it. The remaining CPUs can resume their work. > + */ > + if (!raw_spin_trylock(&hfi_instance->event_lock)) > + return; > + > + /* Skip duplicated updates. 
*/ > + new_timestamp = *(u64 *)hfi_instance->hw_table; > + if (*hfi_instance->timestamp == new_timestamp) { > + raw_spin_unlock(&hfi_instance->event_lock); > + return; > + } > + > + raw_spin_lock(&hfi_instance->table_lock); > + > + /* > + * Copy the updated table into our local copy. This includes the new > + * timestamp. > + */ > + memcpy(hfi_instance->local_table, hfi_instance->hw_table, > + hfi_features.nr_table_pages << PAGE_SHIFT); > + > + raw_spin_unlock(&hfi_instance->table_lock); > + raw_spin_unlock(&hfi_instance->event_lock); > + > + /* > + * Let hardware know that we are done reading the HFI table and it is > + * free to update it again. > + */ > + pkg_therm_status_msr_val &= THERM_STATUS_CLEAR_PKG_MASK & > + ~PACKAGE_THERM_STATUS_HFI_UPDATED; > + wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, pkg_therm_status_msr_val); > + > + schedule_delayed_work(&hfi_instance->update_work, HFI_UPDATE_INTERVAL); AFAICS, if update_work has been scheduled already, but is not pending yet, the delay will be set to the current time plus HFI_UPDATE_INTERVAL, but shouldn't it actually run earlier in that case? Also it looks like the processing introduced in the next patch can take quite a bit of time if there is a sufficiently large number of CPUs in the package, so is it suitable for system_wq in all cases? > +} > + > static void init_hfi_cpu_index(struct hfi_cpu_info *info) > { > union cpuid6_edx edx; > @@ -259,8 +350,20 @@ void intel_hfi_online(unsigned int cpu) > > init_hfi_instance(hfi_instance); > > + INIT_DELAYED_WORK(&hfi_instance->update_work, hfi_update_work_fn); > + raw_spin_lock_init(&hfi_instance->table_lock); > + raw_spin_lock_init(&hfi_instance->event_lock); > + > cpumask_set_cpu(cpu, hfi_instance->cpus); > > + /* > + * Enable the hardware feedback interface and never disable it. See > + * comment on programming the address of the table. 
> + */ > + rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); > + msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT; > + wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val); > + > unlock: > mutex_unlock(&hfi_instance_lock); > return; > diff --git a/drivers/thermal/intel/intel_hfi.h b/drivers/thermal/intel/intel_hfi.h > index 56c6b2d75202..325aa78b745c 100644 > --- a/drivers/thermal/intel/intel_hfi.h > +++ b/drivers/thermal/intel/intel_hfi.h > @@ -6,10 +6,12 @@ > void __init intel_hfi_init(void); > void intel_hfi_online(unsigned int cpu); > void intel_hfi_offline(unsigned int cpu); > +void intel_hfi_process_event(__u64 pkg_therm_status_msr_val); > #else > static inline void intel_hfi_init(void) { } > static inline void intel_hfi_online(unsigned int cpu) { } > static inline void intel_hfi_offline(unsigned int cpu) { } > +static inline void intel_hfi_process_event(__u64 pkg_therm_status_msr_val) { } > #endif /* CONFIG_INTEL_HFI_THERMAL */ > > #endif /* _INTEL_HFI_H */ > diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c > index 2a79598a7f7a..930e19eeeac6 100644 > --- a/drivers/thermal/intel/therm_throt.c > +++ b/drivers/thermal/intel/therm_throt.c > @@ -619,6 +619,10 @@ void intel_thermal_interrupt(void) > PACKAGE_THERM_STATUS_POWER_LIMIT, > POWER_LIMIT_EVENT, > PACKAGE_LEVEL); > + > + if (this_cpu_has(X86_FEATURE_HFI)) > + intel_hfi_process_event(msr_val & > + PACKAGE_THERM_STATUS_HFI_UPDATED); > } > } > > @@ -728,6 +732,12 @@ void intel_init_thermal(struct cpuinfo_x86 *c) > wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, > l | (PACKAGE_THERM_INT_LOW_ENABLE > | PACKAGE_THERM_INT_HIGH_ENABLE), h); > + > + if (cpu_has(c, X86_FEATURE_HFI)) { > + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); > + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, > + l | PACKAGE_THERM_INT_HFI_ENABLE, h); > + } > } > > rdmsr(MSR_IA32_MISC_ENABLE, l, h); > -- > 2.17.1 >