On Mon, 2021-07-26 at 22:08 +0000, Zev Weiss wrote: > On Mon, Jul 12, 2021 at 05:04:45PM CDT, Iwona Winiarska wrote: > > Add peci-dimmtemp driver for Digital Thermal Sensor (DTS) thermal > > readings of DIMMs that are accessible via the processor PECI interface. > > > > The main use case for the driver (and PECI interface) is out-of-band > > management, where we're able to obtain the DTS readings from an external > > entity connected with PECI, e.g. BMC on server platforms. > > > > Co-developed-by: Jae Hyun Yoo <jae.hyun.yoo@xxxxxxxxxxxxxxx> > > Signed-off-by: Jae Hyun Yoo <jae.hyun.yoo@xxxxxxxxxxxxxxx> > > Signed-off-by: Iwona Winiarska <iwona.winiarska@xxxxxxxxx> > > Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@xxxxxxxxxxxxxxx> > > --- > > drivers/hwmon/peci/Kconfig | 13 + > > drivers/hwmon/peci/Makefile | 2 + > > drivers/hwmon/peci/dimmtemp.c | 508 ++++++++++++++++++++++++++++++++++ > > 3 files changed, 523 insertions(+) > > create mode 100644 drivers/hwmon/peci/dimmtemp.c > > > > diff --git a/drivers/hwmon/peci/Kconfig b/drivers/hwmon/peci/Kconfig > > index e10eed68d70a..f2d57efa508b 100644 > > --- a/drivers/hwmon/peci/Kconfig > > +++ b/drivers/hwmon/peci/Kconfig > > @@ -14,5 +14,18 @@ config SENSORS_PECI_CPUTEMP > > This driver can also be built as a module. If so, the module > > will be called peci-cputemp. > > > > +config SENSORS_PECI_DIMMTEMP > > + tristate "PECI DIMM temperature monitoring client" > > + depends on PECI > > + select SENSORS_PECI > > + select PECI_CPU > > + help > > + If you say yes here you get support for the generic Intel PECI hwmon > > + driver which provides Digital Thermal Sensor (DTS) thermal readings > > of > > + DIMM components that are accessible via the processor PECI > > interface. > > + > > + This driver can also be built as a module. If so, the module > > + will be called peci-dimmtemp. 
> > + > > config SENSORS_PECI > > tristate > > diff --git a/drivers/hwmon/peci/Makefile b/drivers/hwmon/peci/Makefile > > index e8a0ada5ab1f..191cfa0227f3 100644 > > --- a/drivers/hwmon/peci/Makefile > > +++ b/drivers/hwmon/peci/Makefile > > @@ -1,5 +1,7 @@ > > # SPDX-License-Identifier: GPL-2.0-only > > > > peci-cputemp-y := cputemp.o > > +peci-dimmtemp-y := dimmtemp.o > > > > obj-$(CONFIG_SENSORS_PECI_CPUTEMP) += peci-cputemp.o > > +obj-$(CONFIG_SENSORS_PECI_DIMMTEMP) += peci-dimmtemp.o > > diff --git a/drivers/hwmon/peci/dimmtemp.c b/drivers/hwmon/peci/dimmtemp.c > > new file mode 100644 > > index 000000000000..2fcb8607137a > > --- /dev/null > > +++ b/drivers/hwmon/peci/dimmtemp.c > > @@ -0,0 +1,508 @@ > > +// SPDX-License-Identifier: GPL-2.0-only > > +// Copyright (c) 2018-2021 Intel Corporation > > + > > +#include <linux/auxiliary_bus.h> > > +#include <linux/bitfield.h> > > +#include <linux/bitops.h> > > +#include <linux/hwmon.h> > > +#include <linux/jiffies.h> > > +#include <linux/module.h> > > +#include <linux/peci.h> > > +#include <linux/peci-cpu.h> > > +#include <linux/units.h> > > +#include <linux/workqueue.h> > > +#include <linux/x86/intel-family.h> > > + > > +#include "common.h" > > + > > +#define DIMM_MASK_CHECK_DELAY_JIFFIES msecs_to_jiffies(5000) > > +#define DIMM_MASK_CHECK_RETRY_MAX 60 /* 60 x 5 secs = 5 minutes */ > > + > > +/* Max number of channel ranks and DIMM index per channel */ > > +#define CHAN_RANK_MAX_ON_HSX 8 > > +#define DIMM_IDX_MAX_ON_HSX 3 > > +#define CHAN_RANK_MAX_ON_BDX 4 > > +#define DIMM_IDX_MAX_ON_BDX 3 > > +#define CHAN_RANK_MAX_ON_BDXD 2 > > +#define DIMM_IDX_MAX_ON_BDXD 2 > > +#define CHAN_RANK_MAX_ON_SKX 6 > > +#define DIMM_IDX_MAX_ON_SKX 2 > > +#define CHAN_RANK_MAX_ON_ICX 8 > > +#define DIMM_IDX_MAX_ON_ICX 2 > > +#define CHAN_RANK_MAX_ON_ICXD 4 > > +#define DIMM_IDX_MAX_ON_ICXD 2 > > + > > +#define CHAN_RANK_MAX CHAN_RANK_MAX_ON_HSX > > +#define DIMM_IDX_MAX DIMM_IDX_MAX_ON_HSX > > +#define DIMM_NUMS_MAX (CHAN_RANK_MAX * 
DIMM_IDX_MAX) > > Should we perhaps have a static_assert(DIMM_NUMS_MAX <= 64) so that > check_populated_dimms() doesn't silently break if we ever have a system > with > 64 dimms? (Not sure how far off that might be, but doesn't seem > *that* wildly inconceivable, anyway.) Ack, I'll add an assert and warn in check_populated_dimms(): BUILD_BUG_ON(DIMM_NUMS_MAX > 64); if (chan_rank_max * dimm_idx_max > DIMM_NUMS_MAX) { WARN_ONCE(1, "Unsupported number of DIMMs"); return -EINVAL; } > > On a similar note, it'd be nice if there were some neat way of > automating the maintenance of CHAN_RANK_MAX and DIMM_IDX_MAX, but I > don't know of any great solutions for that offhand. (Shrug.) With the added WARN it should be easy enough to catch it without being an issue. > > > + > > +#define CPU_SEG_MASK GENMASK(23, 16) > > +#define GET_CPU_SEG(x) (((x) & CPU_SEG_MASK) >> 16) > > +#define CPU_BUS_MASK GENMASK(7, 0) > > +#define GET_CPU_BUS(x) ((x) & CPU_BUS_MASK) > > + > > +#define DIMM_TEMP_MAX GENMASK(15, 8) > > +#define DIMM_TEMP_CRIT GENMASK(23, 16) > > +#define GET_TEMP_MAX(x) (((x) & DIMM_TEMP_MAX) >> 8) > > +#define GET_TEMP_CRIT(x) (((x) & DIMM_TEMP_CRIT) >> 16) > > + > > +struct dimm_info { > > + int chan_rank_max; > > + int dimm_idx_max; > > + u8 min_peci_revision; > > This field doesn't seem to be used for anything that I can see; is it > really needed? Just like in cputemp - for device sanity check. 
> > > +}; > > + > > +struct peci_dimmtemp { > > + struct peci_device *peci_dev; > > + struct device *dev; > > + const char *name; > > + const struct dimm_info *gen_info; > > + struct delayed_work detect_work; > > + struct peci_sensor_data temp[DIMM_NUMS_MAX]; > > + long temp_max[DIMM_NUMS_MAX]; > > + long temp_crit[DIMM_NUMS_MAX]; > > + int retry_count; > > + char **dimmtemp_label; > > + DECLARE_BITMAP(dimm_mask, DIMM_NUMS_MAX); > > +}; > > + > > +static u8 __dimm_temp(u32 reg, int dimm_order) > > +{ > > + return (reg >> (dimm_order * 8)) & 0xff; > > +} > > + > > +static int get_dimm_temp(struct peci_dimmtemp *priv, int dimm_no) > > +{ > > + int dimm_order = dimm_no % priv->gen_info->dimm_idx_max; > > + int chan_rank = dimm_no / priv->gen_info->dimm_idx_max; > > + struct peci_device *peci_dev = priv->peci_dev; > > + u8 cpu_seg, cpu_bus, dev, func; > > + u64 offset; > > + u32 data; > > + u16 reg; > > + int ret; > > + > > + if (!peci_sensor_need_update(&priv->temp[dimm_no])) > > + return 0; > > + > > + ret = peci_pcs_read(peci_dev, PECI_PCS_DDR_DIMM_TEMP, chan_rank, > > &data); > > + if (ret) > > + return ret; > > + > > + priv->temp[dimm_no].value = __dimm_temp(data, dimm_order) * > > MILLIDEGREE_PER_DEGREE; > > + > > + switch (peci_dev->info.model) { > > + case INTEL_FAM6_ICELAKE_X: > > + case INTEL_FAM6_ICELAKE_D: > > + ret = peci_ep_pci_local_read(peci_dev, 0, 13, 0, 2, 0xd4, > > &data); > > + if (ret || !(data & BIT(31))) > > + break; /* Use default or previous value */ > > + > > + ret = peci_ep_pci_local_read(peci_dev, 0, 13, 0, 2, 0xd0, > > &data); > > + if (ret) > > + break; /* Use default or previous value */ > > + > > + cpu_seg = GET_CPU_SEG(data); > > + cpu_bus = GET_CPU_BUS(data); > > + > > + /* > > + * Device 26, Offset 224e0: IMC 0 channel 0 -> rank 0 > > + * Device 26, Offset 264e0: IMC 0 channel 1 -> rank 1 > > + * Device 27, Offset 224e0: IMC 1 channel 0 -> rank 2 > > + * Device 27, Offset 264e0: IMC 1 channel 1 -> rank 3 > > + * Device 28, Offset 
224e0: IMC 2 channel 0 -> rank 4 > > + * Device 28, Offset 264e0: IMC 2 channel 1 -> rank 5 > > + * Device 29, Offset 224e0: IMC 3 channel 0 -> rank 6 > > + * Device 29, Offset 264e0: IMC 3 channel 1 -> rank 7 > > + */ > > + dev = 0x1a + chan_rank / 2; > > + offset = 0x224e0 + dimm_order * 4; > > + if (chan_rank % 2) > > + offset += 0x4000; > > + > > + ret = peci_mmio_read(peci_dev, 0, cpu_seg, cpu_bus, dev, 0, > > offset, &data); > > + if (ret) > > + return ret; > > + > > + priv->temp_max[dimm_no] = GET_TEMP_MAX(data) * > > MILLIDEGREE_PER_DEGREE; > > + priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) * > > MILLIDEGREE_PER_DEGREE; > > These two lines look identical in all (non-default) cases; should we > deduplicate them by just moving them to after the switch? I'll refactor this. > > > + > > + break; > > + case INTEL_FAM6_SKYLAKE_X: > > + /* > > + * Device 10, Function 2: IMC 0 channel 0 -> rank 0 > > + * Device 10, Function 6: IMC 0 channel 1 -> rank 1 > > + * Device 11, Function 2: IMC 0 channel 2 -> rank 2 > > + * Device 12, Function 2: IMC 1 channel 0 -> rank 3 > > + * Device 12, Function 6: IMC 1 channel 1 -> rank 4 > > + * Device 13, Function 2: IMC 1 channel 2 -> rank 5 > > + */ > > + dev = 10 + chan_rank / 3 * 2 + (chan_rank % 3 == 2 ? 1 : 0); > > + func = chan_rank % 3 == 1 ? 6 : 2; > > + reg = 0x120 + dimm_order * 4; > > + > > + ret = peci_pci_local_read(peci_dev, 2, dev, func, reg, > > &data); > > + if (ret) > > + return ret; > > + > > + priv->temp_max[dimm_no] = GET_TEMP_MAX(data) * > > MILLIDEGREE_PER_DEGREE; > > + priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) * > > MILLIDEGREE_PER_DEGREE; > > + > > + break; > > + case INTEL_FAM6_BROADWELL_D: > > + /* > > + * Device 10, Function 2: IMC 0 channel 0 -> rank 0 > > + * Device 10, Function 6: IMC 0 channel 1 -> rank 1 > > + * Device 12, Function 2: IMC 1 channel 0 -> rank 2 > > + * Device 12, Function 6: IMC 1 channel 1 -> rank 3 > > + */ > > + dev = 10 + chan_rank / 2 * 2; > > + func = (chan_rank % 2) ? 
6 : 2; > > + reg = 0x120 + dimm_order * 4; > > + > > + ret = peci_pci_local_read(peci_dev, 2, dev, func, reg, > > &data); > > + if (ret) > > + return ret; > > + > > + priv->temp_max[dimm_no] = GET_TEMP_MAX(data) * > > MILLIDEGREE_PER_DEGREE; > > + priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) * > > MILLIDEGREE_PER_DEGREE; > > + > > + break; > > + case INTEL_FAM6_HASWELL_X: > > + case INTEL_FAM6_BROADWELL_X: > > + /* > > + * Device 20, Function 0: IMC 0 channel 0 -> rank 0 > > + * Device 20, Function 1: IMC 0 channel 1 -> rank 1 > > + * Device 21, Function 0: IMC 0 channel 2 -> rank 2 > > + * Device 21, Function 1: IMC 0 channel 3 -> rank 3 > > + * Device 23, Function 0: IMC 1 channel 0 -> rank 4 > > + * Device 23, Function 1: IMC 1 channel 1 -> rank 5 > > + * Device 24, Function 0: IMC 1 channel 2 -> rank 6 > > + * Device 24, Function 1: IMC 1 channel 3 -> rank 7 > > + */ > > + dev = 20 + chan_rank / 2 + chan_rank / 4; > > + func = chan_rank % 2; > > + reg = 0x120 + dimm_order * 4; > > + > > + ret = peci_pci_local_read(peci_dev, 1, dev, func, reg, > > &data); > > + if (ret) > > + return ret; > > + > > + priv->temp_max[dimm_no] = GET_TEMP_MAX(data) * > > MILLIDEGREE_PER_DEGREE; > > + priv->temp_crit[dimm_no] = GET_TEMP_CRIT(data) * > > MILLIDEGREE_PER_DEGREE; > > + > > + break; > > + default: > > + return -EOPNOTSUPP; > > + } > > + > > + peci_sensor_mark_updated(&priv->temp[dimm_no]); > > + > > + return 0; > > +} > > + > > +static int dimmtemp_read_string(struct device *dev, > > + enum hwmon_sensor_types type, > > + u32 attr, int channel, const char **str) > > +{ > > + struct peci_dimmtemp *priv = dev_get_drvdata(dev); > > + > > + if (attr != hwmon_temp_label) > > + return -EOPNOTSUPP; > > + > > + *str = (const char *)priv->dimmtemp_label[channel]; > > + > > + return 0; > > +} > > + > > +static int dimmtemp_read(struct device *dev, enum hwmon_sensor_types type, > > + u32 attr, int channel, long *val) > > +{ > > + struct peci_dimmtemp *priv = dev_get_drvdata(dev); > 
> + int ret; > > + > > + ret = get_dimm_temp(priv, channel); > > + if (ret) > > + return ret; > > + > > + switch (attr) { > > + case hwmon_temp_input: > > + *val = priv->temp[channel].value; > > + break; > > + case hwmon_temp_max: > > + *val = priv->temp_max[channel]; > > + break; > > + case hwmon_temp_crit: > > + *val = priv->temp_crit[channel]; > > + break; > > + default: > > + return -EOPNOTSUPP; > > + } > > + > > + return 0; > > +} > > + > > +static umode_t dimmtemp_is_visible(const void *data, enum > > hwmon_sensor_types type, > > + u32 attr, int channel) > > +{ > > + const struct peci_dimmtemp *priv = data; > > + > > + if (test_bit(channel, priv->dimm_mask)) > > + return 0444; > > + > > + return 0; > > +} > > + > > +static const struct hwmon_ops peci_dimmtemp_ops = { > > + .is_visible = dimmtemp_is_visible, > > + .read_string = dimmtemp_read_string, > > + .read = dimmtemp_read, > > +}; > > + > > +static int check_populated_dimms(struct peci_dimmtemp *priv) > > +{ > > + int chan_rank_max = priv->gen_info->chan_rank_max; > > + int dimm_idx_max = priv->gen_info->dimm_idx_max; > > + int chan_rank, dimm_idx, ret; > > + u64 dimm_mask = 0; > > + u32 pcs; > > + > > + for (chan_rank = 0; chan_rank < chan_rank_max; chan_rank++) { > > + ret = peci_pcs_read(priv->peci_dev, PECI_PCS_DDR_DIMM_TEMP, > > chan_rank, &pcs); > > + if (ret) { > > + /* > > + * Overall, we expect either success or -EINVAL in > > + * order to determine whether DIMM is populated or > > not. > > + * For anything else - we fall back to defering the > > + * detection to be performed at a later point in > > time. > > + */ > > + if (ret == -EINVAL) > > + continue; > > + else > > + return -EAGAIN; > > + } > > + > > + for (dimm_idx = 0; dimm_idx < dimm_idx_max; dimm_idx++) > > + if (__dimm_temp(pcs, dimm_idx)) > > + dimm_mask |= BIT(chan_rank * dimm_idx_max + > > dimm_idx); > > + } > > + /* > > + * It's possible that memory training is not done yet. 
In this case > > we > > + * defer the detection to be performed at a later point in time. > > + */ > > + if (!dimm_mask) > > + return -EAGAIN; > > + > > + dev_dbg(priv->dev, "Scanned populated DIMMs: %#llx\n", dimm_mask); > > Hmm, though aside from this one debug print it seems like this function > could just as easily operate directly on priv->dimm_mask if we wanted to > make it safe for >64 dimms (I have no particular objection to keeping it > as-is for now though). I'll leave it as is for now. Thanks -Iwona