On Tue, Apr 09, 2019 at 04:35:25PM -0500, Alan Tull wrote: > On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@xxxxxxxxx> wrote: > > Hi Hao, > > > > > This patch adds support for global error reporting for FPGA > > Management Engine (FME), it introduces sysfs interfaces to > > report different error detected by the hardware, and allow > > user to clear errors or inject error for testing purpose. > > > > Signed-off-by: Luwei Kang <luwei.kang@xxxxxxxxx> > > Signed-off-by: Ananda Ravuri <ananda.ravuri@xxxxxxxxx> > > Signed-off-by: Xu Yilun <yilun.xu@xxxxxxxxx> > > Signed-off-by: Wu Hao <hao.wu@xxxxxxxxx> > > --- > > Documentation/ABI/testing/sysfs-platform-dfl-fme | 58 ++++ > > drivers/fpga/Makefile | 2 +- > > drivers/fpga/dfl-fme-error.c | 390 +++++++++++++++++++++++ > > drivers/fpga/dfl-fme-main.c | 4 + > > drivers/fpga/dfl-fme.h | 2 + > > drivers/fpga/dfl.h | 2 + > > 6 files changed, 457 insertions(+), 1 deletion(-) > > create mode 100644 drivers/fpga/dfl-fme-error.c > > > > diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme > > index 4b6448f..38f9cdd 100644 > > --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme > > +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme > > @@ -156,3 +156,61 @@ KernelVersion: 5.2 > > Contact: Wu Hao <hao.wu@xxxxxxxxx> > > Description: Read-only. Read this file to get power limit for fpga, it > > is only valid for integrated solution. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. Read this file to get errors detected by hardware. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. Read this file to get the first error detected by > > + hardware. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. Read this file to get the second error detected by > > + hardware. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Write-only. Write error code to this file to clear errors. If > > + the input error code doesn't match, it returns -EBUSY. > > As with the afu errors patch, seems like -EINVAL would be better. > > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. It returns errors detected on pcie0 link. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. It returns errors detected on pcie1 link. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. It returns non-fatal errors detected. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-only. It returns catastrophic and fatal errors detected. > > + > > +What: /sys/bus/platform/devices/dfl-fme.0/errors/inject_error > > +Date: March 2019 > > +KernelVersion: 5.2 > > +Contact: Wu Hao <hao.wu@xxxxxxxxx> > > +Description: Read-Write. Write this file to inject errors for testing > > + purpose. Read this file to check errors injected. > > diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile > > index f1f0af7..1a9fa3d 100644 > > --- a/drivers/fpga/Makefile > > +++ b/drivers/fpga/Makefile > > @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE) += dfl-fme-br.o > > obj-$(CONFIG_FPGA_DFL_FME_REGION) += dfl-fme-region.o > > obj-$(CONFIG_FPGA_DFL_AFU) += dfl-afu.o > > > > -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o > > +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o > > dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o > > dfl-afu-objs += dfl-afu-error.o > > > > diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c > > new file mode 100644 > > index 0000000..f2bd5f8 > > --- /dev/null > > +++ b/drivers/fpga/dfl-fme-error.c > > @@ -0,0 +1,390 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * Driver for FPGA Management Engine Error Management > > + * > > + * Copyright 2019 Intel Corporation, Inc. > > + * > > + * Authors: > > + * Kang Luwei <luwei.kang@xxxxxxxxx> > > + * Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx> > > + * Wu Hao <hao.wu@xxxxxxxxx> > > + * Joseph Grecco <joe.grecco@xxxxxxxxx> > > + * Enno Luebbers <enno.luebbers@xxxxxxxxx> > > + * Tim Whisonant <tim.whisonant@xxxxxxxxx> > > + * Ananda Ravuri <ananda.ravuri@xxxxxxxxx> > > + * Mitchel, Henry <henry.mitchel@xxxxxxxxx> > > + */ > > + > > +#include <linux/uaccess.h> > > + > > +#include "dfl.h" > > +#include "dfl-fme.h" > > + > > +#define FME_ERROR_MASK 0x8 > > +#define FME_ERROR 0x10 > > +#define MBP_ERROR BIT_ULL(6) > > +#define PCIE0_ERROR_MASK 0x18 > > +#define PCIE0_ERROR 0x20 > > +#define PCIE1_ERROR_MASK 0x28 > > +#define PCIE1_ERROR 0x30 > > +#define FME_FIRST_ERROR 0x38 > > +#define FME_NEXT_ERROR 0x40 > > +#define RAS_NONFAT_ERROR_MASK 0x48 > > +#define RAS_NONFAT_ERROR 0x50 > > +#define RAS_CATFAT_ERROR_MASK 0x58 > > +#define RAS_CATFAT_ERROR 0x60 > > +#define RAS_ERROR_INJECT 0x68 > > +#define INJECT_ERROR_MASK GENMASK_ULL(2, 0) > > + > > +static ssize_t errors_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + FME_ERROR)); > > +} > > +static DEVICE_ATTR_RO(errors); > > + > > +static ssize_t first_error_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + FME_FIRST_ERROR)); > > +} > > +static DEVICE_ATTR_RO(first_error); > > + > > +static ssize_t next_error_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + FME_NEXT_ERROR)); > > +} > > +static DEVICE_ATTR_RO(next_error); > > + > > +static ssize_t clear_store(struct device *dev, struct device_attribute *attr, > > + const char *buf, size_t count) > > +{ > > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent); > > + struct device *err_dev = dev->parent; > > + struct dfl_feature *feature; > > + void __iomem *base; > > + u64 v, val; > > + int ret; > > + > > + ret = kstrtou64(buf, 0, &val); > > + if (ret) > > + return ret; > > from kstrtox.c: > * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. > > Your sysfs doc could be updated to explain these return codes. Sure, will fix this. > > > + > > + feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + base = feature->ioaddr; > > + > > + mutex_lock(&pdata->lock); > > + writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK); > > + > > + v = readq(base + FME_ERROR); > > + if (val != v) { > > + ret = -EINVAL; > > Oh wait, that's what I thought it should be ;) so the doc just needs to change. Sure, will capture the detailed return values in doc. > > > + goto done; > > It would be easy to avoid using 'goto' here. Yes, agree. > > > + } > > + > > + writeq(v, base + FME_ERROR); > > + v = readq(base + FME_FIRST_ERROR); > > + writeq(v, base + FME_FIRST_ERROR); > > + v = readq(base + FME_NEXT_ERROR); > > + writeq(v, base + FME_NEXT_ERROR); > > + > > +done: > > + /* Workaround: disable MBP_ERROR if feature revision is 0 */ > > + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR, > > + base + FME_ERROR_MASK); > > + mutex_unlock(&pdata->lock); > > + return ret ? ret : count; > > +} > > +static DEVICE_ATTR_WO(clear); > > + > > +static ssize_t revision_show(struct device *dev, struct device_attribute *attr, > > + char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base)); > > +} > > +static DEVICE_ATTR_RO(revision); > > The revision attr to be documented in the sysfs doc. Will fix this. > > > + > > +static ssize_t pcie0_errors_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + PCIE0_ERROR)); > > +} > > + > > +static ssize_t pcie0_errors_store(struct device *dev, > > The sysfs doc shows this as read-only. > > > + struct device_attribute *attr, > > + const char *buf, size_t count) > > +{ > > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent); > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + u64 v, val; > > + int ret; > > + > > + ret = kstrtou64(buf, 0, &val); > > + if (ret) > > + return ret; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + mutex_lock(&pdata->lock); > > + writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK); > > + > > + v = readq(base + PCIE0_ERROR); > > + if (val != v) > > + ret = -EBUSY; > > + else > > + writeq(v, base + PCIE0_ERROR); > > + > > + writeq(0ULL, base + PCIE0_ERROR_MASK); > > + mutex_unlock(&pdata->lock); > > + return ret ? ret : count; > > +} > > +static DEVICE_ATTR_RW(pcie0_errors); > > + > > +static ssize_t pcie1_errors_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + PCIE1_ERROR)); > > +} > > + > > +static ssize_t pcie1_errors_store(struct device *dev, > > + struct device_attribute *attr, > > + const char *buf, size_t count) > > Same here, documentation needs to show as R/W and explain what happens > when written. Sorry, will fix them in doc. > > > +{ > > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent); > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + u64 v, val; > > + int ret; > > + > > + ret = kstrtou64(buf, 0, &val); > > + if (ret) > > + return ret; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + mutex_lock(&pdata->lock); > > + writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK); > > + > > + v = readq(base + PCIE1_ERROR); > > + if (val != v) > > + ret = -EBUSY; > > + else > > + writeq(v, base + PCIE1_ERROR); > > + > > + writeq(0ULL, base + PCIE1_ERROR_MASK); > > + mutex_unlock(&pdata->lock); > > + return ret ? ret : count; > > +} > > +static DEVICE_ATTR_RW(pcie1_errors); > > + > > +static ssize_t nonfatal_errors_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + RAS_NONFAT_ERROR)); > > +} > > +static DEVICE_ATTR_RO(nonfatal_errors); > > + > > +static ssize_t catfatal_errors_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)readq(base + RAS_CATFAT_ERROR)); > > +} > > +static DEVICE_ATTR_RO(catfatal_errors); > > + > > +static ssize_t inject_error_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + u64 v; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + v = readq(base + RAS_ERROR_INJECT); > > + > > + return scnprintf(buf, PAGE_SIZE, "0x%llx\n", > > + (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v)); > > +} > > + > > +static ssize_t inject_error_store(struct device *dev, > > + struct device_attribute *attr, > > + const char *buf, size_t count) > > +{ > > + struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent); > > + struct device *err_dev = dev->parent; > > + void __iomem *base; > > + u8 inject_error; > > + int ret; > > + u64 v; > > + > > + ret = kstrtou8(buf, 0, &inject_error); > > + if (ret) > > + return ret; > > + > > + if (inject_error & ~INJECT_ERROR_MASK) > > + return -EINVAL; > > + > > + base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR); > > + > > + mutex_lock(&pdata->lock); > > + v = readq(base + RAS_ERROR_INJECT); > > + v &= ~INJECT_ERROR_MASK; > > + v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error); > > + writeq(v, base + RAS_ERROR_INJECT); > > + mutex_unlock(&pdata->lock); > > + > > + return count; > > +} > > +static DEVICE_ATTR_RW(inject_error); > > + > > +static struct attribute *fme_errors_attrs[] = { > > + &dev_attr_errors.attr, > > + &dev_attr_first_error.attr, > > + &dev_attr_next_error.attr, > > + &dev_attr_clear.attr, > > + NULL, > > +}; > > + > > +static struct attribute_group fme_errors_attr_group = { > > + .attrs = fme_errors_attrs, > > + .name = "fme-errors", > > +}; > > + > > +static struct attribute *errors_attrs[] = { > > + &dev_attr_revision.attr, > > + &dev_attr_pcie0_errors.attr, > > + &dev_attr_pcie1_errors.attr, > > + &dev_attr_nonfatal_errors.attr, > > + &dev_attr_catfatal_errors.attr, > > + &dev_attr_inject_error.attr, > > + NULL, > > +}; > > + > > +static struct attribute_group errors_attr_group = { > > + .attrs = errors_attrs, > > +}; > > + > > +static const struct attribute_group *error_groups[] = { > > + &fme_errors_attr_group, > > + &errors_attr_group, > > + NULL > > +}; > > + > > +static void fme_error_enable(struct dfl_feature *feature) > > +{ > > + void __iomem *base = feature->ioaddr; > > + > > + /* Workaround: disable MBP_ERROR if revision is 0 */ > > + writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR, > > + base + FME_ERROR_MASK); > > + writeq(0ULL, base + PCIE0_ERROR_MASK); > > + writeq(0ULL, base + PCIE1_ERROR_MASK); > > + writeq(0ULL, base + RAS_NONFAT_ERROR_MASK); > > + writeq(0ULL, base + RAS_CATFAT_ERROR_MASK); > > +} > > + > > +static void err_dev_release(struct device *dev) > > +{ > > + kfree(dev); > > +} > > + > > +static int fme_global_err_init(struct platform_device *pdev, > > + struct dfl_feature *feature) > > +{ > > + struct device *dev; > > + int ret = 0; > > + > > + dev = kzalloc(sizeof(*dev), GFP_KERNEL); > > + if (!dev) > > + return -ENOMEM; > > + > > + dev->parent = &pdev->dev; > > + dev->release = err_dev_release; > > + dev_set_name(dev, "errors"); > > + > > + fme_error_enable(feature); > > + > > + ret = device_register(dev); > > + if (ret) { > > + put_device(dev); > > + return ret; > > + } > > + > > + ret = sysfs_create_groups(&dev->kobj, error_groups); > > + if (ret) { > > + device_unregister(dev); > > + return ret; > > + } > > + > > + feature->priv = dev; > > + > > + return ret; > > +} > > + > > +static void fme_global_err_uinit(struct platform_device *pdev, > > + struct dfl_feature *feature) > > +{ > > + struct device *dev = feature->priv; > > + > > + sysfs_remove_groups(&dev->kobj, error_groups); > > + device_unregister(dev); > > +} > > + > > +const struct dfl_feature_id fme_global_err_id_table[] = { > > + {.id = FME_FEATURE_ID_GLOBAL_ERR,}, > > + {0,} > > +}; > > + > > +const struct dfl_feature_ops fme_global_err_ops = { > > + .init = fme_global_err_init, > > + .uinit = fme_global_err_uinit, > > +}; > > diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c > > index dafa6580..76cb112 100644 > > --- a/drivers/fpga/dfl-fme-main.c > > +++ b/drivers/fpga/dfl-fme-main.c > > @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = { > > .ops = &fme_power_mgmt_ops, > > }, > > { > > + .id_table = fme_global_err_id_table, > > + .ops = &fme_global_err_ops, > > + }, > > + { > > .ops = NULL, > > }, > > }; > > diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h > > index 7a021c4..5fbe3f5 100644 > > --- a/drivers/fpga/dfl-fme.h > > +++ b/drivers/fpga/dfl-fme.h > > @@ -37,5 +37,7 @@ struct dfl_fme { > > > > extern const struct dfl_feature_ops fme_pr_mgmt_ops; > > extern const struct dfl_feature_id fme_pr_mgmt_id_table[]; > > +extern const struct dfl_feature_ops fme_global_err_ops; > > +extern const struct dfl_feature_id fme_global_err_id_table[]; > > > > #endif /* __DFL_FME_H */ > > diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h > > index fbc57f0..6c32080 100644 > > --- a/drivers/fpga/dfl.h > > +++ b/drivers/fpga/dfl.h > > @@ -197,12 +197,14 @@ struct dfl_feature_driver { > > * feature dev (platform device)'s reources. > > * @ioaddr: mapped mmio resource address. > > * @ops: ops of this sub feature. > > + * @priv: priv data of this feature. > > */ > > struct dfl_feature { > > u64 id; > > int resource_index; > > void __iomem *ioaddr; > > const struct dfl_feature_ops *ops; > > + void *priv; > > }; > > > > #define DEV_STATUS_IN_USE 0 > > -- > > 2.7.4 > > Thanks for the review, will fix these things in the next version. Hao > > Thanks, > Alan