Re: [PATCH 16/17] fpga: dfl: fme: add global error reporting support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Mar 24, 2019 at 10:24 PM Wu Hao <hao.wu@xxxxxxxxx> wrote:

Hi Hao,

>
> This patch adds support for global error reporting for FPGA
> Management Engine (FME), it introduces sysfs interfaces to
> report different error detected by the hardware, and allow
> user to clear errors or inject error for testing purpose.
>
> Signed-off-by: Luwei Kang <luwei.kang@xxxxxxxxx>
> Signed-off-by: Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> Signed-off-by: Xu Yilun <yilun.xu@xxxxxxxxx>
> Signed-off-by: Wu Hao <hao.wu@xxxxxxxxx>
> ---
>  Documentation/ABI/testing/sysfs-platform-dfl-fme |  58 ++++
>  drivers/fpga/Makefile                            |   2 +-
>  drivers/fpga/dfl-fme-error.c                     | 390 +++++++++++++++++++++++
>  drivers/fpga/dfl-fme-main.c                      |   4 +
>  drivers/fpga/dfl-fme.h                           |   2 +
>  drivers/fpga/dfl.h                               |   2 +
>  6 files changed, 457 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/fpga/dfl-fme-error.c
>
> diff --git a/Documentation/ABI/testing/sysfs-platform-dfl-fme b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> index 4b6448f..38f9cdd 100644
> --- a/Documentation/ABI/testing/sysfs-platform-dfl-fme
> +++ b/Documentation/ABI/testing/sysfs-platform-dfl-fme
> @@ -156,3 +156,61 @@ KernelVersion:  5.2
>  Contact:       Wu Hao <hao.wu@xxxxxxxxx>
>  Description:   Read-only. Read this file to get power limit for fpga, it
>                 is only valid for integrated solution.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. Read this file to get errors detected by hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/first_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. Read this file to get the first error detected by
> +               hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/next_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. Read this file to get the second error detected by
> +               hardware.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/fme-errors/clear
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Write-only. Write error code to this file to clear errors. If
> +               the input error code doesn't match, it returns -EBUSY.

As with the afu errors patch, seems like -EINVAL would be better.

> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie0_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. It returns errors detected on pcie0 link.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/pcie1_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. It returns errors detected on pcie1 link.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/nonfatal_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. It returns non-fatal errors detected.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/catfatal_errors
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-only. It returns catastrophic and fatal errors detected.
> +
> +What:          /sys/bus/platform/devices/dfl-fme.0/errors/inject_error
> +Date:          March 2019
> +KernelVersion:  5.2
> +Contact:       Wu Hao <hao.wu@xxxxxxxxx>
> +Description:   Read-Write. Write this file to inject errors for testing
> +               purpose. Read this file to check errors injected.
> diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile
> index f1f0af7..1a9fa3d 100644
> --- a/drivers/fpga/Makefile
> +++ b/drivers/fpga/Makefile
> @@ -38,7 +38,7 @@ obj-$(CONFIG_FPGA_DFL_FME_BRIDGE)     += dfl-fme-br.o
>  obj-$(CONFIG_FPGA_DFL_FME_REGION)      += dfl-fme-region.o
>  obj-$(CONFIG_FPGA_DFL_AFU)             += dfl-afu.o
>
> -dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o
> +dfl-fme-objs := dfl-fme-main.o dfl-fme-pr.o dfl-fme-error.o
>  dfl-afu-objs := dfl-afu-main.o dfl-afu-region.o dfl-afu-dma-region.o
>  dfl-afu-objs += dfl-afu-error.o
>
> diff --git a/drivers/fpga/dfl-fme-error.c b/drivers/fpga/dfl-fme-error.c
> new file mode 100644
> index 0000000..f2bd5f8
> --- /dev/null
> +++ b/drivers/fpga/dfl-fme-error.c
> @@ -0,0 +1,390 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Driver for FPGA Management Engine Error Management
> + *
> + * Copyright 2019 Intel Corporation, Inc.
> + *
> + * Authors:
> + *   Kang Luwei <luwei.kang@xxxxxxxxx>
> + *   Xiao Guangrong <guangrong.xiao@xxxxxxxxxxxxxxx>
> + *   Wu Hao <hao.wu@xxxxxxxxx>
> + *   Joseph Grecco <joe.grecco@xxxxxxxxx>
> + *   Enno Luebbers <enno.luebbers@xxxxxxxxx>
> + *   Tim Whisonant <tim.whisonant@xxxxxxxxx>
> + *   Ananda Ravuri <ananda.ravuri@xxxxxxxxx>
> + *   Mitchel, Henry <henry.mitchel@xxxxxxxxx>
> + */
> +
> +#include <linux/uaccess.h>
> +
> +#include "dfl.h"
> +#include "dfl-fme.h"
> +
> +#define FME_ERROR_MASK         0x8
> +#define FME_ERROR              0x10
> +#define MBP_ERROR              BIT_ULL(6)
> +#define PCIE0_ERROR_MASK       0x18
> +#define PCIE0_ERROR            0x20
> +#define PCIE1_ERROR_MASK       0x28
> +#define PCIE1_ERROR            0x30
> +#define FME_FIRST_ERROR                0x38
> +#define FME_NEXT_ERROR         0x40
> +#define RAS_NONFAT_ERROR_MASK  0x48
> +#define RAS_NONFAT_ERROR       0x50
> +#define RAS_CATFAT_ERROR_MASK  0x58
> +#define RAS_CATFAT_ERROR       0x60
> +#define RAS_ERROR_INJECT       0x68
> +#define INJECT_ERROR_MASK      GENMASK_ULL(2, 0)
> +
> +static ssize_t errors_show(struct device *dev,
> +                          struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_ERROR));
> +}
> +static DEVICE_ATTR_RO(errors);
> +
> +static ssize_t first_error_show(struct device *dev,
> +                               struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_FIRST_ERROR));
> +}
> +static DEVICE_ATTR_RO(first_error);
> +
> +static ssize_t next_error_show(struct device *dev,
> +                              struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + FME_NEXT_ERROR));
> +}
> +static DEVICE_ATTR_RO(next_error);
> +
> +static ssize_t clear_store(struct device *dev, struct device_attribute *attr,
> +                          const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       struct dfl_feature *feature;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;

from kstrtox.c:
* Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.

Your sysfs doc could be updated to explain these return codes.

> +
> +       feature = dfl_get_feature_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +       base = feature->ioaddr;
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + FME_ERROR_MASK);
> +
> +       v = readq(base + FME_ERROR);
> +       if (val != v) {
> +               ret = -EINVAL;

Oh wait, that's what I thought it should be ;)  so the doc just needs to change.

> +               goto done;

It would be easy to avoid using 'goto' here.

> +       }
> +
> +       writeq(v, base + FME_ERROR);
> +       v = readq(base + FME_FIRST_ERROR);
> +       writeq(v, base + FME_FIRST_ERROR);
> +       v = readq(base + FME_NEXT_ERROR);
> +       writeq(v, base + FME_NEXT_ERROR);
> +
> +done:
> +       /* Workaround: disable MBP_ERROR if feature revision is 0 */
> +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> +              base + FME_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_WO(clear);
> +
> +static ssize_t revision_show(struct device *dev, struct device_attribute *attr,
> +                            char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "%u\n", dfl_feature_revision(base));
> +}
> +static DEVICE_ATTR_RO(revision);

The revision attr to be documented in the sysfs doc.

> +
> +static ssize_t pcie0_errors_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + PCIE0_ERROR));
> +}
> +
> +static ssize_t pcie0_errors_store(struct device *dev,

The sysfs doc shows this as read-only.

> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + PCIE0_ERROR_MASK);
> +
> +       v = readq(base + PCIE0_ERROR);
> +       if (val != v)
> +               ret = -EBUSY;
> +       else
> +               writeq(v, base + PCIE0_ERROR);
> +
> +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie0_errors);
> +
> +static ssize_t pcie1_errors_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + PCIE1_ERROR));
> +}
> +
> +static ssize_t pcie1_errors_store(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)

Same here, documentation needs to show as R/W and explain what happens
when written.

> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v, val;
> +       int ret;
> +
> +       ret = kstrtou64(buf, 0, &val);
> +       if (ret)
> +               return ret;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       writeq(GENMASK_ULL(63, 0), base + PCIE1_ERROR_MASK);
> +
> +       v = readq(base + PCIE1_ERROR);
> +       if (val != v)
> +               ret = -EBUSY;
> +       else
> +               writeq(v, base + PCIE1_ERROR);
> +
> +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> +       mutex_unlock(&pdata->lock);
> +       return ret ? ret : count;
> +}
> +static DEVICE_ATTR_RW(pcie1_errors);
> +
> +static ssize_t nonfatal_errors_show(struct device *dev,
> +                                   struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + RAS_NONFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(nonfatal_errors);
> +
> +static ssize_t catfatal_errors_show(struct device *dev,
> +                                   struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)readq(base + RAS_CATFAT_ERROR));
> +}
> +static DEVICE_ATTR_RO(catfatal_errors);
> +
> +static ssize_t inject_error_show(struct device *dev,
> +                                struct device_attribute *attr, char *buf)
> +{
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u64 v;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       v = readq(base + RAS_ERROR_INJECT);
> +
> +       return scnprintf(buf, PAGE_SIZE, "0x%llx\n",
> +                        (unsigned long long)FIELD_GET(INJECT_ERROR_MASK, v));
> +}
> +
> +static ssize_t inject_error_store(struct device *dev,
> +                                 struct device_attribute *attr,
> +                                 const char *buf, size_t count)
> +{
> +       struct dfl_feature_platform_data *pdata = dev_get_platdata(dev->parent);
> +       struct device *err_dev = dev->parent;
> +       void __iomem *base;
> +       u8 inject_error;
> +       int ret;
> +       u64 v;
> +
> +       ret = kstrtou8(buf, 0, &inject_error);
> +       if (ret)
> +               return ret;
> +
> +       if (inject_error & ~INJECT_ERROR_MASK)
> +               return -EINVAL;
> +
> +       base = dfl_get_feature_ioaddr_by_id(err_dev, FME_FEATURE_ID_GLOBAL_ERR);
> +
> +       mutex_lock(&pdata->lock);
> +       v = readq(base + RAS_ERROR_INJECT);
> +       v &= ~INJECT_ERROR_MASK;
> +       v |= FIELD_PREP(INJECT_ERROR_MASK, inject_error);
> +       writeq(v, base + RAS_ERROR_INJECT);
> +       mutex_unlock(&pdata->lock);
> +
> +       return count;
> +}
> +static DEVICE_ATTR_RW(inject_error);
> +
> +static struct attribute *fme_errors_attrs[] = {
> +       &dev_attr_errors.attr,
> +       &dev_attr_first_error.attr,
> +       &dev_attr_next_error.attr,
> +       &dev_attr_clear.attr,
> +       NULL,
> +};
> +
> +static struct attribute_group fme_errors_attr_group = {
> +       .attrs  = fme_errors_attrs,
> +       .name   = "fme-errors",
> +};
> +
> +static struct attribute *errors_attrs[] = {
> +       &dev_attr_revision.attr,
> +       &dev_attr_pcie0_errors.attr,
> +       &dev_attr_pcie1_errors.attr,
> +       &dev_attr_nonfatal_errors.attr,
> +       &dev_attr_catfatal_errors.attr,
> +       &dev_attr_inject_error.attr,
> +       NULL,
> +};
> +
> +static struct attribute_group errors_attr_group = {
> +       .attrs  = errors_attrs,
> +};
> +
> +static const struct attribute_group *error_groups[] = {
> +       &fme_errors_attr_group,
> +       &errors_attr_group,
> +       NULL
> +};
> +
> +static void fme_error_enable(struct dfl_feature *feature)
> +{
> +       void __iomem *base = feature->ioaddr;
> +
> +       /* Workaround: disable MBP_ERROR if revision is 0 */
> +       writeq(dfl_feature_revision(feature->ioaddr) ? 0ULL : MBP_ERROR,
> +              base + FME_ERROR_MASK);
> +       writeq(0ULL, base + PCIE0_ERROR_MASK);
> +       writeq(0ULL, base + PCIE1_ERROR_MASK);
> +       writeq(0ULL, base + RAS_NONFAT_ERROR_MASK);
> +       writeq(0ULL, base + RAS_CATFAT_ERROR_MASK);
> +}
> +
> +static void err_dev_release(struct device *dev)
> +{
> +       kfree(dev);
> +}
> +
> +static int fme_global_err_init(struct platform_device *pdev,
> +                              struct dfl_feature *feature)
> +{
> +       struct device *dev;
> +       int ret = 0;
> +
> +       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +       if (!dev)
> +               return -ENOMEM;
> +
> +       dev->parent = &pdev->dev;
> +       dev->release = err_dev_release;
> +       dev_set_name(dev, "errors");
> +
> +       fme_error_enable(feature);
> +
> +       ret = device_register(dev);
> +       if (ret) {
> +               put_device(dev);
> +               return ret;
> +       }
> +
> +       ret = sysfs_create_groups(&dev->kobj, error_groups);
> +       if (ret) {
> +               device_unregister(dev);
> +               return ret;
> +       }
> +
> +       feature->priv = dev;
> +
> +       return ret;
> +}
> +
> +static void fme_global_err_uinit(struct platform_device *pdev,
> +                                struct dfl_feature *feature)
> +{
> +       struct device *dev = feature->priv;
> +
> +       sysfs_remove_groups(&dev->kobj, error_groups);
> +       device_unregister(dev);
> +}
> +
> +const struct dfl_feature_id fme_global_err_id_table[] = {
> +       {.id = FME_FEATURE_ID_GLOBAL_ERR,},
> +       {0,}
> +};
> +
> +const struct dfl_feature_ops fme_global_err_ops = {
> +       .init = fme_global_err_init,
> +       .uinit = fme_global_err_uinit,
> +};
> diff --git a/drivers/fpga/dfl-fme-main.c b/drivers/fpga/dfl-fme-main.c
> index dafa6580..76cb112 100644
> --- a/drivers/fpga/dfl-fme-main.c
> +++ b/drivers/fpga/dfl-fme-main.c
> @@ -686,6 +686,10 @@ static struct dfl_feature_driver fme_feature_drvs[] = {
>                 .ops = &fme_power_mgmt_ops,
>         },
>         {
> +               .id_table = fme_global_err_id_table,
> +               .ops = &fme_global_err_ops,
> +       },
> +       {
>                 .ops = NULL,
>         },
>  };
> diff --git a/drivers/fpga/dfl-fme.h b/drivers/fpga/dfl-fme.h
> index 7a021c4..5fbe3f5 100644
> --- a/drivers/fpga/dfl-fme.h
> +++ b/drivers/fpga/dfl-fme.h
> @@ -37,5 +37,7 @@ struct dfl_fme {
>
>  extern const struct dfl_feature_ops fme_pr_mgmt_ops;
>  extern const struct dfl_feature_id fme_pr_mgmt_id_table[];
> +extern const struct dfl_feature_ops fme_global_err_ops;
> +extern const struct dfl_feature_id fme_global_err_id_table[];
>
>  #endif /* __DFL_FME_H */
> diff --git a/drivers/fpga/dfl.h b/drivers/fpga/dfl.h
> index fbc57f0..6c32080 100644
> --- a/drivers/fpga/dfl.h
> +++ b/drivers/fpga/dfl.h
> @@ -197,12 +197,14 @@ struct dfl_feature_driver {
>   *                 feature dev (platform device)'s reources.
>   * @ioaddr: mapped mmio resource address.
>   * @ops: ops of this sub feature.
> + * @priv: priv data of this feature.
>   */
>  struct dfl_feature {
>         u64 id;
>         int resource_index;
>         void __iomem *ioaddr;
>         const struct dfl_feature_ops *ops;
> +       void *priv;
>  };
>
>  #define DEV_STATUS_IN_USE      0
> --
> 2.7.4
>

Thanks,
Alan



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux