On Tue, Oct 25, 2022 at 12:39:29PM +0000, Kai Ye wrote: > UACCE adds the hardware error isolation API. Users can configure > the isolation frequency by this sysfs node. UACCE reports the device > isolate state to the user space. If the AER error frequency exceeds > the set value in one hour, the device will be isolated. > You are trying to "reach across" to different types of devices here, why? That feels backwards. Why isn't the device that needs to use this api just create a child class device of this type? > Signed-off-by: Kai Ye <yekai13@xxxxxxxxxx> > --- > drivers/misc/uacce/uacce.c | 145 +++++++++++++++++++++++++++++++++++++ > include/linux/uacce.h | 43 ++++++++++- > 2 files changed, 187 insertions(+), 1 deletion(-) > > diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c > index b70a013139c7..f293fcdcf44f 100644 > --- a/drivers/misc/uacce/uacce.c > +++ b/drivers/misc/uacce/uacce.c > @@ -7,10 +7,100 @@ > #include <linux/slab.h> > #include <linux/uacce.h> > > +#define MAX_ERR_ISOLATE_COUNT 65535 > + > static struct class *uacce_class; > static dev_t uacce_devt; > static DEFINE_XARRAY_ALLOC(uacce_xa); > > +static int cdev_get(struct device *dev, void *data) That's a very odd function, considering it does not "get" anything. And it does not work on cdev structures. > +{ > + struct uacce_device *uacce; > + struct device **t_dev = data; Why are you having to cast this? Why not make it a real pointer? > + > + uacce = container_of(dev, struct uacce_device, dev); > + if (uacce->parent == *t_dev) { > + *t_dev = dev; > + return 1; bool? And what happened to the reference count here? > + } > + > + return 0; > +} > + > +/** > + * dev_to_uacce - Get structure uacce device from its parent device > + * @dev the device > + */ > +struct uacce_device *dev_to_uacce(struct device *dev) > +{ > + struct device **tdev = &dev; > + int ret; > + > + ret = class_for_each_device(uacce_class, NULL, tdev, cdev_get); Ah, you use it here. No, sorry, you can not just walk all devices in the tree and assume this will work. Why do you not already know the device already? > + if (ret) { > + dev = *tdev; > + return container_of(dev, struct uacce_device, dev); > + } > + return NULL; > +} > +EXPORT_SYMBOL_GPL(dev_to_uacce); Where is the reference counting here? And again, horrible global function name :( > + > +/** > + * uacce_hw_err_isolate - Try to set the isolation status of the uacce device > + * according to user's configuration of isolation strategy. > + * @uacce the uacce device > + */ > +int uacce_hw_err_isolate(struct uacce_device *uacce) > +{ > + struct uacce_hw_err *err, *tmp, *hw_err; > + struct uacce_err_isolate *isolate_ctx; > + u32 count = 0; > + > + if (!uacce) > + return -EINVAL; How can this happen? > + > + isolate_ctx = uacce->isolate_ctx; > + > +#define SECONDS_PER_HOUR 3600 We don't already have this in a header file? > + > + /* All the hw errs are processed by PF driver */ > + if (uacce->is_vf || isolate_ctx->is_isolate || > + !isolate_ctx->hw_err_isolate_hz) > + return 0; > + > + hw_err = kzalloc(sizeof(*hw_err), GFP_KERNEL); > + if (!hw_err) > + return -ENOMEM; > + > + hw_err->timestamp = jiffies; > + list_for_each_entry_safe(err, tmp, &isolate_ctx->hw_errs, list) { > + if ((hw_err->timestamp - err->timestamp) / HZ > > + SECONDS_PER_HOUR) { > + list_del(&err->list); > + kfree(err); > + } else { > + count++; > + } > + } > + list_add(&hw_err->list, &isolate_ctx->hw_errs); > + > + if (count >= isolate_ctx->hw_err_isolate_hz) > + isolate_ctx->is_isolate = true; > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(uacce_hw_err_isolate); > + > +static void uacce_hw_err_destroy(struct uacce_device *uacce) > +{ > + struct uacce_hw_err *err, *tmp; > + > + list_for_each_entry_safe(err, tmp, &uacce->isolate_data.hw_errs, list) { > + list_del(&err->list); > + kfree(err); No reference counting at all? > + } > +} > + > /* > * If the parent driver or the device disappears, the queue state is invalid and > * ops are not usable anymore. > @@ -363,12 +453,59 @@ static ssize_t region_dus_size_show(struct device *dev, > uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT); > } > > +static ssize_t isolate_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct uacce_device *uacce = to_uacce_device(dev); > + int ret = UACCE_DEV_NORMAL; > + > + if (uacce->isolate_ctx->is_isolate) > + ret = UACCE_DEV_ISOLATE; > + > + return sysfs_emit(buf, "%d\n", ret); > +} > + > +static ssize_t isolate_strategy_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct uacce_device *uacce = to_uacce_device(dev); > + > + return sysfs_emit(buf, "%u\n", uacce->isolate_ctx->hw_err_isolate_hz); > +} > + > +static ssize_t isolate_strategy_store(struct device *dev, > + struct device_attribute *attr, > + const char *buf, size_t count) > +{ > + struct uacce_device *uacce = to_uacce_device(dev); > + unsigned long val; > + > + /* must be set by PF */ > + if (uacce->is_vf) > + return -EPERM; > + > + if (kstrtoul(buf, 0, &val) < 0) > + return -EINVAL; > + > + if (val > MAX_ERR_ISOLATE_COUNT) > + return -EINVAL; > + > + uacce->isolate_ctx->hw_err_isolate_hz = val; > + > + /* After the policy is updated, need to reset the hardware err list */ > + uacce_hw_err_destroy(uacce); > + > + return count; > +} > + > static DEVICE_ATTR_RO(api); > static DEVICE_ATTR_RO(flags); > static DEVICE_ATTR_RO(available_instances); > static DEVICE_ATTR_RO(algorithms); > static DEVICE_ATTR_RO(region_mmio_size); > static DEVICE_ATTR_RO(region_dus_size); > +static DEVICE_ATTR_RO(isolate); > +static DEVICE_ATTR_RW(isolate_strategy); No documentation for these new sysfs files? thanks, greg k-h