Hi,

On Thu, Jun 09, 2016 at 06:24:51PM -0700, Tai Nguyen wrote:
> +#include <linux/acpi.h>
> +#include <linux/efi.h>
> +#include <linux/module.h>
> +#include <linux/io.h>
> +#include <linux/clk.h>
> +#include <linux/of_platform.h>
> +#include <linux/of_address.h>
> +#include <linux/of_fdt.h>
> +#include <linux/of_irq.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/perf_event.h>
> +#include <linux/module.h>
> +#include <linux/cpumask.h>
> +#include <linux/slab.h>
> +#include <linux/mfd/syscon.h>
> +#include <linux/regmap.h>

Minor nit: please sort these alphabetically (it makes scanning them by
eye easier and can help when there are conflicts). Sorting would also
have made the duplicate <linux/module.h> include stand out.

[...]

> +static struct dev_ext_attribute l3c_pmu_format_attrs[] = {
> +        PMU_FORMAT_EXT_ATTR(l3c_eventid, "config:0-7"),
> +        PMU_FORMAT_EXT_ATTR(l3c_agentid, "config1:0-9"),
> +};
> +
> +static struct dev_ext_attribute iob_pmu_format_attrs[] = {
> +        PMU_FORMAT_EXT_ATTR(iob_eventid, "config:0-7"),
> +        PMU_FORMAT_EXT_ATTR(iob_agentid, "config1:0-63"),
> +};
> +
> +static struct dev_ext_attribute mcb_pmu_format_attrs[] = {
> +        PMU_FORMAT_EXT_ATTR(mcb_eventid, "config:0-5"),
> +        PMU_FORMAT_EXT_ATTR(mcb_agentid, "config1:0-9"),
> +};
> +
> +static struct dev_ext_attribute mc_pmu_format_attrs[] = {
> +        PMU_FORMAT_EXT_ATTR(mc_eventid, "config:0-28"),
> +};

Please make these structures const.

> +static struct attribute_group pmu_format_attr_group = {
> +        .name = "format",
> +        .attrs = NULL,          /* Filled in xgene_pmu_alloc_attrs */
> +};

I think you should just have a (static const) format_attr_group for
each class, which you reuse and share across instances, rather than
bothering with dynamic allocation (which I think is broken -- more on
that below).

To save some pain, I think you can use a compound literal to
initialise each of those in one go:

static const struct attribute_group l3c_pmu_format_attr_group = {
        .name = "format",
        .attrs = (const struct dev_ext_attribute[]) {
                PMU_FORMAT_EXT_ATTR(l3c_eventid, "config:0-7"),
                PMU_FORMAT_EXT_ATTR(l3c_agentid, "config1:0-9"),
        },
};

You'll need a full pmu attr group for each class also, but you don't
lose much by doing so.
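That is, each class gets its own static array of groups, along these
lines (completely untested sketch; l3c_pmu_events_attr_group is assumed
to be built statically the same way as the format group above, and the
cpumask group shared across classes):

static const struct attribute_group *l3c_pmu_attr_groups[] = {
        &l3c_pmu_format_attr_group,
        &l3c_pmu_events_attr_group,
        &pmu_cpumask_attr_group,
        NULL,
};

Then xgene_pmu_dev_add() only has to pick the right array in its
switch statement, with no allocation or copying at all.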
> +
> +/*
> + * sysfs event attributes
> + */
> +static ssize_t _xgene_pmu_event_show(struct device *dev,
> +                                     struct device_attribute *attr, char *buf)
> +{
> +        struct dev_ext_attribute *eattr;
> +
> +        eattr = container_of(attr, struct dev_ext_attribute, attr);
> +        return sprintf(buf, "config=0x%lx\n", (unsigned long)eattr->var);
> +}
> +
> +#define PMU_EVENT_EXT_ATTR(_name, _config) \
> +        PMU_EXT_ATTR(_name, _xgene_pmu_event_show, (unsigned long)_config)
> +
> +static struct dev_ext_attribute l3c_pmu_events_attrs[] = {
> +        PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> +        PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> +        PMU_EVENT_EXT_ATTR(read-hit, 0x02),
> +        PMU_EVENT_EXT_ATTR(read-miss, 0x03),
> +        PMU_EVENT_EXT_ATTR(write-need-replacement, 0x06),
> +        PMU_EVENT_EXT_ATTR(write-not-need-replacement, 0x07),
> +        PMU_EVENT_EXT_ATTR(tq-full, 0x08),
> +        PMU_EVENT_EXT_ATTR(ackq-full, 0x09),
> +        PMU_EVENT_EXT_ATTR(wdb-full, 0x0a),
> +        PMU_EVENT_EXT_ATTR(bank-fifo-full, 0x0b),
> +        PMU_EVENT_EXT_ATTR(odb-full, 0x0c),
> +        PMU_EVENT_EXT_ATTR(wbq-full, 0x0d),
> +        PMU_EVENT_EXT_ATTR(bank-conflict-fifo-issue, 0x0e),
> +        PMU_EVENT_EXT_ATTR(bank-fifo-issue, 0x0f),
> +};
> +
> +static struct dev_ext_attribute iob_pmu_events_attrs[] = {
> +        PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> +        PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> +        PMU_EVENT_EXT_ATTR(axi0-read, 0x02),
> +        PMU_EVENT_EXT_ATTR(axi0-read-partial, 0x03),
> +        PMU_EVENT_EXT_ATTR(axi1-read, 0x04),
> +        PMU_EVENT_EXT_ATTR(axi1-read-partial, 0x05),
> +        PMU_EVENT_EXT_ATTR(csw-read-block, 0x06),
> +        PMU_EVENT_EXT_ATTR(csw-read-partial, 0x07),
> +        PMU_EVENT_EXT_ATTR(axi0-write, 0x10),
> +        PMU_EVENT_EXT_ATTR(axi0-write-partial, 0x11),
> +        PMU_EVENT_EXT_ATTR(axi1-write, 0x13),
> +        PMU_EVENT_EXT_ATTR(axi1-write-partial, 0x14),
> +        PMU_EVENT_EXT_ATTR(csw-inbound-dirty, 0x16),
> +};
> +
> +static struct dev_ext_attribute mcb_pmu_events_attrs[] = {
> +        PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> +        PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> +        PMU_EVENT_EXT_ATTR(csw-read, 0x02),
> +        PMU_EVENT_EXT_ATTR(csw-write-request, 0x03),
> +        PMU_EVENT_EXT_ATTR(mcb-csw-stall, 0x04),
> +        PMU_EVENT_EXT_ATTR(cancel-read-gack, 0x05),
> +};
> +
> +static struct dev_ext_attribute mc_pmu_events_attrs[] = {
> +        PMU_EVENT_EXT_ATTR(cycle-count, 0x00),
> +        PMU_EVENT_EXT_ATTR(cycle-count-div-64, 0x01),
> +        PMU_EVENT_EXT_ATTR(act-cmd-sent, 0x02),
> +        PMU_EVENT_EXT_ATTR(pre-cmd-sent, 0x03),
> +        PMU_EVENT_EXT_ATTR(rd-cmd-sent, 0x04),
> +        PMU_EVENT_EXT_ATTR(rda-cmd-sent, 0x05),
> +        PMU_EVENT_EXT_ATTR(wr-cmd-sent, 0x06),
> +        PMU_EVENT_EXT_ATTR(wra-cmd-sent, 0x07),
> +        PMU_EVENT_EXT_ATTR(pde-cmd-sent, 0x08),
> +        PMU_EVENT_EXT_ATTR(sre-cmd-sent, 0x09),
> +        PMU_EVENT_EXT_ATTR(prea-cmd-sent, 0x0a),
> +        PMU_EVENT_EXT_ATTR(ref-cmd-sent, 0x0b),
> +        PMU_EVENT_EXT_ATTR(rd-rda-cmd-sent, 0x0c),
> +        PMU_EVENT_EXT_ATTR(wr-wra-cmd-sent, 0x0d),
> +        PMU_EVENT_EXT_ATTR(in-rd-collision, 0x0e),
> +        PMU_EVENT_EXT_ATTR(in-wr-collision, 0x0f),
> +        PMU_EVENT_EXT_ATTR(collision-queue-not-empty, 0x10),
> +        PMU_EVENT_EXT_ATTR(collision-queue-full, 0x11),
> +        PMU_EVENT_EXT_ATTR(mcu-request, 0x12),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-request, 0x13),
> +        PMU_EVENT_EXT_ATTR(mcu-hp-rd-request, 0x14),
> +        PMU_EVENT_EXT_ATTR(mcu-wr-request, 0x15),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-proceed-all, 0x16),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-proceed-cancel, 0x17),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-response, 0x18),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-proceed-speculative-all, 0x19),
> +        PMU_EVENT_EXT_ATTR(mcu-rd-proceed-speculative-cancel, 0x1a),
> +        PMU_EVENT_EXT_ATTR(mcu-wr-proceed-all, 0x1b),
> +        PMU_EVENT_EXT_ATTR(mcu-wr-proceed-cancel, 0x1c),
> +};
> +
> +static struct attribute_group pmu_events_attr_group = {
> +        .name = "events",
> +        .attrs = NULL,          /* Filled in xgene_pmu_alloc_attrs */
> +};

My comments for the format groups apply here too.

> +
> +/*
> + * sysfs cpumask attributes
> + */
> +static ssize_t xgene_pmu_cpumask_show(struct device *dev,
> +                                      struct device_attribute *attr, char *buf)
> +{
> +        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(dev_get_drvdata(dev));
> +
> +        return cpumap_print_to_pagebuf(true, buf, &pmu_dev->parent->cpu);
> +}
> +static DEVICE_ATTR(cpumask, S_IRUGO, xgene_pmu_cpumask_show, NULL);
> +
> +static struct attribute *xgene_pmu_cpumask_attrs[] = {
> +        &dev_attr_cpumask.attr,
> +        NULL,
> +};
> +
> +static struct attribute_group pmu_cpumask_attr_group = {
> +        .attrs = xgene_pmu_cpumask_attrs,
> +};

Please make these const.

> +
> +static const struct attribute_group *pmu_attr_groups[] = {
> +        &pmu_format_attr_group,
> +        &pmu_cpumask_attr_group,
> +        &pmu_events_attr_group,
> +        NULL
> +};

As mentioned earlier, you'll need one of these per class.

[...]

> +static int xgene_perf_event_init(struct perf_event *event)
> +{
> +        struct xgene_pmu_dev *pmu_dev = to_pmu_dev(event->pmu);
> +        struct hw_perf_event *hwc = &event->hw;
> +        u64 config, config1;
> +
> +        /* Test the event attr type check for PMU enumeration */
> +        if (event->attr.type != event->pmu->type)
> +                return -ENOENT;
> +
> +        /*
> +         * SOC PMU counters are shared across all cores.
> +         * Therefore, it does not support per-process mode.
> +         * Also, it does not support event sampling mode.
> +         */
> +        if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
> +                return -EINVAL;
> +
> +        /* SOC counters do not have usr/os/guest/host bits */
> +        if (event->attr.exclude_user || event->attr.exclude_kernel ||
> +            event->attr.exclude_host || event->attr.exclude_guest)
> +                return -EINVAL;
> +
> +        if (event->cpu < 0)
> +                return -EINVAL;
> +        /*
> +         * Many perf core operations (eg. events rotation) operate on a
> +         * single CPU context. This is obvious for CPU PMUs, where one
> +         * expects the same sets of events being observed on all CPUs,
> +         * but can lead to issues for off-core PMUs, where each
> +         * event could be theoretically assigned to a different CPU. To
> +         * mitigate this, we enforce CPU assignment to one, selected
> +         * processor (the one described in the "cpumask" attribute).
> +         */
> +        event->cpu = cpumask_first(&pmu_dev->parent->cpu);

I didn't spot a perf_pmu_migrate_context call anywhere. Do you not plan
to handle hotplug?
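For that you'd want a CPU notifier that moves the events and the IRQ
over to another CPU when the current one goes down, as we do in
arm-ccn. A completely untested sketch of the shape of it -- the cpu_nb
and irq members of struct xgene_pmu, and the pmus list of
xgene_pmu_dev_ctx entries used to reach each registered PMU instance,
are all assumptions for illustration:

static int xgene_pmu_cpu_notifier(struct notifier_block *nb,
                                  unsigned long action, void *hcpu)
{
        struct xgene_pmu *xgene_pmu = container_of(nb, struct xgene_pmu,
                                                   cpu_nb);
        unsigned int cpu = (long)hcpu;
        struct xgene_pmu_dev_ctx *ctx;
        unsigned int target;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                if (!cpumask_test_and_clear_cpu(cpu, &xgene_pmu->cpu))
                        break;
                target = cpumask_any_but(cpu_online_mask, cpu);
                if (target >= nr_cpu_ids)
                        break;
                /* migrate the events of every registered PMU instance */
                list_for_each_entry(ctx, &xgene_pmu->pmus, next)
                        perf_pmu_migrate_context(&ctx->pmu_dev->pmu,
                                                 cpu, target);
                cpumask_set_cpu(target, &xgene_pmu->cpu);
                /* keep overflow interrupts on the CPU handling events */
                WARN_ON(irq_set_affinity(xgene_pmu->irq, &xgene_pmu->cpu));
                break;
        }

        return NOTIFY_OK;
}

...registered with register_cpu_notifier() at probe time.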
> +
> +        config = event->attr.config;
> +        config1 = event->attr.config1;
> +
> +        hwc->config = config;
> +        /*
> +         * Each bit of the config1 field represents an agent from which the
> +         * request of the event come. The event is counted only if it's caused
> +         * by a request of an agent has the bit set.
> +         * By default, the event is counted for all agents.
> +         */
> +        if (config1)
> +                hwc->extra_reg.config = config1;
> +        else
> +                hwc->extra_reg.config = 0xFFFFFFFFFFFFFFFFULL;
> +

If you treated the bits as having the opposite meaning (i.e. a bit set
means ignore an agent), then the zero default would not have to be
special-cased here. Currently 0 and 0xFFFFFFFFFFFFFFFFULL mean the same
thing, which is a little odd.

You also need to verify the event grouping, e.g. avoiding cross-pmu
groups. See what we do in the arm-ccn driver.

[...]
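For reference, the arm-ccn check in event_init looks roughly like the
below (untested when transplanted here; sibling is a local
struct perf_event *):

        /*
         * Groups mixing events from different hardware PMUs cannot be
         * scheduled as a unit, so reject them. Software events are
         * fine, as they're always schedulable.
         */
        if (event->group_leader->pmu != event->pmu &&
            !is_software_event(event->group_leader))
                return -EINVAL;

        list_for_each_entry(sibling, &event->group_leader->sibling_list,
                            group_entry)
                if (sibling->pmu != event->pmu &&
                    !is_software_event(sibling))
                        return -EINVAL;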
> +        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, count)
> +            != prev_raw_count)
> +                return;

Please leave the '!=' dangling on the first line, i.e.

        if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, count) !=
            prev_raw_count)
                return;

It makes it clearer that there is more to the expression.

[...]

> +static struct attribute **alloc_attrs(struct device *dev, u32 n,
> +                                      struct dev_ext_attribute *dev_attr)
> +{
> +        struct attribute **attrs;
> +        int i;
> +
> +        /* Alloc n + 1 (for terminating NULL) */
> +        attrs = devm_kcalloc(dev, n + 1, sizeof(struct attribute *),
> +                             GFP_KERNEL);
> +        if (!attrs)
> +                return attrs;
> +
> +        for (i = 0; i < n; i++)
> +                attrs[i] = &dev_attr[i].attr.attr;
> +        return attrs;
> +}
> +
> +static int
> +xgene_pmu_alloc_attrs(struct device *dev, struct xgene_pmu_dev *pmu_dev)
> +{
> +        struct attribute **attrs;
> +
> +        if (pmu_dev->nformat_attrs) {
> +                attrs = alloc_attrs(dev, pmu_dev->nformat_attrs,
> +                                    pmu_dev->format_attr);
> +                if (!attrs)
> +                        return -ENOMEM;
> +                pmu_format_attr_group.attrs = attrs;
> +        }
> +
> +        if (pmu_dev->nevents_attrs) {
> +                attrs = alloc_attrs(dev, pmu_dev->nevents_attrs,
> +                                    pmu_dev->events_attr);
> +                if (!attrs)
> +                        return -ENOMEM;
> +                pmu_events_attr_group.attrs = attrs;
> +        }
> +
> +        pmu_dev->attr_groups = pmu_attr_groups;
> +
> +        return 0;
> +}

I don't see why we need to dynamically allocate anything, given we make
a full copy of each of the existing arrays without modification. We
should just be able to use them as-is. As I mentioned above, please
just statically allocate the structures you need and share them.

[...]

> +static int
> +xgene_pmu_dev_add(struct xgene_pmu *xgene_pmu, struct xgene_pmu_dev_ctx *ctx)
> +{
> +        struct device *dev = xgene_pmu->dev;
> +        struct xgene_pmu_dev *pmu;
> +        int rc;
> +
> +        pmu = devm_kzalloc(dev, sizeof(*pmu), GFP_KERNEL);
> +        if (!pmu)
> +                return -ENOMEM;
> +        pmu->parent = xgene_pmu;
> +        pmu->inf = &ctx->inf;
> +        ctx->pmu_dev = pmu;
> +
> +        switch (pmu->inf->type) {
> +        case PMU_TYPE_L3C:
> +                pmu->format_attr = l3c_pmu_format_attrs;
> +                pmu->nformat_attrs = ARRAY_SIZE(l3c_pmu_format_attrs);
> +                pmu->events_attr = l3c_pmu_events_attrs;
> +                pmu->nevents_attrs = ARRAY_SIZE(l3c_pmu_events_attrs);
> +                break;
> +        case PMU_TYPE_IOB:
> +                pmu->format_attr = iob_pmu_format_attrs;
> +                pmu->nformat_attrs = ARRAY_SIZE(iob_pmu_format_attrs);
> +                pmu->events_attr = iob_pmu_events_attrs;
> +                pmu->nevents_attrs = ARRAY_SIZE(iob_pmu_events_attrs);
> +                break;
> +        case PMU_TYPE_MCB:
> +                if (!(xgene_pmu->mcb_active_mask & pmu->inf->enable_mask))
> +                        goto dev_err;
> +                pmu->format_attr = mcb_pmu_format_attrs;
> +                pmu->nformat_attrs = ARRAY_SIZE(mcb_pmu_format_attrs);
> +                pmu->events_attr = mcb_pmu_events_attrs;
> +                pmu->nevents_attrs = ARRAY_SIZE(mcb_pmu_events_attrs);
> +                break;
> +        case PMU_TYPE_MC:
> +                if (!(xgene_pmu->mc_active_mask & pmu->inf->enable_mask))
> +                        goto dev_err;
> +                pmu->format_attr = mc_pmu_format_attrs;
> +                pmu->nformat_attrs = ARRAY_SIZE(mc_pmu_format_attrs);
> +                pmu->events_attr = mc_pmu_events_attrs;
> +                pmu->nevents_attrs = ARRAY_SIZE(mc_pmu_events_attrs);
> +                break;
> +        default:
> +                return -EINVAL;
> +        }
> +
> +        rc = xgene_pmu_alloc_attrs(dev, pmu);
> +        if (rc) {
> +                dev_err(dev, "%s PMU: Failed to alloc attributes\n", ctx->name);
> +                goto dev_err;
> +        }
> +
> +        rc = xgene_init_perf(pmu, ctx->name);
> +        if (rc) {
> +                dev_err(dev, "%s PMU: Failed to init perf driver\n", ctx->name);
> +                goto dev_err;
> +        }
> +
> +        dev_info(dev, "%s PMU registered\n", ctx->name);
> +
> +        /* All attribute allocations can be free'd after perf_register_pmu */
> +        deallocate_attrs(dev, pmu);

As far as I can see, that is not true. The core perf code doesn't make
a copy of the attrs; it just continues to use them as-is. So this looks
to be broken and very unsafe. Please get rid of the dynamic allocation
and copying.

[...]

> +static irqreturn_t _xgene_pmu_isr(int irq, struct xgene_pmu_dev *pmu_dev)
> +{
> +        struct perf_event *event = NULL;
> +        struct perf_sample_data data;

You never sample, so don't need perf_sample_data.

> +        struct xgene_pmu *xgene_pmu;
> +        struct hw_perf_event *hwc;
> +        int idx;
> +        u32 val;
> +
> +        /* Get interrupt counter source */
> +        val = readl(pmu_dev->inf->csr + PMU_PMOVSR);
> +        idx = ffs(val) - 1;

bit 0 is never set? What if multiple counters overflow? You should
iterate over all the counters which have overflown (as we do in the
arm-ccn driver).
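Something like the below, completely untested, with PMU_MAX_COUNTERS
standing in for however many counters each PMU instance has:

        unsigned long overflow = val & PMU_OVERFLOW_MASK;

        /* handle every counter that has overflowed, not just the first */
        for_each_set_bit(idx, &overflow, PMU_MAX_COUNTERS) {
                struct perf_event *event = pmu_dev->pmu_counter_event[idx];
                struct hw_perf_event *hwc;

                /* skip overflow bits with no event installed */
                if (!event)
                        continue;

                hwc = &event->hw;
                xgene_perf_event_update(event, hwc, idx);
                xgene_perf_event_set_period(event, hwc, idx);
        }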
> +        if (!(val & PMU_OVERFLOW_MASK))
> +                goto out;
> +        event = pmu_dev->pmu_counter_event[idx];
> +
> +        /* Ignore if we don't have an event. */
> +        if (!event)
> +                goto out;
> +
> +        hwc = &event->hw;
> +
> +        xgene_perf_event_update(event, hwc, idx);
> +        perf_sample_data_init(&data, 0, hwc->last_period);

You aren't sampling, so don't need this.

> +        if (!xgene_perf_event_set_period(event, hwc, idx))
> +                goto out;
> +
> +out:
> +        /* Clear interrupt flag */
> +        xgene_pmu = pmu_dev->parent;
> +        if (xgene_pmu->version == 1)

It would be better to consistently use the PCP_PMU_V{1,2} defines.

[...]

> +        version = (dev_id == PCP_PMU_V1) ? 1 : 2;

As above, why not set version to the dev_id, and use the PCP_PMU_V*
defines consistently when comparing xgene_pmu::version?

> +        irq = platform_get_irq(pdev, 0);
> +        if (irq < 0) {
> +                dev_err(&pdev->dev, "No IRQ resource\n");
> +                rc = -EINVAL;
> +                goto err;
> +        }
> +        rc = devm_request_irq(&pdev->dev, irq, xgene_pmu_isr, IRQF_SHARED,
> +                              dev_name(&pdev->dev), xgene_pmu);
> +        if (rc) {
> +                dev_err(&pdev->dev, "Could not request IRQ %d\n", irq);
> +                goto err;
> +        }
> +        /* Pick one core to use for cpumask attributes */
> +        cpumask_set_cpu(smp_processor_id(), &xgene_pmu->cpu);

You also need to ensure that the IRQ occurs on this CPU.
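For example (untested), right after picking the CPU:

        /* make sure overflow interrupts land on the CPU handling events */
        rc = irq_set_affinity(irq, &xgene_pmu->cpu);
        if (rc) {
                dev_err(&pdev->dev, "Failed to set IRQ affinity\n");
                goto err;
        }

...and the affinity will need updating again if that CPU is ever
hotplugged off (see the notifier sketch above).

Thanks,
Mark.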