This patch should probably be squashed with Tvrtko's PMU enabling patch...

Make perf-stat workable with the i915 PMU. The major point is that the
current implementation of the i915 PMU exposes global counters rather
than per-task counters. Thus, the required changes are:
* Register the PMU with .task_ctx_nr=perf_invalid_context
* Expose a cpumask for the PMU with a single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)

Examples:

  perf stat -e i915/rcs0-busy/ workload.sh

This should error out with "failed to read counter" because the requested
counter cannot be queried in per-task mode (which is what this command
asks for).

  perf stat -e i915/rcs0-busy/ -a workload.sh
  perf stat -e i915/rcs0-busy/ -a -C0 workload.sh

These two commands should give the same result with correct counter
values. The counter is queried once, at the end of the workload. Example
output:

 Performance counter stats for 'system wide':

        649,547,987      i915/rcs0-busy/

        1.895530680 seconds time elapsed

  perf stat -e i915/rcs0-busy/ -a -I 100 workload.sh

This will query the counter periodically (every 100 ms) and dump the
output:

     0.100108369          4,137,438      i915/rcs0-busy/
i915/rcs0-busy/: 37037414 100149071 100149071
     0.200249024         37,037,414      i915/rcs0-busy/
i915/rcs0-busy/: 36935429 100145077 100145077
     0.300391916         36,935,429      i915/rcs0-busy/
i915/rcs0-busy/: 34262017 100126136 100126136
     0.400518037         34,262,017      i915/rcs0-busy/
i915/rcs0-busy/: 34539960 100126217 100126217

v1: Make pmu.busy_stats a reference counter to avoid busy stats going
    away when one of the events is deleted.
v2: Expose a cpumask for the i915 PMU to avoid creation of multiple
    events of the same type followed by counter aggregation by perf-stat.

Change-Id: I7d1abe747a4399196e72253f7b66441a6528dbee
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_pmu.c         | 71 +++++++++++++++++++++++++--------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +-
 2 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index bcdf2bc..c551d64 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -15,6 +15,8 @@
 
 #define ENGINE_SAMPLE_BITS (16)
 
+static cpumask_t i915_pmu_cpumask = CPU_MASK_CPU0;
+
 static u8 engine_config_sample(u64 config)
 {
         return config & I915_PMU_SAMPLE_MASK;
@@ -285,16 +287,24 @@ static int i915_pmu_event_init(struct perf_event *event)
 {
         struct drm_i915_private *i915 =
                 container_of(event->pmu, typeof(*i915), pmu.base);
+        int cpu;
         int ret;
 
-        /* XXX ideally only want pid == -1 && cpu == -1 */
-
         if (event->attr.type != event->pmu->type)
                 return -ENOENT;
 
         if (has_branch_stack(event))
                 return -EOPNOTSUPP;
 
+        if (event->cpu < 0)
+                return -EINVAL;
+
+        cpu = cpumask_any_and(&i915_pmu_cpumask,
+                              topology_sibling_cpumask(event->cpu));
+
+        if (cpu >= nr_cpu_ids)
+                return -ENODEV;
+
         ret = 0;
         if (is_engine_event(event)) {
                 ret = engine_event_init(event);
@@ -314,6 +324,7 @@ static int i915_pmu_event_init(struct perf_event *event)
         if (ret)
                 return ret;
 
+        event->cpu = cpu;
         if (!event->parent)
                 event->destroy = i915_pmu_event_destroy;
 
@@ -469,11 +480,16 @@ static void i915_pmu_enable(struct perf_event *event)
                                        ns_to_ktime(PERIOD), 0,
                                        HRTIMER_MODE_REL_PINNED);
                 i915->pmu.timer_enabled = true;
-        } else if (is_engine_event(event) && engine_needs_busy_stats(engine) &&
-                   !engine->pmu.busy_stats) {
-                engine->pmu.busy_stats = true;
-                if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
-                        queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+        } else if (is_engine_event(event) && engine_needs_busy_stats(engine)) {
+                /* Enable busy stats for the first event that demands them;
+                 * subsequent events just take a reference. If some events go
+                 * away, busy stats stay enabled until the remaining events
+                 * stop using them.
+                 */
+                if (engine->pmu.busy_stats++ == 0) {
+                        if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+                                queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+                }
         }
 
         spin_unlock_irqrestore(&i915->pmu.lock, flags);
@@ -495,16 +511,16 @@ static void i915_pmu_disable(struct perf_event *event)
         enum intel_engine_id id;
 
         engine = intel_engine_lookup_user(i915,
-                        engine_event_class(event),
-                        engine_event_instance(event));
+                                          engine_event_class(event),
+                                          engine_event_instance(event));
         GEM_BUG_ON(!engine);
         engine->pmu.enable &= ~BIT(engine_event_sample(event));
-        if (engine->pmu.busy_stats &&
-            !engine_needs_busy_stats(engine)) {
-                engine->pmu.busy_stats = false;
-                queue_delayed_work(i915->wq,
-                        &engine->pmu.disable_busy_stats,
-                        round_jiffies_up_relative(2 * HZ));
+        if (!engine_needs_busy_stats(engine)) {
+                if (engine->pmu.busy_stats && --engine->pmu.busy_stats == 0) {
+                        queue_delayed_work(i915->wq,
+                                &engine->pmu.disable_busy_stats,
+                                round_jiffies_up_relative(2 * HZ));
+                }
         }
         mask = 0;
         for_each_engine(engine, i915, id)
@@ -529,6 +545,8 @@ static void i915_pmu_event_start(struct perf_event *event, int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+        if (flags & PERF_EF_UPDATE)
+                i915_pmu_event_read(event);
         i915_pmu_disable(event);
 }
 
@@ -546,7 +564,7 @@ static int i915_pmu_event_add(struct perf_event *event, int flags)
 
 static void i915_pmu_event_del(struct perf_event *event, int flags)
 {
-        i915_pmu_disable(event);
+        i915_pmu_event_stop(event, PERF_EF_UPDATE);
 }
 
 static int i915_pmu_event_event_idx(struct perf_event *event)
@@ -656,9 +674,28 @@ static ssize_t i915_pmu_event_show(struct device *dev,
         .attrs = i915_pmu_events_attrs,
 };
 
+static ssize_t i915_pmu_get_attr_cpumask(struct device *dev,
+                                         struct device_attribute *attr,
+                                         char *buf)
+{
+        return cpumap_print_to_pagebuf(true, buf, &i915_pmu_cpumask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, i915_pmu_get_attr_cpumask, NULL);
+
+static struct attribute *i915_cpumask_attrs[] = {
+        &dev_attr_cpumask.attr,
+        NULL,
+};
+
+static struct attribute_group i915_pmu_cpumask_attr_group = {
+        .attrs = i915_cpumask_attrs,
+};
+
 static const struct attribute_group *i915_pmu_attr_groups[] = {
         &i915_pmu_format_attr_group,
         &i915_pmu_events_attr_group,
+        &i915_pmu_cpumask_attr_group,
         NULL
 };
 
@@ -687,7 +724,7 @@ void i915_pmu_register(struct drm_i915_private *i915)
                 return;
 
         i915->pmu.base.attr_groups = i915_pmu_attr_groups;
-        i915->pmu.base.task_ctx_nr = perf_sw_context;
+        i915->pmu.base.task_ctx_nr = perf_invalid_context;
         i915->pmu.base.event_init  = i915_pmu_event_init;
         i915->pmu.base.add         = i915_pmu_event_add;
         i915->pmu.base.del         = i915_pmu_event_del;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index fd5d838..0530e4e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -248,7 +248,7 @@ struct intel_engine_cs {
         struct {
                 u32 enable;
                 u64 sample[4];
-                bool busy_stats;
+                unsigned int busy_stats;
                 struct work_struct enable_busy_stats;
                 struct delayed_work disable_busy_stats;
         } pmu;
-- 
1.8.3.1
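For completeness, a small user-space sketch (not part of the patch) of how a
tool can consume the cpumask attribute added above. It assumes the PMU is
visible under the standard perf sysfs path as
/sys/bus/event_source/devices/i915/, which follows from the registration in
i915_pmu_register(); the program itself and its output wording are purely
illustrative.

/*
 * Illustrative only: read the cpumask exposed by the i915 PMU and report
 * which CPU(s) system-wide events should be opened on.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char mask[256];
        FILE *f = fopen("/sys/bus/event_source/devices/i915/cpumask", "r");

        if (!f) {
                perror("i915 PMU cpumask");
                return 1;
        }
        if (!fgets(mask, sizeof(mask), f)) {
                fclose(f);
                return 1;
        }
        fclose(f);
        mask[strcspn(mask, "\n")] = '\0';

        /*
         * Typically prints "0": perf-stat opens the event on a CPU from this
         * mask with pid == -1, which is why "-a" and "-a -C0" agree in the
         * examples above.
         */
        printf("open i915 events on CPU(s): %s\n", mask);
        return 0;
}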