This patch should probably be squashed with Tvrtko's PMU enabling patch...

Make perf-stat work with the i915 PMU. The key point is that the
current i915 PMU implementation exposes global counters rather than
per-task counters. Thus, the required changes are:

* Register the PMU with .task_ctx_nr = perf_invalid_context
* Expose a cpumask for the PMU with a single CPU in the mask
* Properly support pmu->stop(): it should call pmu->read()
* Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE)

Examples:

  perf stat -e i915/rcs0-busy/ workload.sh

This errors out with "failed to read counter" because the requested
counter cannot be queried in per-task mode, which is what this command
requests. (A minimal userspace sketch of this distinction follows the
patch.)

  perf stat -e i915/rcs0-busy/ -a workload.sh
  perf stat -e i915/rcs0-busy/ -a -C0 workload.sh

These two commands should give the same result with correct counter
values. The counter value is queried once at the end of the workload.
Example output:

 Performance counter stats for 'system wide':

       649,547,987      i915/rcs0-busy/

       1.895530680 seconds time elapsed

  perf stat -e i915/rcs0-busy/ -a -I 100 workload.sh

This queries the counter periodically (every 100 ms) and dumps the
output:

     0.100108369            4,137,438      i915/rcs0-busy/
i915/rcs0-busy/: 37037414 100149071 100149071
     0.200249024           37,037,414      i915/rcs0-busy/
i915/rcs0-busy/: 36935429 100145077 100145077
     0.300391916           36,935,429      i915/rcs0-busy/
i915/rcs0-busy/: 34262017 100126136 100126136
     0.400518037           34,262,017      i915/rcs0-busy/
i915/rcs0-busy/: 34539960 100126217 100126217

v1: Make pmu.busy_stats a reference counter so that busy stats do not
    go away when one of several events is deleted.
v2: Expose a cpumask for the i915 PMU to avoid creation of multiple
    events of the same type followed by counter aggregation in
    perf-stat.
v3: Track CPUs going online/offline to migrate the perf context. If
    (as is likely) the cpumask initially contains CPU0,
    CONFIG_BOOTPARAM_HOTPLUG_CPU0 is needed to observe the effect of
    the CPU status tracking.

Change-Id: I7d1abe747a4399196e72253f7b66441a6528dbee
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

cpumask

Change-Id: I145f59240b75f2b703e0531ec81af6cd05aae95c
Signed-off-by: Dmitry Rogozhkin <dmitry.v.rogozhkin@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_drv.h         |  19 +++---
 drivers/gpu/drm/i915/i915_pmu.c         | 115 +++++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +-
 3 files changed, 110 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b59da2c..e629e5e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2153,6 +2153,16 @@ enum {
 	__I915_NUM_PMU_SAMPLERS
 };
 
+struct i915_pmu {
+	struct hlist_node node;
+	struct pmu base;
+	spinlock_t lock;
+	struct hrtimer timer;
+	bool timer_enabled;
+	u64 enable;
+	u64 sample[__I915_NUM_PMU_SAMPLERS];
+};
+
 struct drm_i915_private {
 	struct drm_device drm;
 
@@ -2660,14 +2670,7 @@ struct drm_i915_private {
 		int irq;
 	} lpe_audio;
 
-	struct {
-		struct pmu base;
-		spinlock_t lock;
-		struct hrtimer timer;
-		bool timer_enabled;
-		u64 enable;
-		u64 sample[__I915_NUM_PMU_SAMPLERS];
-	} pmu;
+	struct i915_pmu pmu;
 
 	/*
 	 * NOTE: This is the dri1/ums dungeon, don't add stuff here.
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index bcdf2bc..7bfedb7 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -15,6 +15,8 @@
 
 #define ENGINE_SAMPLE_BITS (16)
 
+static cpumask_t i915_pmu_cpumask = CPU_MASK_NONE;
+
 static u8 engine_config_sample(u64 config)
 {
 	return config & I915_PMU_SAMPLE_MASK;
@@ -285,16 +287,24 @@ static int i915_pmu_event_init(struct perf_event *event)
 {
 	struct drm_i915_private *i915 =
 		container_of(event->pmu, typeof(*i915), pmu.base);
+	int cpu;
 	int ret;
 
-	/* XXX ideally only want pid == -1 && cpu == -1 */
-
 	if (event->attr.type != event->pmu->type)
 		return -ENOENT;
 
 	if (has_branch_stack(event))
 		return -EOPNOTSUPP;
 
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	cpu = cpumask_any_and(&i915_pmu_cpumask,
+			      topology_sibling_cpumask(event->cpu));
+
+	if (cpu >= nr_cpu_ids)
+		return -ENODEV;
+
 	ret = 0;
 	if (is_engine_event(event)) {
 		ret = engine_event_init(event);
@@ -314,6 +324,7 @@ static int i915_pmu_event_init(struct perf_event *event)
 	if (ret)
 		return ret;
 
+	event->cpu = cpu;
 	if (!event->parent)
 		event->destroy = i915_pmu_event_destroy;
 
@@ -469,11 +480,16 @@ static void i915_pmu_enable(struct perf_event *event)
 				      ns_to_ktime(PERIOD), 0,
 				      HRTIMER_MODE_REL_PINNED);
 		i915->pmu.timer_enabled = true;
-	} else if (is_engine_event(event) && engine_needs_busy_stats(engine) &&
-		   !engine->pmu.busy_stats) {
-		engine->pmu.busy_stats = true;
-		if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
-			queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+	} else if (is_engine_event(event) && engine_needs_busy_stats(engine)) {
+		/* Enable busy stats for the first event demanding them;
+		 * subsequent events just take a reference. If some events
+		 * go away, busy stats stay enabled until the remaining
+		 * events stop using them.
+		 */
+		if (engine->pmu.busy_stats++ == 0) {
+			if (!cancel_delayed_work(&engine->pmu.disable_busy_stats))
+				queue_work(i915->wq, &engine->pmu.enable_busy_stats);
+		}
 	}
 
 	spin_unlock_irqrestore(&i915->pmu.lock, flags);
@@ -495,16 +511,16 @@ static void i915_pmu_disable(struct perf_event *event)
 		enum intel_engine_id id;
 
 		engine = intel_engine_lookup_user(i915,
-						  engine_event_class(event),
-						  engine_event_instance(event));
+					engine_event_class(event),
+					engine_event_instance(event));
 		GEM_BUG_ON(!engine);
 		engine->pmu.enable &= ~BIT(engine_event_sample(event));
-		if (engine->pmu.busy_stats &&
-		    !engine_needs_busy_stats(engine)) {
-			engine->pmu.busy_stats = false;
-			queue_delayed_work(i915->wq,
-					   &engine->pmu.disable_busy_stats,
-					   round_jiffies_up_relative(2 * HZ));
+		if (!engine_needs_busy_stats(engine)) {
+			if (engine->pmu.busy_stats && --engine->pmu.busy_stats == 0) {
+				queue_delayed_work(i915->wq,
+						   &engine->pmu.disable_busy_stats,
+						   round_jiffies_up_relative(2 * HZ));
+			}
 		}
 	}
 
 	mask = 0;
 	for_each_engine(engine, i915, id)
@@ -529,6 +545,8 @@ static void i915_pmu_event_start(struct perf_event *event, int flags)
 
 static void i915_pmu_event_stop(struct perf_event *event, int flags)
 {
+	if (flags & PERF_EF_UPDATE)
+		i915_pmu_event_read(event);
 	i915_pmu_disable(event);
 }
 
@@ -546,7 +564,7 @@ static int i915_pmu_event_add(struct perf_event *event, int flags)
 
 static void i915_pmu_event_del(struct perf_event *event, int flags)
 {
-	i915_pmu_disable(event);
+	i915_pmu_event_stop(event, PERF_EF_UPDATE);
 }
 
 static int i915_pmu_event_event_idx(struct perf_event *event)
@@ -656,9 +674,28 @@ static ssize_t i915_pmu_event_show(struct device *dev,
 	.attrs = i915_pmu_events_attrs,
 };
 
+static ssize_t i915_pmu_get_attr_cpumask(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &i915_pmu_cpumask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, i915_pmu_get_attr_cpumask, NULL);
+
+static struct attribute *i915_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group i915_pmu_cpumask_attr_group = {
+	.attrs = i915_cpumask_attrs,
+};
+
 static const struct attribute_group *i915_pmu_attr_groups[] = {
 	&i915_pmu_format_attr_group,
 	&i915_pmu_events_attr_group,
+	&i915_pmu_cpumask_attr_group,
 	NULL
 };
 
@@ -678,16 +715,57 @@ static void __disable_busy_stats(struct work_struct *work)
 
 	intel_disable_engine_stats(engine);
 }
 
+static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	unsigned int target;
+
+	target = cpumask_any_and(&i915_pmu_cpumask, &i915_pmu_cpumask);
+	/* Select the first online CPU as a designated reader.
+	 */
+	if (target >= nr_cpu_ids) {
+		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
+	}
+	return 0;
+}
+
+static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), node);
+	unsigned int target;
+
+	if (cpumask_test_and_clear_cpu(cpu, &i915_pmu_cpumask)) {
+		target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+		/* Migrate events if there is a valid target */
+		if (target < nr_cpu_ids) {
+			cpumask_set_cpu(target, &i915_pmu_cpumask);
+			perf_pmu_migrate_context(&pmu->base, cpu, target);
+		}
+	}
+	return 0;
+}
+
 void i915_pmu_register(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	int err;
 
 	if (INTEL_GEN(i915) <= 2)
 		return;
 
+	err = cpuhp_setup_state_multi(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+				      "perf/x86/intel/i915:online",
+				      i915_pmu_cpu_online,
+				      i915_pmu_cpu_offline);
+	if (err)
+		return;
+
+	err = cpuhp_state_add_instance(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+				       &i915->pmu.node);
+	if (err)
+		return;
+
 	i915->pmu.base.attr_groups	= i915_pmu_attr_groups;
-	i915->pmu.base.task_ctx_nr	= perf_sw_context;
+	i915->pmu.base.task_ctx_nr	= perf_invalid_context;
 	i915->pmu.base.event_init	= i915_pmu_event_init;
 	i915->pmu.base.add		= i915_pmu_event_add;
 	i915->pmu.base.del		= i915_pmu_event_del;
@@ -731,4 +809,7 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 		flush_work(&engine->pmu.enable_busy_stats);
 		flush_delayed_work(&engine->pmu.disable_busy_stats);
 	}
+
+	cpuhp_state_remove_instance(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
+				    &i915->pmu.node);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index fd5d838..0530e4e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -248,7 +248,7 @@ struct intel_engine_cs {
 	struct {
 		u32 enable;
 		u64 sample[4];
-		bool busy_stats;
+		unsigned int busy_stats;
 		struct work_struct enable_busy_stats;
 		struct delayed_work disable_busy_stats;
 	} pmu;
-- 
1.8.3.1
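
For illustration, here is a minimal userspace sketch of the per-task
vs. system-wide distinction described in the commit message. It is not
part of the patch: the RCS0_BUSY_CONFIG value and the i915_busy.c name
are placeholders (the real event encoding is published under
/sys/bus/event_source/devices/i915/events/ once the PMU is registered),
and error handling is minimal. The sketch opens the counter the way
perf-stat does after this change: it reads the designated reader CPU
from the new cpumask attribute (assumed to contain a single CPU) and
opens a system-wide event there.

/* i915_busy.c: gcc -o i915_busy i915_busy.c */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Placeholder config; take the real value from the events sysfs dir. */
#define RCS0_BUSY_CONFIG 0

static int read_sysfs_int(const char *path)
{
	FILE *f = fopen(path, "r");
	int v = -1;

	if (f) {
		if (fscanf(f, "%d", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;
	/* Dynamic PMU type id assigned by perf core on registration. */
	int type = read_sysfs_int("/sys/bus/event_source/devices/i915/type");
	/* Designated reader CPU from the new cpumask attribute;
	 * assumes the mask holds a single CPU, e.g. "0". */
	int cpu = read_sysfs_int("/sys/bus/event_source/devices/i915/cpumask");

	if (type < 0 || cpu < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = RCS0_BUSY_CONFIG;

	/* Per-task mode (pid == 0, cpu == -1) is rejected now that the
	 * PMU registers with perf_invalid_context: this is the failing
	 * "perf stat" example above. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd >= 0)
		return 1; /* unexpected with this patch applied */

	/* System-wide mode (pid == -1) on the designated CPU: this is
	 * what "perf stat -a" ends up doing. */
	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("i915/rcs0-busy/: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}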
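
As a design note, exposing a single-CPU cpumask follows the convention
established by other uncore/system-wide PMUs: perf-stat reads the mask
and opens one event on the designated CPU instead of one event per CPU,
so no aggregation of duplicate counters takes place, and the hotplug
callbacks above keep the counters alive by migrating the perf context
when the designated CPU goes offline.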