Instead of calling perf_pmu_unregister() when unbinding, defer that to the
destruction of the i915 object. Since perf itself holds a reference via each
open event, this only happens once all events are gone, which guarantees
i915 is not unregistering the pmu while there are live events.

Previously, running the following sequence would crash the system after
~2 tries:

1) bind device to i915
2) wait for events to show up in sysfs
3) start perf stat -I 1000 -e i915/rcs0-busy/
4) unbind the driver
5) kill perf

Most of the time this crashes in perf_pmu_disable() while accessing the
percpu pmu_disable_count. This happens because perf_pmu_unregister()
destroys it with free_percpu(pmu->pmu_disable_count).

With a lazy unbind, the pmu is only unregistered after (5) as opposed to
after (4). The downside is that if a new bind operation is attempted for the
same device/driver without killing the perf process, i915 will fail to
register the pmu (but will still load successfully). This seems better than
crashing the system.

Signed-off-by: Lucas De Marchi <lucas.demarchi@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_pmu.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index 8708f905f4f4..df53a8fe53ec 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -1158,18 +1158,21 @@ static void free_pmu(struct drm_device *dev, void *res)
 	struct i915_pmu *pmu = res;
 	struct drm_i915_private *i915 = pmu_to_i915(pmu);
 
+	perf_pmu_unregister(&pmu->base);
 	free_event_attributes(pmu);
 	kfree(pmu->base.attr_groups);
 	if (IS_DGFX(i915))
 		kfree(pmu->name);
+
+	/*
+	 * Make sure all callbacks currently running (or that bailed out on
+	 * pmu->closed) are gone before freeing the pmu embedded in i915.
+	 */
+	synchronize_rcu();
 }
 
 static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
 {
-	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
-
-	GEM_BUG_ON(!pmu->base.event_init);
-
 	/* Select the first online CPU as a designated reader. */
 	if (cpumask_empty(&i915_pmu_cpumask))
 		cpumask_set_cpu(cpu, &i915_pmu_cpumask);
@@ -1182,8 +1185,6 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
 	struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node);
 	unsigned int target = i915_pmu_target_cpu;
 
-	GEM_BUG_ON(!pmu->base.event_init);
-
 	/*
 	 * Unregistering an instance generates a CPU offline event which we must
 	 * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask.
@@ -1337,21 +1338,14 @@ void i915_pmu_unregister(struct drm_i915_private *i915)
 {
 	struct i915_pmu *pmu = &i915->pmu;
 
-	if (!pmu->base.event_init)
-		return;
-
 	/*
-	 * "Disconnect" the PMU callbacks - since all are atomic synchronize_rcu
-	 * ensures all currently executing ones will have exited before we
-	 * proceed with unregistration.
+	 * "Disconnect" the PMU callbacks - unregistering the pmu will be done
+	 * later when all currently open events are gone
 	 */
 	pmu->closed = true;
-	synchronize_rcu();
 
 	hrtimer_cancel(&pmu->timer);
 
-	i915_pmu_unregister_cpuhp_state(pmu);
-	perf_pmu_unregister(&pmu->base);
 	pmu->base.event_init = NULL;
 }
 
-- 
2.43.0
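
For reference, the lifetime ordering the patch relies on can be sketched in
isolation. This is an illustration only, not part of the patch: it assumes
free_pmu() is wired up as a drmm (managed) release action, which its
(struct drm_device *, void *) signature suggests but this diff does not show,
and the example_* names and the stub pmu are hypothetical.

/* Illustration only - mirrors the teardown ordering in free_pmu() above. */
#include <drm/drm_device.h>
#include <drm/drm_managed.h>
#include <linux/perf_event.h>
#include <linux/rcupdate.h>

struct example_pmu {
	struct pmu base;	/* perf callbacks (.event_init etc.) omitted */
	bool closed;		/* set at unbind; callbacks bail out early */
};

/*
 * drmm release action: runs only at the final drm_dev_put(), i.e. after
 * every open event has been closed and has dropped its device reference,
 * so no live event can still be using the pmu when it is unregistered.
 */
static void example_pmu_free(struct drm_device *dev, void *res)
{
	struct example_pmu *pmu = res;

	perf_pmu_unregister(&pmu->base);

	/* Wait out callbacks that raced with pmu->closed before freeing. */
	synchronize_rcu();
}

static int example_pmu_register(struct drm_device *drm, struct example_pmu *pmu)
{
	int ret;

	ret = perf_pmu_register(&pmu->base, "example", -1);
	if (ret)
		return ret;

	/* Tie teardown to drm_device release rather than to driver unbind. */
	return drmm_add_action_or_reset(drm, example_pmu_free, pmu);
}

Under these assumptions, a rebind while the old events are still open would
fail in perf_pmu_register() because the previous instance is still
registered, which matches the trade-off described in the commit message.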