From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> This reduces the cost of the software engine busyness tracking to a single no-op instruction when there are no listeners. We add a new i915 ordered workqueue to be used only for tasks not needing struct mutex. v2: Rebase and some comments. v3: Rebase. v4: Checkpatch fixes. v5: Rebase. v6: Use system_long_wq to avoid being blocked by struct_mutex users. v7: Fix bad conflict resolution from last rebase. (Dmitry Rogozhkin) v8: Rebase. v9: * Fix race between unordered enable followed by disable. (Chris Wilson) * Prettify order of local variable declarations. (Chris Wilson) v10: Chris Wilson: * Fix workqueue allocation failure handling. * Expand comment about critical workqueue ordering. * Add some GEM_BUG_ONs to document impossible conditions. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.c | 13 +++- drivers/gpu/drm/i915/i915_drv.h | 5 ++ drivers/gpu/drm/i915/i915_pmu.c | 65 ++++++++++++++++++-- drivers/gpu/drm/i915/intel_engine_cs.c | 17 ++++++ drivers/gpu/drm/i915/intel_ringbuffer.h | 101 ++++++++++++++++++++------------ 5 files changed, 158 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index d373a840566a..3853038c6fdd 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -804,12 +804,22 @@ static int i915_workqueues_init(struct drm_i915_private *dev_priv) if (dev_priv->wq == NULL) goto out_err; + /* + * Ordered workqueue for tasks not needing the global mutex but + * which still need to be serialized. + */ + dev_priv->wq_misc = alloc_ordered_workqueue("i915-misc", 0); + if (dev_priv->wq_misc == NULL) + goto out_free_wq; + dev_priv->hotplug.dp_wq = alloc_ordered_workqueue("i915-dp", 0); if (dev_priv->hotplug.dp_wq == NULL) - goto out_free_wq; + goto out_free_wq_misc; return 0; +out_free_wq_misc: + destroy_workqueue(dev_priv->wq_misc); out_free_wq: destroy_workqueue(dev_priv->wq); out_err: @@ -830,6 +840,7 @@ static void i915_engines_cleanup(struct drm_i915_private *i915) static void i915_workqueues_cleanup(struct drm_i915_private *dev_priv) { destroy_workqueue(dev_priv->hotplug.dp_wq); + destroy_workqueue(dev_priv->wq_misc); destroy_workqueue(dev_priv->wq); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 2904ac4b2ae5..99989cd778a9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2356,6 +2356,11 @@ struct drm_i915_private { */ struct workqueue_struct *wq; + /** + * @wq_misc: Workqueue for serialized tasks not needing struct mutex. + */ + struct workqueue_struct *wq_misc; + /* Display functions */ struct drm_i915_display_funcs display; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 2671e3640639..d552e415becd 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -458,11 +458,20 @@ static void i915_pmu_enable(struct perf_event *event) GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS); GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0); if (engine->pmu.enable_count[sample]++ == 0) { + /* + * Enable engine busy stats tracking if needed or + * alternatively cancel the scheduled disabling of the + * same. + * + * If the delayed disable was pending, cancel it and in + * this case no need to queue a new enable. + */ if (engine_needs_busy_stats(engine) && !engine->pmu.busy_stats) { - engine->pmu.busy_stats = - intel_enable_engine_stats(engine) == 0; - WARN_ON_ONCE(!engine->pmu.busy_stats); + engine->pmu.busy_stats = true; + if (!cancel_delayed_work(&engine->pmu.disable_busy_stats)) + queue_work(i915->wq_misc, + &engine->pmu.enable_busy_stats); } } } @@ -505,7 +514,22 @@ static void i915_pmu_disable(struct perf_event *event) if (!engine_needs_busy_stats(engine) && engine->pmu.busy_stats) { engine->pmu.busy_stats = false; - intel_disable_engine_stats(engine); + /* + * We request a delayed disable to handle the + * rapid on/off cycles on events, which can + * happen when tools like perf stat start, in a + * nicer way. + * + * PMU enable and disable callbacks are + * serialized with the spinlock, and since we do + * them in decoupled fashion from workers, it is + * critical to use an ordered work queue to + * preserve this ordering between the two + * events. + */ + queue_delayed_work(i915->wq_misc, + &engine->pmu.disable_busy_stats, + round_jiffies_up_relative(HZ)); } } } @@ -689,8 +713,26 @@ static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) return 0; } +static void __enable_busy_stats(struct work_struct *work) +{ + struct intel_engine_cs *engine = + container_of(work, typeof(*engine), pmu.enable_busy_stats); + + WARN_ON_ONCE(intel_enable_engine_stats(engine)); +} + +static void __disable_busy_stats(struct work_struct *work) +{ + struct intel_engine_cs *engine = + container_of(work, typeof(*engine), pmu.disable_busy_stats.work); + + intel_disable_engine_stats(engine); +} + void i915_pmu_register(struct drm_i915_private *i915) { + struct intel_engine_cs *engine; + enum intel_engine_id id; int ret; if (INTEL_GEN(i915) <= 2) { @@ -725,6 +767,12 @@ void i915_pmu_register(struct drm_i915_private *i915) i915->pmu.timer.function = i915_sample; i915->pmu.enable = 0; + for_each_engine(engine, i915, id) { + INIT_WORK(&engine->pmu.enable_busy_stats, __enable_busy_stats); + INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats, + __disable_busy_stats); + } + ret = perf_pmu_register(&i915->pmu.base, "i915", -1); if (ret == 0) return; @@ -743,6 +791,9 @@ void i915_pmu_register(struct drm_i915_private *i915) void i915_pmu_unregister(struct drm_i915_private *i915) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + if (!i915->pmu.base.event_init) return; @@ -754,6 +805,12 @@ void i915_pmu_unregister(struct drm_i915_private *i915) hrtimer_cancel(&i915->pmu.timer); + for_each_engine(engine, i915, id) { + GEM_BUG_ON(engine->pmu.busy_stats); + GEM_BUG_ON(flush_work(&engine->pmu.enable_busy_stats)); + flush_delayed_work(&engine->pmu.disable_busy_stats); + } + perf_pmu_unregister(&i915->pmu.base); i915->pmu.base.event_init = NULL; } diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 51885629d3df..80a7b55184df 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -21,6 +21,7 @@ * IN THE SOFTWARE. * */ +#include <linux/static_key.h> #include <drm/drm_print.h> @@ -1813,6 +1814,10 @@ intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance) return i915->engine_class[class][instance]; } +DEFINE_STATIC_KEY_FALSE(i915_engine_stats_key); +static DEFINE_MUTEX(i915_engine_stats_mutex); +static int i915_engine_stats_ref; + /** * intel_enable_engine_stats() - Enable engine busy tracking on engine * @engine: engine to enable stats collection @@ -1828,6 +1833,8 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine) if (!i915_modparams.enable_execlists) return -ENODEV; + mutex_lock(&i915_engine_stats_mutex); + spin_lock_irqsave(&engine->stats.lock, flags); if (engine->stats.enabled == ~0) goto busy; @@ -1835,10 +1842,16 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine) engine->stats.enabled_at = ktime_get(); spin_unlock_irqrestore(&engine->stats.lock, flags); + if (i915_engine_stats_ref++ == 0) + static_branch_enable(&i915_engine_stats_key); + + mutex_unlock(&i915_engine_stats_mutex); + return 0; busy: spin_unlock_irqrestore(&engine->stats.lock, flags); + mutex_unlock(&i915_engine_stats_mutex); return -EBUSY; } @@ -1856,6 +1869,7 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine) if (!i915_modparams.enable_execlists) return; + mutex_lock(&i915_engine_stats_mutex); spin_lock_irqsave(&engine->stats.lock, flags); WARN_ON_ONCE(engine->stats.enabled == 0); if (--engine->stats.enabled == 0) { @@ -1865,6 +1879,9 @@ void intel_disable_engine_stats(struct intel_engine_cs *engine) engine->stats.total = 0; } spin_unlock_irqrestore(&engine->stats.lock, flags); + if (--i915_engine_stats_ref == 0) + static_branch_disable(&i915_engine_stats_key); + mutex_unlock(&i915_engine_stats_mutex); } /** diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index b7f0cd449824..0202b1fa42fd 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -363,6 +363,22 @@ struct intel_engine_cs { * requested. */ bool busy_stats; + /** + * @enable_busy_stats: Work item for engine busy stats enabling. + * + * Since the action can sleep it needs to be decoupled from the + * perf API callback. + */ + struct work_struct enable_busy_stats; + /** + * @disable_busy_stats: Work item for busy stats disabling. + * + * Same as with @enable_busy_stats action, with the difference + * that we delay it in case there are rapid enable-disable + * actions, which can happen during tool startup (like perf + * stat). + */ + struct delayed_work disable_busy_stats; } pmu; /* @@ -906,59 +922,68 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *p); struct intel_engine_cs * intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance); +DECLARE_STATIC_KEY_FALSE(i915_engine_stats_key); + static inline void intel_engine_context_in(struct intel_engine_cs *engine) { unsigned long flags; - if (READ_ONCE(engine->stats.enabled) == 0) - return; + if (static_branch_unlikely(&i915_engine_stats_key)) { + if (READ_ONCE(engine->stats.enabled) == 0) + return; - spin_lock_irqsave(&engine->stats.lock, flags); + spin_lock_irqsave(&engine->stats.lock, flags); - if (engine->stats.enabled > 0) { - if (engine->stats.active++ == 0) - engine->stats.start = ktime_get(); - GEM_BUG_ON(engine->stats.active == 0); - } + if (engine->stats.enabled > 0) { + if (engine->stats.active++ == 0) + engine->stats.start = ktime_get(); + GEM_BUG_ON(engine->stats.active == 0); + } - spin_unlock_irqrestore(&engine->stats.lock, flags); + spin_unlock_irqrestore(&engine->stats.lock, flags); + } } static inline void intel_engine_context_out(struct intel_engine_cs *engine) { unsigned long flags; - if (READ_ONCE(engine->stats.enabled) == 0) - return; - - spin_lock_irqsave(&engine->stats.lock, flags); - - if (engine->stats.enabled > 0) { - ktime_t last, now = ktime_get(); - - if (engine->stats.active && --engine->stats.active == 0) { - /* - * Decrement the active context count and in case GPU - * is now idle add up to the running total. - */ - last = ktime_sub(now, engine->stats.start); - - engine->stats.total = ktime_add(engine->stats.total, - last); - } else if (engine->stats.active == 0) { - /* - * After turning on engine stats, context out might be - * the first event in which case we account from the - * time stats gathering was turned on. - */ - last = ktime_sub(now, engine->stats.enabled_at); - - engine->stats.total = ktime_add(engine->stats.total, - last); + if (static_branch_unlikely(&i915_engine_stats_key)) { + if (READ_ONCE(engine->stats.enabled) == 0) + return; + + spin_lock_irqsave(&engine->stats.lock, flags); + + if (engine->stats.enabled > 0) { + ktime_t last, now = ktime_get(); + + if (engine->stats.active && + --engine->stats.active == 0) { + /* + * Decrement the active context count and in + * case GPU is now idle add up to the running + * total. + */ + last = ktime_sub(now, engine->stats.start); + + engine->stats.total = + ktime_add(engine->stats.total, last); + } else if (engine->stats.active == 0) { + /* + * After turning on engine stats, context out + * might be the first event in which case we + * account from the time stats gathering was + * turned on. + */ + last = ktime_sub(now, engine->stats.enabled_at); + + engine->stats.total = + ktime_add(engine->stats.total, last); + } } - } - spin_unlock_irqrestore(&engine->stats.lock, flags); + spin_unlock_irqrestore(&engine->stats.lock, flags); + } } int intel_enable_engine_stats(struct intel_engine_cs *engine); -- 2.9.5 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx