The following commit has been merged into the perf/core branch of tip:

Commit-ID:     2d17cf1abcbe8a45b7dc41a768ed22aac158ddd8
Gitweb:        https://git.kernel.org/tip/2d17cf1abcbe8a45b7dc41a768ed22aac158ddd8
Author:        Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate:    Wed, 07 Aug 2024 13:29:25 +02:00
Committer:     Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CommitterDate: Thu, 08 Aug 2024 12:27:31 +02:00

perf: Optimize context reschedule for single PMU cases

Currently re-scheduling a context will reschedule all active PMUs for
that context, even if it is known only a single event is added.

Namhyung reported that changing this to only reschedule the affected
PMU when possible provides significant performance gains under certain
conditions.

Therefore, allow partial context reschedules for a specific PMU, that
of the event modified.

While the patch looks somewhat noisy, it mostly just propagates a new
@pmu argument through the callchain and modifies the epc loop to only
pick the 'epc->pmu == @pmu' case.

Reported-by: Namhyung Kim <namhyung@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Reviewed-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
Reviewed-by: Namhyung Kim <namhyung@xxxxxxxxxx>
Link: https://lore.kernel.org/r/20240807115549.920950699@xxxxxxxxxxxxx
---
 kernel/events/core.c | 164 ++++++++++++++++++++++--------------------
 1 file changed, 88 insertions(+), 76 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index c01a326..dad2b9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -685,30 +685,32 @@ do {                                            \
         ___p;                                                  \
 })
 
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup)                        \
+        list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+                if (_cgroup && !_epc->nr_cgroups)              \
+                        continue;                              \
+                else if (_pmu && _epc->pmu != _pmu)            \
+                        continue;                              \
+                else
+
 static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
 {
         struct perf_event_pmu_context *pmu_ctx;
 
-        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-                if (cgroup && !pmu_ctx->nr_cgroups)
-                        continue;
+        for_each_epc(pmu_ctx, ctx, NULL, cgroup)
                 perf_pmu_disable(pmu_ctx->pmu);
-        }
 }
 
 static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
 {
         struct perf_event_pmu_context *pmu_ctx;
 
-        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-                if (cgroup && !pmu_ctx->nr_cgroups)
-                        continue;
+        for_each_epc(pmu_ctx, ctx, NULL, cgroup)
                 perf_pmu_enable(pmu_ctx->pmu);
-        }
 }
 
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
 
 #ifdef CONFIG_CGROUP_PERF
 
@@ -865,7 +867,7 @@ static void perf_cgroup_switch(struct task_struct *task)
         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
         perf_ctx_disable(&cpuctx->ctx, true);
 
-        ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+        ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
         /*
          * must not be done before ctxswout due
          * to update_cgrp_time_from_cpuctx() in
@@ -877,7 +879,7 @@ static void perf_cgroup_switch(struct task_struct *task)
          * perf_cgroup_set_timestamp() in ctx_sched_in()
          * to not have to pass task around
          */
-        ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+        ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
 
         perf_ctx_enable(&cpuctx->ctx, true);
         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -2656,7 +2658,8 @@ static void add_event_to_ctx(struct perf_event *event,
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
-                               enum event_type_t event_type)
+                               struct pmu *pmu,
+                               enum event_type_t event_type)
 {
         struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
 
@@ -2666,18 +2669,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                 return;
 
-        ctx_sched_out(ctx, event_type);
+        ctx_sched_out(ctx, pmu, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
-                                struct perf_event_context *ctx)
+                                struct perf_event_context *ctx,
+                                struct pmu *pmu)
 {
-        ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+        ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
         if (ctx)
-                ctx_sched_in(ctx, EVENT_PINNED);
-        ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, pmu, EVENT_PINNED);
+        ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
         if (ctx)
-                ctx_sched_in(ctx, EVENT_FLEXIBLE);
+                ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
 }
 
 /*
@@ -2695,16 +2699,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
  * event_type is a bit mask of the types of events involved. For CPU events,
  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
  */
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
                         struct perf_event_context *task_ctx,
-                        enum event_type_t event_type)
+                        struct pmu *pmu, enum event_type_t event_type)
 {
         bool cpu_event = !!(event_type & EVENT_CPU);
+        struct perf_event_pmu_context *epc;
 
         /*
          * If pinned groups are involved, flexible groups also need to be
@@ -2715,10 +2715,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 
         event_type &= EVENT_ALL;
 
-        perf_ctx_disable(&cpuctx->ctx, false);
+        for_each_epc(epc, &cpuctx->ctx, pmu, false)
+                perf_pmu_disable(epc->pmu);
+
         if (task_ctx) {
-                perf_ctx_disable(task_ctx, false);
-                task_ctx_sched_out(task_ctx, event_type);
+                for_each_epc(epc, task_ctx, pmu, false)
+                        perf_pmu_disable(epc->pmu);
+
+                task_ctx_sched_out(task_ctx, pmu, event_type);
         }
 
         /*
@@ -2729,15 +2733,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
          *  - otherwise, do nothing more.
          */
         if (cpu_event)
-                ctx_sched_out(&cpuctx->ctx, event_type);
+                ctx_sched_out(&cpuctx->ctx, pmu, event_type);
         else if (event_type & EVENT_PINNED)
-                ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+                ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+        perf_event_sched_in(cpuctx, task_ctx, pmu);
 
-        perf_event_sched_in(cpuctx, task_ctx);
+        for_each_epc(epc, &cpuctx->ctx, pmu, false)
+                perf_pmu_enable(epc->pmu);
 
-        perf_ctx_enable(&cpuctx->ctx, false);
-        if (task_ctx)
-                perf_ctx_enable(task_ctx, false);
+        if (task_ctx) {
+                for_each_epc(epc, task_ctx, pmu, false)
+                        perf_pmu_enable(epc->pmu);
+        }
 }
 
 void perf_pmu_resched(struct pmu *pmu)
@@ -2746,7 +2754,7 @@ void perf_pmu_resched(struct pmu *pmu)
         struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 
         perf_ctx_lock(cpuctx, task_ctx);
-        ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+        ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
         perf_ctx_unlock(cpuctx, task_ctx);
 }
 
@@ -2802,9 +2810,10 @@ static int __perf_install_in_context(void *info)
 #endif
 
         if (reprogram) {
-                ctx_sched_out(ctx, EVENT_TIME);
+                ctx_sched_out(ctx, NULL, EVENT_TIME);
                 add_event_to_ctx(event, ctx);
-                ctx_resched(cpuctx, task_ctx, get_event_type(event));
+                ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+                            get_event_type(event));
         } else {
                 add_event_to_ctx(event, ctx);
         }
@@ -2948,7 +2957,7 @@ static void __perf_event_enable(struct perf_event *event,
                 return;
 
         if (ctx->is_active)
-                ctx_sched_out(ctx, EVENT_TIME);
+                ctx_sched_out(ctx, NULL, EVENT_TIME);
 
         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
         perf_cgroup_event_enable(event, ctx);
@@ -2957,7 +2966,7 @@ static void __perf_event_enable(struct perf_event *event,
                 return;
 
         if (!event_filter_match(event)) {
-                ctx_sched_in(ctx, EVENT_TIME);
+                ctx_sched_in(ctx, NULL, EVENT_TIME);
                 return;
         }
 
@@ -2966,7 +2975,7 @@ static void __perf_event_enable(struct perf_event *event,
          * then don't put it on unless the group is on.
          */
         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
-                ctx_sched_in(ctx, EVENT_TIME);
+                ctx_sched_in(ctx, NULL, EVENT_TIME);
                 return;
         }
 
@@ -2974,7 +2983,7 @@ static void __perf_event_enable(struct perf_event *event,
         if (ctx->task)
                 WARN_ON_ONCE(task_ctx != ctx);
 
-        ctx_resched(cpuctx, task_ctx, get_event_type(event));
+        ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
 }
 
 /*
@@ -3276,8 +3285,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
         perf_pmu_enable(pmu);
 }
 
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
 static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
         struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
         struct perf_event_pmu_context *pmu_ctx;
@@ -3331,11 +3349,8 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 
         is_active ^= ctx->is_active; /* changed bits */
 
-        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-                if (cgroup && !pmu_ctx->nr_cgroups)
-                        continue;
+        for_each_epc(pmu_ctx, ctx, pmu, cgroup)
                 __pmu_ctx_sched_out(pmu_ctx, is_active);
-        }
 }
 
 /*
@@ -3579,7 +3594,7 @@ unlock:
 
 inside_switch:
                 perf_ctx_sched_task_cb(ctx, false);
-                task_ctx_sched_out(ctx, EVENT_ALL);
+                task_ctx_sched_out(ctx, NULL, EVENT_ALL);
 
                 perf_ctx_enable(ctx, false);
                 raw_spin_unlock(&ctx->lock);
@@ -3877,29 +3892,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
                            merge_sched_in, &can_add_hw);
 }
 
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
-                                struct perf_event_groups *groups,
-                                bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+                               enum event_type_t event_type)
 {
-        struct perf_event_pmu_context *pmu_ctx;
-
-        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
-                if (cgroup && !pmu_ctx->nr_cgroups)
-                        continue;
-                pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
-        }
-}
+        struct perf_event_context *ctx = pmu_ctx->ctx;
 
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
-                               struct pmu *pmu)
-{
-        pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+        if (event_type & EVENT_PINNED)
+                pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+        if (event_type & EVENT_FLEXIBLE)
+                pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
 }
 
 static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
 {
         struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+        struct perf_event_pmu_context *pmu_ctx;
         int is_active = ctx->is_active;
         bool cgroup = event_type & EVENT_CGROUP;
 
@@ -3935,12 +3943,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
          * First go through the list and put on any pinned groups
          * in order to give them the best chance of going on.
          */
-        if (is_active & EVENT_PINNED)
-                ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+        if (is_active & EVENT_PINNED) {
+                for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+                        __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+        }
 
         /* Then walk through the lower prio flexible groups */
-        if (is_active & EVENT_FLEXIBLE)
-                ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+        if (is_active & EVENT_FLEXIBLE) {
+                for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+                        __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+        }
 }
 
 static void perf_event_context_sched_in(struct task_struct *task)
@@ -3983,10 +3995,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
          */
         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
                 perf_ctx_disable(&cpuctx->ctx, false);
-                ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+                ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
         }
 
-        perf_event_sched_in(cpuctx, ctx);
+        perf_event_sched_in(cpuctx, ctx, NULL);
 
         perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
 
@@ -4327,14 +4339,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
                 update_context_time(&cpuctx->ctx);
                 __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                 rotate_ctx(&cpuctx->ctx, cpu_event);
-                __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+                __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
         }
 
         if (task_event)
                 rotate_ctx(task_epc->ctx, task_event);
 
         if (task_event || (task_epc && cpu_event))
-                __pmu_ctx_sched_in(task_epc->ctx, pmu);
+                __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
 
         perf_pmu_enable(pmu);
         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4400,7 +4412,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
         cpuctx = this_cpu_ptr(&perf_cpu_context);
         perf_ctx_lock(cpuctx, ctx);
-        ctx_sched_out(ctx, EVENT_TIME);
+        ctx_sched_out(ctx, NULL, EVENT_TIME);
 
         list_for_each_entry(event, &ctx->event_list, event_entry) {
                 enabled |= event_enable_on_exec(event, ctx);
@@ -4412,9 +4424,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
          */
         if (enabled) {
                 clone_ctx = unclone_ctx(ctx);
-                ctx_resched(cpuctx, ctx, event_type);
+                ctx_resched(cpuctx, ctx, NULL, event_type);
         } else {
-                ctx_sched_in(ctx, EVENT_TIME);
+                ctx_sched_in(ctx, NULL, EVENT_TIME);
         }
         perf_ctx_unlock(cpuctx, ctx);
 
@@ -13202,7 +13214,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
          * in.
          */
         raw_spin_lock_irq(&child_ctx->lock);
-        task_ctx_sched_out(child_ctx, EVENT_ALL);
+        task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
 
         /*
          * Now that the context is inactive, destroy the task <-> ctx relation
@@ -13751,7 +13763,7 @@ static void __perf_event_exit_context(void *__info)
         struct perf_event *event;
 
         raw_spin_lock(&ctx->lock);
-        ctx_sched_out(ctx, EVENT_TIME);
+        ctx_sched_out(ctx, NULL, EVENT_TIME);
         list_for_each_entry(event, &ctx->event_list, event_entry)
                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
         raw_spin_unlock(&ctx->lock);
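A note for readers skimming the diff: the heart of the change is the new
for_each_epc() iterator, which skips every perf_event_pmu_context whose
pmu does not match the @pmu argument (a NULL @pmu keeps the old
"reschedule everything" behaviour). What follows is a minimal,
standalone userspace sketch of that filtering shape. It is not kernel
code; the struct pmu / struct epc types and the plain next-pointer list
are simplified stand-ins invented purely for illustration.

#include <stdio.h>
#include <stddef.h>

/* hypothetical stand-ins for struct pmu / struct perf_event_pmu_context */
struct pmu {
        const char *name;
};

struct epc {
        struct pmu *pmu;        /* PMU this context belongs to */
        int nr_cgroups;         /* non-zero if cgroup events are present */
        struct epc *next;
};

/* same if/continue/else trick as the kernel macro, over a plain list */
#define for_each_epc(epc, head, pmu, cgroup)                            \
        for ((epc) = (head); (epc); (epc) = (epc)->next)                \
                if ((cgroup) && !(epc)->nr_cgroups)                     \
                        continue;                                       \
                else if ((pmu) && (epc)->pmu != (pmu))                  \
                        continue;                                       \
                else

static void reschedule_one(struct epc *epc)
{
        printf("  rescheduling PMU %s\n", epc->pmu->name);
}

int main(void)
{
        struct pmu cpu_pmu = { "cpu" }, uncore_pmu = { "uncore" };
        struct epc e1 = { &cpu_pmu, 0, NULL };
        struct epc e0 = { &uncore_pmu, 1, &e1 };
        struct epc *epc;

        puts("full reschedule (pmu == NULL):");
        for_each_epc(epc, &e0, NULL, 0)
                reschedule_one(epc);

        puts("partial reschedule (pmu == &cpu_pmu):");
        for_each_epc(epc, &e0, &cpu_pmu, 0)
                reschedule_one(epc);

        return 0;
}

The first loop visits both entries while the second visits only the one
whose epc->pmu matches; that skipped work is what ctx_resched() now
avoids when a single event is added to, or enabled in, a context.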