Hi Peter,

On Mon, Oct 9, 2023 at 2:04 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Wed, Oct 04, 2023 at 09:32:24AM -0700, Namhyung Kim wrote:
>
> > Yeah, I know.. but I couldn't come up with a better solution.
>
> Not been near a compiler, and haven't fully thought it through, but
> could something like the below work?

Thanks for the patch, I think it'd work.  Let me test it and
get back to you.

Thanks,
Namhyung

>
> ---
>  include/linux/perf_event.h |   1 +
>  kernel/events/core.c       | 115 +++++++++++++++++++++++----------------
>  2 files changed, 61 insertions(+), 55 deletions(-)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index f31f962a6445..0367d748fae0 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -878,6 +878,7 @@ struct perf_event_pmu_context {
>  	unsigned int			embedded : 1;
>
>  	unsigned int			nr_events;
> +	unsigned int			nr_cgroups;
>
>  	atomic_t			refcount; /* event <-> epc */
>  	struct rcu_head			rcu_head;
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 708d474c2ede..f3d5d47ecdfc 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -375,6 +375,7 @@ enum event_type_t {
>  	EVENT_TIME = 0x4,
>  	/* see ctx_resched() for details */
>  	EVENT_CPU = 0x8,
> +	EVENT_CGROUP = 0x10,
>  	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
>  };
>
> @@ -684,20 +685,26 @@ do {									\
>  	___p;								\
>  })
>
> -static void perf_ctx_disable(struct perf_event_context *ctx)
> +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
>  {
>  	struct perf_event_pmu_context *pmu_ctx;
>
> -	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (cgroup && !pmu_ctx->nr_cgroups)
> +			continue;
>  		perf_pmu_disable(pmu_ctx->pmu);
> +	}
>  }
>
> -static void perf_ctx_enable(struct perf_event_context *ctx)
> +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
>  {
>  	struct perf_event_pmu_context *pmu_ctx;
>
> -	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (cgroup && !pmu_ctx->nr_cgroups)
> +			continue;
>  		perf_pmu_enable(pmu_ctx->pmu);
> +	}
>  }
>
>  static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
> @@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task)
>  		return;
>
>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> -	perf_ctx_disable(&cpuctx->ctx);
> +	perf_ctx_disable(&cpuctx->ctx, true);
>
> -	ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
> +	ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
>  	/*
>  	 * must not be done before ctxswout due
>  	 * to update_cgrp_time_from_cpuctx() in
> @@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task)
>  	 * perf_cgroup_set_timestamp() in ctx_sched_in()
>  	 * to not have to pass task around
>  	 */
> -	ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
> +	ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
>
> -	perf_ctx_enable(&cpuctx->ctx);
> +	perf_ctx_enable(&cpuctx->ctx, true);
>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>  }
>
> @@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
>  	if (!is_cgroup_event(event))
>  		return;
>
> +	event->pmu_ctx->nr_cgroups++;
> +
>  	/*
>  	 * Because cgroup events are always per-cpu events,
>  	 * @ctx == &cpuctx->ctx.
> @@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
>  	if (!is_cgroup_event(event))
>  		return;
>
> +	event->pmu_ctx->nr_cgroups--;
> +
>  	/*
>  	 * Because cgroup events are always per-cpu events,
>  	 * @ctx == &cpuctx->ctx.
> @@ -2677,9 +2688,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
>  	event_type &= EVENT_ALL;
>
> -	perf_ctx_disable(&cpuctx->ctx);
> +	perf_ctx_disable(&cpuctx->ctx, false);
>  	if (task_ctx) {
> -		perf_ctx_disable(task_ctx);
> +		perf_ctx_disable(task_ctx, false);
>  		task_ctx_sched_out(task_ctx, event_type);
>  	}
>
> @@ -2697,9 +2708,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
>
>  	perf_event_sched_in(cpuctx, task_ctx);
>
> -	perf_ctx_enable(&cpuctx->ctx);
> +	perf_ctx_enable(&cpuctx->ctx, false);
>  	if (task_ctx)
> -		perf_ctx_enable(task_ctx);
> +		perf_ctx_enable(task_ctx, false);
>  }
>
>  void perf_pmu_resched(struct pmu *pmu)
> @@ -3244,6 +3255,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>  	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>  	struct perf_event_pmu_context *pmu_ctx;
>  	int is_active = ctx->is_active;
> +	bool cgroup = event_type & EVENT_CGROUP;
> +
> +	event_type &= ~EVENT_CGROUP;
>
>  	lockdep_assert_held(&ctx->lock);
>
> @@ -3290,8 +3304,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>
>  	is_active ^= ctx->is_active; /* changed bits */
>
> -	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (cgroup && !pmu_ctx->nr_cgroups)
> +			continue;
>  		__pmu_ctx_sched_out(pmu_ctx, is_active);
> +	}
>  }
>
>  /*
> @@ -3482,7 +3499,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>  		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
>  		if (context_equiv(ctx, next_ctx)) {
>
> -			perf_ctx_disable(ctx);
> +			perf_ctx_disable(ctx, false);
>
>  			/* PMIs are disabled; ctx->nr_pending is stable. */
>  			if (local_read(&ctx->nr_pending) ||
> @@ -3502,7 +3519,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>  			perf_ctx_sched_task_cb(ctx, false);
>  			perf_event_swap_task_ctx_data(ctx, next_ctx);
>
> -			perf_ctx_enable(ctx);
> +			perf_ctx_enable(ctx, false);
>
>  			/*
>  			 * RCU_INIT_POINTER here is safe because we've not
> @@ -3526,13 +3543,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
>
>  	if (do_switch) {
>  		raw_spin_lock(&ctx->lock);
> -		perf_ctx_disable(ctx);
> +		perf_ctx_disable(ctx, false);
>
>  inside_switch:
>  		perf_ctx_sched_task_cb(ctx, false);
>  		task_ctx_sched_out(ctx, EVENT_ALL);
>
> -		perf_ctx_enable(ctx);
> +		perf_ctx_enable(ctx, false);
>  		raw_spin_unlock(&ctx->lock);
>  	}
>  }
> @@ -3818,47 +3835,32 @@ static int merge_sched_in(struct perf_event *event, void *data)
>  	return 0;
>  }
>
> -static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void pmu_groups_sched_in(struct perf_event_context *ctx,
> +				struct perf_event_groups *groups,
> +				struct pmu *pmu)
>  {
> -	struct perf_event_pmu_context *pmu_ctx;
>  	int can_add_hw = 1;
> -
> -	if (pmu) {
> -		visit_groups_merge(ctx, &ctx->pinned_groups,
> -				   smp_processor_id(), pmu,
> -				   merge_sched_in, &can_add_hw);
> -	} else {
> -		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> -			can_add_hw = 1;
> -			visit_groups_merge(ctx, &ctx->pinned_groups,
> -					   smp_processor_id(), pmu_ctx->pmu,
> -					   merge_sched_in, &can_add_hw);
> -		}
> -	}
> +	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
> +			   merge_sched_in, &can_add_hw);
>  }
>
> -static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void ctx_groups_sched_in(struct perf_event_context *ctx,
> +				struct perf_event_groups *groups,
> +				bool cgroup)
>  {
>  	struct perf_event_pmu_context *pmu_ctx;
> -	int can_add_hw = 1;
>
> -	if (pmu) {
> -		visit_groups_merge(ctx, &ctx->flexible_groups,
> -				   smp_processor_id(), pmu,
> -				   merge_sched_in, &can_add_hw);
> -	} else {
> -		list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> -			can_add_hw = 1;
> -			visit_groups_merge(ctx, &ctx->flexible_groups,
> -					   smp_processor_id(), pmu_ctx->pmu,
> -					   merge_sched_in, &can_add_hw);
> -		}
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		if (cgroup && !pmu_ctx->nr_cgroups)
> +			continue;
> +		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
>  	}
>  }
>
> -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
> +static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
> +			       struct pmu *pmu)
>  {
> -	ctx_flexible_sched_in(ctx, pmu);
> +	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>  }
>
>  static void
> @@ -3866,6 +3868,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
>  {
>  	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>  	int is_active = ctx->is_active;
> +	bool cgroup = event_type & EVENT_CGROUP;
> +
> +	event_type &= ~EVENT_CGROUP;
>
>  	lockdep_assert_held(&ctx->lock);
>
> @@ -3898,11 +3903,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
>  	 * in order to give them the best chance of going on.
>  	 */
>  	if (is_active & EVENT_PINNED)
> -		ctx_pinned_sched_in(ctx, NULL);
> +		ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
>
>  	/* Then walk through the lower prio flexible groups */
>  	if (is_active & EVENT_FLEXIBLE)
> -		ctx_flexible_sched_in(ctx, NULL);
> +		ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
>  }
>
>  static void perf_event_context_sched_in(struct task_struct *task)
> @@ -3917,11 +3922,11 @@ static void perf_event_context_sched_in(struct task_struct *task)
>
>  	if (cpuctx->task_ctx == ctx) {
>  		perf_ctx_lock(cpuctx, ctx);
> -		perf_ctx_disable(ctx);
> +		perf_ctx_disable(ctx, false);
>
>  		perf_ctx_sched_task_cb(ctx, true);
>
> -		perf_ctx_enable(ctx);
> +		perf_ctx_enable(ctx, false);
>  		perf_ctx_unlock(cpuctx, ctx);
>  		goto rcu_unlock;
>  	}
> @@ -3934,7 +3939,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
>  	if (!ctx->nr_events)
>  		goto unlock;
>
> -	perf_ctx_disable(ctx);
> +	perf_ctx_disable(ctx, false);
>  	/*
>  	 * We want to keep the following priority order:
>  	 * cpu pinned (that don't need to move), task pinned,
> @@ -3944,7 +3949,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
>  	 * events, no need to flip the cpuctx's events around.
>  	 */
>  	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
> -		perf_ctx_disable(&cpuctx->ctx);
> +		perf_ctx_disable(&cpuctx->ctx, false);
>  		ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
>  	}
>
> @@ -3953,9 +3958,9 @@ static void perf_event_context_sched_in(struct task_struct *task)
>  		perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
>
>  	if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
> -		perf_ctx_enable(&cpuctx->ctx);
> +		perf_ctx_enable(&cpuctx->ctx, false);
>
> -	perf_ctx_enable(ctx);
> +	perf_ctx_enable(ctx, false);
>
>  unlock:
>  	perf_ctx_unlock(cpuctx, ctx);
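
For readers following along: the core of the patch is simple bookkeeping plus
a skip test.  Each perf_event_pmu_context counts its cgroup events in
nr_cgroups, and the cgroup-switch path passes EVENT_CGROUP / cgroup=true so
that only PMUs which actually have cgroup events get disabled and
rescheduled; a full reschedule passes cgroup=false and still walks everything.
Below is a minimal standalone sketch of that skip pattern in plain user-space
C; the names (pmu_ctx_model, ctx_disable_model) are invented for illustration
and are not the kernel API.

/*
 * Standalone sketch (not kernel code) of the skip pattern in the patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct pmu_ctx_model {
	const char *name;
	unsigned int nr_cgroups;	/* cgroup events on this PMU */
};

/* Mirrors perf_ctx_disable(ctx, cgroup): when @cgroup is true, skip
 * every pmu context that has no cgroup events. */
static void ctx_disable_model(struct pmu_ctx_model *pc, int n, bool cgroup)
{
	for (int i = 0; i < n; i++) {
		if (cgroup && !pc[i].nr_cgroups)
			continue;	/* nothing cgroup-related here */
		printf("disable %s\n", pc[i].name);
	}
}

int main(void)
{
	struct pmu_ctx_model pc[] = {
		{ "cpu", 2 },		/* two cgroup events */
		{ "uncore", 0 },	/* none */
	};

	ctx_disable_model(pc, 2, true);		/* cgroup switch: only "cpu" */
	ctx_disable_model(pc, 2, false);	/* full resched: both PMUs */
	return 0;
}

The same "cgroup && !nr_cgroups" test guards perf_ctx_disable(),
perf_ctx_enable(), ctx_sched_out() and ctx_groups_sched_in() in the patch, so
a cgroup switch should leave PMUs with no cgroup events untouched.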