On Thu, Sep 01, 2022 at 04:05:53PM +0530, Ravi Bangoria wrote:
> On 29-Aug-22 8:10 PM, Peter Zijlstra wrote:
> > On Mon, Aug 29, 2022 at 02:04:33PM +0200, Peter Zijlstra wrote:
> >> On Mon, Aug 29, 2022 at 05:03:47PM +0530, Ravi Bangoria wrote:
> >>> @@ -12598,6 +12590,7 @@ EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
> >>>  
> >>>  void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
> >>>  {
> >>> +#if 0 // XXX buggered - cpu hotplug, who cares
> >>>  	struct perf_event_context *src_ctx;
> >>>  	struct perf_event_context *dst_ctx;
> >>>  	struct perf_event *event, *tmp;
> >>> @@ -12658,6 +12651,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
> >>>  	}
> >>>  	mutex_unlock(&dst_ctx->mutex);
> >>>  	mutex_unlock(&src_ctx->mutex);
> >>> +#endif
> >>>  }
> >>>  EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
> >>
> >> Note to self; fix this :-) I'll see if I have time for that later today.
> >
> > Urgh, while going through that it appears the whole refcounting thing
> > isn't fully done either.
>
> Not sure I follow. Can you please elaborate?

Sure, and sorry, I've not been feeling too well the past few days; I meant
to post something, but I just couldn't find it in me to finish it, let
alone make it work.

So the basic issue I mentioned is that:

/*
 *           ,------------------------[1:n]---------------------.
 *           V                                                   V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <--- perf_event
 *           ^                      ^         |                  |
 *           `--------[1:n]---------'         `-[n:1]-> pmu <-[1:n]-'
 *
 *
 * XXX         destroy epc when empty
 *             refcount, !rcu
 *
 * XXX epc locking
 *
 *   event->pmu_ctx            ctx->mutex && inactive
 *   ctx->pmu_ctx_list         ctx->mutex && ctx->lock
 *
 */
struct perf_event_pmu_context {
	...
	atomic_t		refcount; /* event <-> epc */
	...
}

But that then also suggests that:

struct perf_event_context {
	...
	refcount_t		refcount;
	...
}

should be: ctx <-> epc, and that is not so; the ctx::refcount still counts
the number of events. Now this might not be bad, but it is confusing.

Anyway; below is what I have, but it is entirely unfinished, untested,
might eat your granny, etc..
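(To make that concrete: the pairing the XXX get_ctx()/put_ctx() markers in
the patch below aim for would look roughly like this. Purely an illustrative,
untested sketch, not part of the patch; it only spells out the intended
ctx <-> epc ownership using the helpers that already exist in core.c:)

	/* creating a new epc for @ctx (cf. find_get_pmu_context()): */
	epc->ctx = ctx;
	get_ctx(ctx);		/* the epc now pins its ctx */

	/* last event reference on the epc gone (cf. put_pmu_ctx()): */
	list_del_init(&epc->pmu_ctx_entry);
	epc->ctx = NULL;
	put_ctx(ctx);		/* pairs with the get_ctx() above */

With that, ctx::refcount would count epcs (ctx <-> epc), while epc::refcount
keeps counting events (event <-> epc).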
---
Index: linux-2.6/include/linux/perf_event.h
===================================================================
--- linux-2.6.orig/include/linux/perf_event.h
+++ linux-2.6/include/linux/perf_event.h
@@ -883,7 +883,7 @@ struct perf_event_context {
 	int				nr_freq;
 	int				rotate_disable;
 
-	refcount_t			refcount;
+	refcount_t			refcount; /* ctx <-> epc */
 	struct task_struct		*task;
 
 	/*
Index: linux-2.6/kernel/events/core.c
===================================================================
--- linux-2.6.orig/kernel/events/core.c
+++ linux-2.6/kernel/events/core.c
@@ -2750,7 +2750,7 @@ perf_install_in_context(struct perf_even
 	WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
 
 	if (event->cpu != -1)
-		event->cpu = cpu;
+		WARN_ON_ONCE(event->cpu != cpu);
 
 	/*
 	 * Ensures that if we can observe event->ctx, both the event and ctx
@@ -4764,6 +4764,7 @@ find_get_pmu_context(struct pmu *pmu, st
 		list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
 		epc->ctx = ctx;
 		raw_spin_unlock_irq(&ctx->lock);
+		// XXX get_ctx(ctx);
 	} else {
 		WARN_ON_ONCE(epc->ctx != ctx);
 		atomic_inc(&epc->refcount);
@@ -4800,6 +4801,7 @@ find_get_pmu_context(struct pmu *pmu, st
 	list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
 	epc->ctx = ctx;
+	// XXX get_ctx(ctx)
 
 found_epc:
 	if (task_ctx_data && !epc->task_ctx_data) {
@@ -4837,6 +4839,8 @@ static void put_pmu_ctx(struct perf_even
 		list_del_init(&epc->pmu_ctx_entry);
 		epc->ctx = NULL;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+		// XXX put_ctx(ctx)
 	}
 
 	WARN_ON_ONCE(!list_empty(&epc->pinned_active));
@@ -12444,6 +12448,8 @@ SYSCALL_DEFINE5(perf_event_open,
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
+
+	// XXX put_ctx(ctx);
 
 	if (task) {
 		up_read(&task->signal->exec_update_lock);
@@ -12588,34 +12594,44 @@ err:
 }
 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+			      int cpu, struct pmu *pmu,
+			      struct perf_event_groups *groups,
+			      struct list_head *events)
 {
-#if 0 // XXX buggered - cpu hotplug, who cares
-	struct perf_event_context *src_ctx;
-	struct perf_event_context *dst_ctx;
-	struct perf_event *event, *tmp;
-	LIST_HEAD(events);
+	struct perf_event *event;
 
-	src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
-	dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+	for (event = perf_event_groups_first(groups, cpu, pmu, NULL);
+	     event;
+	     event = perf_event_groups_next(event, pmu)) {
 
-	/*
-	 * See perf_event_ctx_lock() for comments on the details
-	 * of swizzling perf_event::ctx.
-	 */
-	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
-	list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
-				 event_entry) {
 		perf_remove_from_context(event, 0);
-		unaccount_event_cpu(event, src_cpu);
-		put_ctx(src_ctx);
-		list_add(&event->migrate_entry, &events);
+		unaccount_event_cpu(event, cpu);
+		put_pmu_ctx(event->pmu_ctx);
+		list_add(&event->migrate_entry, events);
 	}
+}
 
-	/*
-	 * Wait for the events to quiesce before re-instating them.
-	 */
-	synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+				     struct perf_event_context *ctx,
+				     int cpu, struct perf_event *event)
+{
+	struct perf_event_pmu_context *epc;
+
+	event->cpu = cpu;
+	epc = find_get_pmu_context(pmu, ctx, event);
+	event->pmu_ctx = epc;
+
+	if (event->state >= PERF_EVENT_STATE_OFF)
+		event->state = PERF_EVENT_STATE_INACTIVE;
+	account_event_cpu(event, cpu);
+	perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+			       int cpu, struct pmu *pmu, struct list_head *events)
+{
+	struct perf_event *event, *tmp;
 
 	/*
 	 * Re-instate events in 2 passes.
@@ -12625,33 +12641,50 @@ void perf_pmu_migrate_context(struct pmu
 	 * leader will enable its siblings, even if those are still on the old
 	 * context.
 	 */
-	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+	list_for_each_entry_safe(event, tmp, events, migrate_entry) {
 		if (event->group_leader == event)
 			continue;
 
 		list_del(&event->migrate_entry);
-		if (event->state >= PERF_EVENT_STATE_OFF)
-			event->state = PERF_EVENT_STATE_INACTIVE;
-		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
-		get_ctx(dst_ctx);
+		__perf_pmu_install_event(pmu, ctx, cpu, event);
 	}
 
 	/*
 	 * Once all the siblings are setup properly, install the group leaders
 	 * to make it go.
 	 */
-	list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+	list_for_each_entry_safe(event, tmp, events, migrate_entry) {
 		list_del(&event->migrate_entry);
-		if (event->state >= PERF_EVENT_STATE_OFF)
-			event->state = PERF_EVENT_STATE_INACTIVE;
-		account_event_cpu(event, dst_cpu);
-		perf_install_in_context(dst_ctx, event, dst_cpu);
-		get_ctx(dst_ctx);
+		__perf_pmu_install_event(pmu, ctx, cpu, event);
 	}
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+	struct perf_event_context *src_ctx, *dst_ctx;
+	LIST_HEAD(events);
+
+	src_ctx = &per_cpu_ptr(&cpu_context, src_cpu)->ctx;
+	dst_ctx = &per_cpu_ptr(&cpu_context, dst_cpu)->ctx;
+
+	/*
+	 * See perf_event_ctx_lock() for comments on the details
+	 * of swizzling perf_event::ctx.
+	 */
+	mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+	__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+	/*
+	 * Wait for the events to quiesce before re-instating them.
+	 */
+	synchronize_rcu();
+
+	__perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
 	mutex_unlock(&dst_ctx->mutex);
 	mutex_unlock(&src_ctx->mutex);
-#endif
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
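
For context on who exercises this path: perf_pmu_migrate_context() is called
by uncore-style PMU drivers from their CPU hotplug callbacks, to move events
off a CPU that is going down. A simplified, hypothetical sketch of such a
caller -- "box", box->event_cpu and my_uncore_cpu_offline() are made up for
illustration; the real users are e.g. the x86 uncore drivers:

static int my_uncore_cpu_offline(unsigned int cpu)
{
	int target;

	/* nothing to do unless this CPU carries the box's events */
	if (cpu != box->event_cpu)
		return 0;

	/* pick another online CPU in the same package to take over */
	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
	if (target >= nr_cpu_ids)
		return 0;

	perf_pmu_migrate_context(&box->pmu, cpu, target);
	box->event_cpu = target;
	return 0;
}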