On 11-Oct-22 11:17 PM, Peter Zijlstra wrote:
> On Tue, Oct 11, 2022 at 04:02:56PM +0200, Peter Zijlstra wrote:
>> On Tue, Oct 11, 2022 at 06:49:55PM +0530, Ravi Bangoria wrote:
>>> On 11-Oct-22 4:59 PM, Peter Zijlstra wrote:
>>>> On Sat, Oct 08, 2022 at 11:54:24AM +0530, Ravi Bangoria wrote:
>>>>
>>>>> +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
>>>>> +					   struct perf_event_context *next_ctx)
>>>>> +{
>>>>> +	struct perf_event_pmu_context *prev_epc, *next_epc;
>>>>> +
>>>>> +	if (!prev_ctx->nr_task_data)
>>>>> +		return;
>>>>> +
>>>>> +	prev_epc = list_first_entry(&prev_ctx->pmu_ctx_list,
>>>>> +				    struct perf_event_pmu_context,
>>>>> +				    pmu_ctx_entry);
>>>>> +	next_epc = list_first_entry(&next_ctx->pmu_ctx_list,
>>>>> +				    struct perf_event_pmu_context,
>>>>> +				    pmu_ctx_entry);
>>>>> +
>>>>> +	while (&prev_epc->pmu_ctx_entry != &prev_ctx->pmu_ctx_list &&
>>>>> +	       &next_epc->pmu_ctx_entry != &next_ctx->pmu_ctx_list) {
>>>>> +
>>>>> +		WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu);
>>>>> +
>>>>> +		/*
>>>>> +		 * PMU specific parts of task perf context can require
>>>>> +		 * additional synchronization. As an example of such
>>>>> +		 * synchronization see implementation details of Intel
>>>>> +		 * LBR call stack data profiling;
>>>>> +		 */
>>>>> +		if (prev_epc->pmu->swap_task_ctx)
>>>>> +			prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
>>>>> +		else
>>>>> +			swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
>>>>
>>>> Did I forget to advance the iterators here?
>>>
>>> Yeah. Seems so. I overlooked it too.
>>
>> OK; so I'm not slowly going crazy staring at this code ;-) Let me go add
>> it now then. :-)
>>
>> But first I gotta taxi the kids around for a bit, bbl.
>
> OK, so I've been going over the perf_event_pmu_context life-time thing
> as well, there were a bunch of XXXs there and I'm not sure Im happy with
> things, but I'd also forgotten most of it.
>
> Ideally epc works like it's a regular member of ctx -- locking wise that
> is, but I'm not sure we can make that stick -- see the ctx->mutex issues
> we have with put_ctx().
>
> As such, I'm going to have to re-audit all the epc usage to see if
> pure ctx->lock is sufficient.
>
> I did do make epc RCU freed, because pretty much everything is and that
> was easy enough to make happen -- it means we don't need to worry about
> that.
>
> But I'm going cross-eyes from staring at this all day, so more tomorrow.
> The below is what I currently have.
>
> ---
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -833,13 +833,13 @@ struct perf_event {
>   *	`--------[1:n]---------'    `-[n:1]-> pmu <-[1:n]-'
>   *
>   *
> - * XXX destroy epc when empty
> - *     refcount, !rcu
> + * epc lifetime is refcount based and RCU freed (similar to perf_event_context).
> + * epc locking is as if it were a member of perf_event_context; specifically:
>   *
> - * XXX epc locking
> + *   modification, both: ctx->mutex && ctx->lock
> + *   reading, either:    ctx->mutex || ctx->lock
>   *
> - *   event->pmu_ctx       ctx->mutex && inactive
> - *   ctx->pmu_ctx_list    ctx->mutex && ctx->lock
> + * XXX except this isn't true ... see put_pmu_ctx().
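
The "reading, either: ctx->mutex || ctx->lock" rule is easy to get wrong
when touching this code, so once the put_pmu_ctx() hole is plugged, would
it be worth encoding it as a lockdep assert the readers can call?
Something like below (completely untested sketch; the helper name is made
up by me):

static inline void perf_epc_assert_read_locked(struct perf_event_pmu_context *epc)
{
	struct perf_event_context *ctx = epc->ctx;

	/* reading epc state requires either ctx->mutex or ctx->lock */
	lockdep_assert(lockdep_is_held(&ctx->mutex) ||
		       lockdep_is_held(&ctx->lock));
}

Anyway, just a thought; the new comment already reads much better than
the old XXXs.
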
>   *
>   */
>  struct perf_event_pmu_context {
> @@ -857,6 +857,7 @@ struct perf_event_pmu_context {
>  	unsigned int			nr_events;
>
>  	atomic_t			refcount; /* event <-> epc */
> +	struct rcu_head			rcu_head;
>
>  	void *task_ctx_data; /* pmu specific data */
>  	/*
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -1727,6 +1727,10 @@ perf_event_groups_next(struct perf_event
>  	return NULL;
>  }
>
> +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)		\
> +	for (event = perf_event_groups_first(groups, cpu, pmu, NULL);	\
> +	     event; event = perf_event_groups_next(event, pmu))
> +
>  /*
>   * Iterate through the whole groups tree.
>   */
> @@ -3366,6 +3370,14 @@ static void perf_event_sync_stat(struct
>  	}
>  }
>
> +#define list_for_each_entry_double(pos1, pos2, head1, head2, member)	\
> +	for (pos1 = list_first_entry(head1, typeof(*pos1), member),	\
> +	     pos2 = list_first_entry(head2, typeof(*pos2), member);	\
> +	     !list_entry_is_head(pos1, head1, member) &&		\
> +	     !list_entry_is_head(pos2, head2, member);			\
> +	     pos1 = list_next_entry(pos1, member),			\
> +	     pos2 = list_next_entry(pos2, member))
> +
>  static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
>  					  struct perf_event_context *next_ctx)
>  {
> @@ -3374,16 +3386,9 @@ static void perf_event_swap_task_ctx_dat
>  	if (!prev_ctx->nr_task_data)
>  		return;
>
> -	prev_epc = list_first_entry(&prev_ctx->pmu_ctx_list,
> -				    struct perf_event_pmu_context,
> -				    pmu_ctx_entry);
> -	next_epc = list_first_entry(&next_ctx->pmu_ctx_list,
> -				    struct perf_event_pmu_context,
> -				    pmu_ctx_entry);
> -
> -	while (&prev_epc->pmu_ctx_entry != &prev_ctx->pmu_ctx_list &&
> -	       &next_epc->pmu_ctx_entry != &next_ctx->pmu_ctx_list) {
> -
> +	list_for_each_entry_double(prev_epc, next_epc,
> +				   &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
> +				   pmu_ctx_entry) {

There are more places which can use list_for_each_entry_double(). I'll
fix those.

>  		WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu);
>
>  		/*
> @@ -3706,7 +3711,6 @@ static noinline int visit_groups_merge(s
>  		perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
>  	}
>
> -
>  	min_heapify_all(&event_heap, &perf_min_heap);
>
>  	while (event_heap.nr) {
> @@ -3845,7 +3849,6 @@ ctx_sched_in(struct perf_event_context *
>  		/* start ctx time */
>  		__update_context_time(ctx, false);
>  		perf_cgroup_set_timestamp(cpuctx);
> -		// XXX ctx->task =? task
>  		/*
>  		 * CPU-release for the below ->is_active store,
>  		 * see __load_acquire() in perf_event_time_now()
> @@ -4815,6 +4818,15 @@ find_get_pmu_context(struct pmu *pmu, st
>
>  	__perf_init_event_pmu_context(new, pmu);
>
> +	/*
> +	 * XXX
> +	 *
> +	 *   lockdep_assert_held(&ctx->mutex);
> +	 *
> +	 * can't because perf_event_init_task() doesn't actually hold the
> +	 * child_ctx->mutex.
> +	 */
> +
>  	raw_spin_lock_irq(&ctx->lock);
>  	list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (epc->pmu == pmu) {
> @@ -4849,6 +4861,14 @@ static void get_pmu_ctx(struct perf_even
>  	WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
>  }
>
> +static void free_epc_rcu(struct rcu_head *head)
> +{
> +	struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
> +
> +	kfree(epc->task_ctx_data);
> +	kfree(epc);
> +}
> +
>  static void put_pmu_ctx(struct perf_event_pmu_context *epc)
>  {
>  	unsigned long flags;
> @@ -4859,7 +4879,14 @@ static void put_pmu_ctx(struct perf_even
>  	if (epc->ctx) {
>  		struct perf_event_context *ctx = epc->ctx;
>
> -		// XXX ctx->mutex
> +		/*
> +		 * XXX
> +		 *
> +		 *   lockdep_assert_held(&ctx->mutex);
> +		 *
> +		 * can't because of the call-site in _free_event()/put_event()
> +		 * which isn't always called under ctx->mutex.
> +		 */

Yes. I came across the same issue and could not figure out how to solve
it, so I just kept the XXX as is.

>
>  		WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
>  		raw_spin_lock_irqsave(&ctx->lock, flags);
> @@ -4874,17 +4901,15 @@ static void put_pmu_ctx(struct perf_even
>  	if (epc->embedded)
>  		return;
>
> -	kfree(epc->task_ctx_data);
> -	kfree(epc);
> +	call_rcu(&epc->rcu_head, free_epc_rcu);
>  }
>
>  static void perf_event_free_filter(struct perf_event *event);
>
>  static void free_event_rcu(struct rcu_head *head)
>  {
> -	struct perf_event *event;
> +	struct perf_event *event = container_of(head, typeof(*event), rcu_head);
>
> -	event = container_of(head, struct perf_event, rcu_head);
>  	if (event->ns)
>  		put_pid_ns(event->ns);
>  	perf_event_free_filter(event);
> @@ -12643,13 +12668,6 @@ perf_event_create_kernel_counter(struct
>  		goto err_alloc;
>  	}
>
> -	pmu_ctx = find_get_pmu_context(pmu, ctx, event);
> -	if (IS_ERR(pmu_ctx)) {
> -		err = PTR_ERR(pmu_ctx);
> -		goto err_ctx;
> -	}
> -	event->pmu_ctx = pmu_ctx;
> -
>  	WARN_ON_ONCE(ctx->parent_ctx);
>  	mutex_lock(&ctx->mutex);
>  	if (ctx->task == TASK_TOMBSTONE) {
> @@ -12657,6 +12675,13 @@ perf_event_create_kernel_counter(struct
>  		goto err_unlock;
>  	}
>
> +	pmu_ctx = find_get_pmu_context(pmu, ctx, event);
> +	if (IS_ERR(pmu_ctx)) {
> +		err = PTR_ERR(pmu_ctx);
> +		goto err_unlock;
> +	}
> +	event->pmu_ctx = pmu_ctx;

We should call find_get_pmu_context() with ctx->mutex held, and that is
what the above perf_event_create_kernel_counter() change is for. Is my
understanding correct? (I've tried to spell out the resulting flow right
below the quoted hunk.)

> +
>  	if (!task) {
>  		/*
>  		 * Check if the @cpu we're creating an event for is online.
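
Just so we're on the same page, this is how I read the end result in
perf_event_create_kernel_counter() -- a condensed paraphrase of the hunks
above and the error-path hunk below, not new code (error labels as in the
patch):

	mutex_lock(&ctx->mutex);
	/* existing TASK_TOMBSTONE check -> goto err_unlock */

	pmu_ctx = find_get_pmu_context(pmu, ctx, event);	/* now under ctx->mutex */
	if (IS_ERR(pmu_ctx)) {
		err = PTR_ERR(pmu_ctx);
		goto err_unlock;
	}
	event->pmu_ctx = pmu_ctx;

	/* cpu-online / exclusive checks, perf_install_in_context(), ... */

err_pmu_ctx:
	put_pmu_ctx(pmu_ctx);
err_unlock:
	mutex_unlock(&ctx->mutex);

IOW any failure after find_get_pmu_context() now has to unwind through
err_pmu_ctx so the epc reference isn't leaked.
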
> @@ -12668,13 +12693,13 @@ perf_event_create_kernel_counter(struct
>  			container_of(ctx, struct perf_cpu_context, ctx);
>  		if (!cpuctx->online) {
>  			err = -ENODEV;
> -			goto err_unlock;
> +			goto err_pmu_ctx;
>  		}
>  	}
>
>  	if (!exclusive_event_installable(event, ctx)) {
>  		err = -EBUSY;
> -		goto err_unlock;
> +		goto err_pmu_ctx;
>  	}
>
>  	perf_install_in_context(ctx, event, event->cpu);
> @@ -12683,9 +12708,10 @@ perf_event_create_kernel_counter(struct
>
>  	return event;
>
> +err_pmu_ctx:
> +	put_pmu_ctx(pmu_ctx);
>  err_unlock:
>  	mutex_unlock(&ctx->mutex);
> -err_ctx:
>  	perf_unpin_context(ctx);
>  	put_ctx(ctx);
>  err_alloc:
> @@ -12702,9 +12728,7 @@ static void __perf_pmu_remove(struct per
>  {
>  	struct perf_event *event, *sibling;
>
> -	for (event = perf_event_groups_first(groups, cpu, pmu, NULL);
> -	     event; event = perf_event_groups_next(event, pmu)) {
> -
> +	perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
>  		perf_remove_from_context(event, 0);
>  		unaccount_event_cpu(event, cpu);
>  		put_pmu_ctx(event->pmu_ctx);
> @@ -12998,7 +13022,7 @@ void perf_event_free_task(struct task_st
>  	struct perf_event_context *ctx;
>  	struct perf_event *event, *tmp;
>
> -	ctx = rcu_dereference(task->perf_event_ctxp);
> +	ctx = rcu_access_pointer(task->perf_event_ctxp);

We do dereference the ctx pointer here, but only with ctx->mutex and
ctx->lock held, so rcu_access_pointer() is sufficient. Is my
understanding correct?

Thanks,
Ravi