On Tue, Mar 23, 2021 at 11:41AM +0100, Marco Elver wrote: > On Tue, 23 Mar 2021 at 11:32, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote: [...] > > > + if (parent_event) { > > > /* > > > + * Remove event from parent, to avoid race where the > > > + * parent concurrently iterates through its children to > > > + * enable, disable, or otherwise modify an event. > > > */ > > > + mutex_lock(&parent_event->child_mutex); > > > + list_del_init(&event->child_list); > > > + mutex_unlock(&parent_event->child_mutex); > > > } > > > > ^^^ this, right? > > > > But that's something perf_event_exit_event() alread does. So then you're > > worried about the order of things. > > Correct. We somehow need to prohibit the parent from doing an > event_function_call() while we potentially deactivate the context with > perf_remove_from_context(). > > > > + > > > + perf_remove_from_context(event, !!event->parent * DETACH_GROUP); > > > + perf_event_exit_event(event, ctx, current, true); > > > } > > > > perf_event_release_kernel() first does perf_remove_from_context() and > > then clears the child_list, and that makes sense because if we're there, > > there's no external access anymore, the filedesc is gone and nobody will > > be iterating child_list anymore. > > > > perf_event_exit_task_context() and perf_event_exit_event() OTOH seem to > > rely on ctx->task == TOMBSTONE to sabotage event_function_call() such > > that if anybody is iterating the child_list, it'll NOP out. > > > > But here we don't have neither, and thus need to worry about the order > > vs child_list iteration. > > > > I suppose we should stick sync_child_event() in there as well. > > > > And at that point there's very little value in still using > > perf_event_exit_event()... let me see if there's something to be done > > about that. > > I don't mind dropping use of perf_event_exit_event() and open coding > all of this. That would also avoid modifying perf_event_exit_event(). > > But I leave it to you what you think is nicest. I played a bit more with it, and the below would be the version without using perf_event_exit_event(). Perhaps it isn't too bad. Thanks, -- Marco ------ >8 ------ diff --git a/kernel/events/core.c b/kernel/events/core.c index aa47e111435e..288b61820dab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2165,8 +2165,9 @@ static void perf_group_detach(struct perf_event *event) * If this is a sibling, remove it from its group. */ if (leader != event) { + leader->nr_siblings--; list_del_init(&event->sibling_list); - event->group_leader->nr_siblings--; + event->group_leader = event; goto out; } @@ -2180,8 +2181,9 @@ static void perf_group_detach(struct perf_event *event) if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); - sibling->group_leader = sibling; + leader->nr_siblings--; list_del_init(&sibling->sibling_list); + sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; @@ -2358,10 +2360,19 @@ __perf_remove_from_context(struct perf_event *event, static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; + bool remove; lockdep_assert_held(&ctx->mutex); - event_function_call(event, __perf_remove_from_context, (void *)flags); + /* + * There is concurrency vs remove_on_exec(). + */ + raw_spin_lock_irq(&ctx->lock); + remove = (event->attach_state & PERF_ATTACH_CONTEXT); + raw_spin_unlock_irq(&ctx->lock); + + if (remove) + event_function_call(event, __perf_remove_from_context, (void *)flags); /* * The above event_function_call() can NO-OP when it hits @@ -4196,43 +4207,86 @@ static void perf_event_enable_on_exec(int ctxn) } static void perf_remove_from_owner(struct perf_event *event); -static void perf_event_exit_event(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child); +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child); +static void free_event(struct perf_event *event); /* * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ -static void perf_event_remove_on_exec(void) +static void perf_event_remove_on_exec(int ctxn) { - int ctxn; + struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event *event, *next; + LIST_HEAD(free_list); + unsigned long flags; + bool modified = false; - for_each_task_context_nr(ctxn) { - struct perf_event_context *ctx; - struct perf_event *event, *next; + ctx = perf_pin_task_context(current, ctxn); + if (!ctx) + return; - ctx = perf_pin_task_context(current, ctxn); - if (!ctx) + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + struct perf_event *parent_event = event->parent; + + if (!event->attr.remove_on_exec) continue; - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { - if (!event->attr.remove_on_exec) - continue; + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; - if (!is_kernel_event(event)) - perf_remove_from_owner(event); - perf_remove_from_context(event, DETACH_GROUP); + if (parent_event) { /* - * Remove the event and feed back its values to the - * parent event. + * Remove event from parent *before* modifying contexts, + * to avoid race where the parent concurrently iterates + * through its children to enable, disable, or otherwise + * modify an event. */ - perf_event_exit_event(event, ctx, current); + + sync_child_event(event, current); + + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent_event->child_mutex); + + perf_event_wakeup(parent_event); + put_event(parent_event); } - mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + perf_remove_from_context(event, !!event->parent * DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); + WARN_ON_ONCE(ctx->is_active); + perf_event_set_state(event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ + raw_spin_unlock_irq(&ctx->lock); + + if (parent_event) + free_event(event); + else + perf_event_wakeup(event); } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + if (clone_ctx) + put_ctx(clone_ctx); } struct perf_read_data { @@ -7581,20 +7635,18 @@ void perf_event_exec(void) struct perf_event_context *ctx; int ctxn; - rcu_read_lock(); for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - perf_event_enable_on_exec(ctxn); + perf_event_remove_on_exec(ctxn); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, + NULL, true); + } + rcu_read_unlock(); } - rcu_read_unlock(); - - perf_event_remove_on_exec(); } struct remote_output {