On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:

> Let me ponder that a little bit.

So I did the thing on top of the get/put thing that would allow you to
get rid of the ->closed thing, and before I was finished I already
hated all of it :-(

The thing is, if you're going to the effort of adding get/put and
putting the responsibility on the implementation, then the ->closed
thing is only a little extra ask.

So... I wondered, how hard it would be for perf_pmu_unregister() to do
what you want.

Time passed, hacks were done... and while what I have is still riddled
with holes (more work is definitely required), it does pass your
dummy_pmu test script.

I'll poke at this a little longer. Afaict it should be possible to
make this good enough for what you want. Just needs more holes filled.

---
 include/linux/perf_event.h |  13 ++-
 kernel/events/core.c       | 260 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 228 insertions(+), 45 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..20995d33d27e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
 struct pmu {
 	struct list_head		entry;
 
+	spinlock_t			events_lock;
+	struct list_head		events;
+
 	struct module			*module;
 	struct device			*dev;
 	struct device			*parent;
@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
  * enum perf_event_state - the states of an event:
  */
 enum perf_event_state {
-	PERF_EVENT_STATE_DEAD		= -4,
-	PERF_EVENT_STATE_EXIT		= -3,
-	PERF_EVENT_STATE_ERROR		= -2,
+	PERF_EVENT_STATE_DEAD		= -5,
+	PERF_EVENT_STATE_REVOKED	= -4, /* pmu gone, must not touch */
+	PERF_EVENT_STATE_EXIT		= -3, /* task died, still inherit */
+	PERF_EVENT_STATE_ERROR		= -2, /* scheduling error, can enable */
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
 	PERF_EVENT_STATE_ACTIVE		=  1,
@@ -853,6 +857,7 @@ struct perf_event {
 	void *security;
 #endif
 	struct list_head		sb_list;
+	struct list_head		pmu_list;
 
 	/*
	 * Certain events gets forwarded to another pmu internally by over-
@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 extern void perf_event_itrace_started(struct perf_event *event);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
 
 extern void __perf_event_task_sched_in(struct task_struct *prev,
 				       struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..e66827367a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
 
 #define DETACH_GROUP	0x01UL
 #define DETACH_CHILD	0x02UL
-#define DETACH_DEAD	0x04UL
+#define DETACH_EXIT	0x04UL
+#define DETACH_REVOKE	0x08UL
+#define DETACH_DEAD	0x10UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
 			   void *info)
 {
 	struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+	enum perf_event_state state = PERF_EVENT_STATE_OFF;
 	unsigned long flags = (unsigned long)info;
 
 	ctx_time_update(cpuctx, ctx);
@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
 	 * Ensure event_sched_out() switches to OFF, at the very least
 	 * this avoids raising perf_pending_task() at this time.
 	 */
-	if (flags & DETACH_DEAD)
+	if (flags & DETACH_EXIT)
+		state = PERF_EVENT_STATE_EXIT;
+	if (flags & DETACH_REVOKE)
+		state = PERF_EVENT_STATE_REVOKED;
+	if (flags & DETACH_DEAD) {
 		event->pending_disable = 1;
+		state = PERF_EVENT_STATE_DEAD;
+	}
 	event_sched_out(event, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	if (flags & DETACH_CHILD)
 		perf_child_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_DEAD)
-		event->state = PERF_EVENT_STATE_DEAD;
+
+	event->state = state;
 
 	if (!pmu_ctx->nr_events) {
 		pmu_ctx->rotate_necessary = 0;
@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 static void perf_remove_from_owner(struct perf_event *event);
 static void perf_event_exit_event(struct perf_event *event,
-				  struct perf_event_context *ctx);
+				  struct perf_event_context *ctx,
+				  bool revoke);
 
 /*
  * Removes all events from the current task that have been marked
@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
 
 		modified = true;
 
-		perf_event_exit_event(event, ctx);
+		perf_event_exit_event(event, ctx, false);
 	}
 
 	raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
 	    attr->context_switch || attr->text_poke ||
 	    attr->bpf_event)
 		return true;
+
 	return false;
 }
 
@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
 
 static void _free_event(struct perf_event *event)
 {
+	struct pmu *pmu = event->pmu;
+
 	irq_work_sync(&event->pending_irq);
 	irq_work_sync(&event->pending_disable_irq);
 	perf_pending_task_sync(event);
@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
 	security_perf_event_free(event);
 
 	if (event->rb) {
+		WARN_ON_ONCE(!pmu);
 		/*
 		 * Can happen when we close an event with re-directed output.
 		 *
@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
 		put_callchain_buffers();
 	}
 
-	perf_event_free_bpf_prog(event);
-	perf_addr_filters_splice(event, NULL);
-	kfree(event->addr_filter_ranges);
+	if (pmu) {
+		perf_event_free_bpf_prog(event);
+		perf_addr_filters_splice(event, NULL);
+		kfree(event->addr_filter_ranges);
+	}
 
-	if (event->destroy)
+	if (event->destroy) {
+		WARN_ON_ONCE(!pmu);
 		event->destroy(event);
+	}
 
 	/*
 	 * Must be after ->destroy(), due to uprobe_perf_close() using
@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
 	if (event->hw.target)
 		put_task_struct(event->hw.target);
 
-	if (event->pmu_ctx)
+	if (event->pmu_ctx) {
+		WARN_ON_ONCE(!pmu);
 		put_pmu_ctx(event->pmu_ctx);
+	}
 
 	/*
 	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	exclusive_event_destroy(event);
-	module_put(event->pmu->module);
+	if (pmu) {
+		exclusive_event_destroy(event);
+		module_put(pmu->module);
+		scoped_guard(spinlock, &pmu->events_lock) {
+			list_del(&event->pmu_list);
+			wake_up_var(pmu);
+		}
+	}
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	if (event->state > PERF_EVENT_STATE_REVOKED) {
+		perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+	} else {
+		event->state = PERF_EVENT_STATE_DEAD;
+	}
 
 	perf_event_ctx_unlock(event, ctx);
 
@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
-	if (event->state == PERF_EVENT_STATE_ERROR)
+	if (event->state <= PERF_EVENT_STATE_ERROR)
 		return 0;
 
 	if (count < event->read_size)
@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
 	struct perf_buffer *rb;
 	__poll_t events = EPOLLHUP;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	poll_wait(file, &event->waitq, wait);
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return EPOLLERR;
+
 	if (is_event_hup(event))
 		return events;
 
@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
 }
 
 static int perf_event_set_output(struct perf_event *event,
-				 struct perf_event *output_event);
+				 struct perf_event *output_event, bool force);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			  struct perf_event_attr *attr);
@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	void (*func)(struct perf_event *);
 	u32 flags = arg;
 
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
+
 	switch (cmd) {
 	case PERF_EVENT_IOC_ENABLE:
 		func = _perf_event_enable;
@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 		if (ret)
 			return ret;
 		output_event = fd_file(output)->private_data;
-		ret = perf_event_set_output(event, output_event);
+		ret = perf_event_set_output(event, output_event, false);
 		fdput(output);
 	} else {
-		ret = perf_event_set_output(event, NULL);
+		ret = perf_event_set_output(event, NULL, false);
 	}
 	return ret;
 }
@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	unsigned long size = perf_data_size(rb);
 	bool detach_rest = false;
 
+	/* FIXIES vs perf_pmu_unregister() */
 	if (event->pmu->event_unmapped)
 		event->pmu->event_unmapped(event, vma->vm_mm);
 
@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra = 0, extra = 0;
-	int ret = 0, flags = 0;
+	int ret, flags = 0;
+
+	ret = security_perf_event_read(event);
+	if (ret)
+		return ret;
+
+	/* XXX needs event->mmap_mutex? */
+	if (event->state <= PERF_EVENT_STATE_REVOKED)
+		return -ENODEV;
 
 	/*
	 * Don't allow mmap() of inherited per-task counters. This would
This would @@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; - ret = security_perf_event_read(event); - if (ret) - return ret; - vma_size = vma->vm_end - vma->vm_start; if (vma->vm_pgoff == 0) { @@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; + if (event->state <= PERF_EVENT_STATE_REVOKED) + return -ENODEV; + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); inode_unlock(inode); @@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event) static void free_pmu_context(struct pmu *pmu) { + free_percpu(pmu->pmu_disable_count); free_percpu(pmu->cpu_pmu_context); } @@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (type >= 0) max = type; + // XXX broken vs perf_init_event(), this publishes before @pmu is finalized ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL); if (ret < 0) goto free_pdc; @@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; - list_add_rcu(&pmu->entry, &pmus); atomic_set(&pmu->exclusive_cnt, 0); + INIT_LIST_HEAD(&pmu->events); + spin_lock_init(&pmu->events_lock); + + /* + * Publish the pmu after it is fully initialized. + */ + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } EXPORT_SYMBOL_GPL(perf_pmu_register); -void perf_pmu_unregister(struct pmu *pmu) +static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event, + struct perf_event_context *ctx) { + /* + * De-schedule the event and mark it EXIT_PMU. + */ + perf_event_exit_event(event, ctx, true); + + /* + * All _free_event() bits that rely on event->pmu: + */ + perf_event_set_output(event, NULL, true); + + perf_event_free_bpf_prog(event); + perf_addr_filters_splice(event, NULL); + kfree(event->addr_filter_ranges); + + if (event->destroy) { + event->destroy(event); + event->destroy = NULL; + } + + if (event->pmu_ctx) { + /* + * Make sure pmu->cpu_pmu_context is unused. An alternative is + * to have it be pointers and make put_pmu_ctx()'s + * epc->embedded case be another RCU free case. 
+		 */
+		put_pmu_ctx(event->pmu_ctx);
+		event->pmu_ctx = NULL;
+	}
+
+	exclusive_event_destroy(event);
+	module_put(pmu->module);
+
+	event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+	struct perf_event_context *ctx;
+
+	ctx = perf_event_ctx_lock(event);
+	__pmu_detach_event(pmu, event, ctx);
+	perf_event_ctx_unlock(event, ctx);
+
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	guard(spinlock)(&pmu->events_lock);
+	list_for_each_entry(event, &pmu->events, pmu_list) {
+		if (atomic_long_inc_not_zero(&event->refcount))
+			return event;
+	}
+
+	return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+	guard(spinlock)(&pmu->events_lock);
+	return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+	struct perf_event *event;
+
+	for (;;) {
+		event = pmu_get_event(pmu);
+		if (!event)
+			break;
+
+		pmu_detach_event(pmu, event);
+		put_event(event);
+	}
+
+	/*
+	 * wait for pending _free_event()s
+	 */
+	wait_var_event(pmu, pmu_empty(pmu));
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+	/*
+	 * FIXME!
+	 *
+	 * perf_mmap_close(); event->pmu->event_unmapped()
+	 *
+	 * XXX this check is racy vs perf_event_alloc()
+	 */
+	if (pmu->event_unmapped && !pmu_empty(pmu))
+		return -EBUSY;
+
 	mutex_lock(&pmus_lock);
 	list_del_rcu(&pmu->entry);
+	idr_remove(&pmu_idr, pmu->type);
+	mutex_unlock(&pmus_lock);
 
 	/*
	 * We dereference the pmu list under both SRCU and regular RCU, so
@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_srcu(&pmus_srcu);
 	synchronize_rcu();
 
-	free_percpu(pmu->pmu_disable_count);
-	idr_remove(&pmu_idr, pmu->type);
+	/*
+	 * PMU is removed from the pmus list, so no new events will
+	 * be created, now take care of the existing ones.
+	 */
+	pmu_detach_events(pmu);
+
+	/*
+	 * PMU is unused, make it go away.
+	 */
 	if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
 		if (pmu->nr_addr_filters)
 			device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
 		device_del(pmu->dev);
 		put_device(pmu->dev);
 	}
+
+	/*
+	 * XXX -- broken? can still contain SW events at this point?
+	 * audit more, make sure DETACH_GROUP DTRT
+	 */
 	free_pmu_context(pmu);
-	mutex_unlock(&pmus_lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
 
@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
+	scoped_guard(spinlock, &pmu->events_lock)
+		list_add(&event->pmu_list, &pmu->events);
+
 	return event;
 
err_callchain_buffer:
@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 }
 
 static int
-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
 {
 	struct perf_buffer *rb = NULL;
 	int ret = -EINVAL;
@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 		mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
 	/* Can't redirect output if we've got an active mmap() */
-	if (atomic_read(&event->mmap_count))
-		goto unlock;
+	if (atomic_read(&event->mmap_count)) {
+		if (!force)
+			goto unlock;
+
+		WARN_ON_ONCE(event->pmu->event_unmapped);
+	}
 
 	if (output_event) {
 		/* get the rb we want to redirect to */
@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		if (err)
 			goto err_fd;
 		group_leader = fd_file(group)->private_data;
+		if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+			err = -ENODEV;
+			goto err_group_fd;
+		}
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	event->pmu_ctx = pmu_ctx;
 
 	if (output_event) {
-		err = perf_event_set_output(event, output_event);
+		err = perf_event_set_output(event, output_event, false);
 		if (err)
 			goto err_context;
 	}
@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
 }
 
 static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+		      struct perf_event_context *ctx, bool revoke)
 {
 	struct perf_event *parent_event = event->parent;
-	unsigned long detach_flags = 0;
+	unsigned long detach_flags = DETACH_EXIT;
 
 	if (parent_event) {
 		/*
@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
		 * Do destroy all inherited groups, we don't care about those
		 * and being thorough is better.
		 */
-		detach_flags = DETACH_GROUP | DETACH_CHILD;
+		detach_flags |= DETACH_GROUP | DETACH_CHILD;
 		mutex_lock(&parent_event->child_mutex);
 	}
 
-	perf_remove_from_context(event, detach_flags);
+	if (revoke)
+		detach_flags |= DETACH_GROUP | DETACH_REVOKE;
 
-	raw_spin_lock_irq(&ctx->lock);
-	if (event->state > PERF_EVENT_STATE_EXIT)
-		perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
-	raw_spin_unlock_irq(&ctx->lock);
+	perf_remove_from_context(event, detach_flags);
 
 	/*
	 * Child events can be freed.
@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
 	perf_event_task(child, child_ctx, 0);
 
 	list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-		perf_event_exit_event(child_event, child_ctx, false);
+		perf_event_exit_event(child_event, child_ctx, false);
 
 	mutex_unlock(&child_ctx->mutex);
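
[Editorial illustration, not part of the patch: with perf_pmu_unregister()
now returning int, a driver module's exit path can act on the result. A
minimal sketch, assuming a hypothetical "dummy" PMU module in the spirit
of the dummy_pmu test; perf_pmu_register()/perf_pmu_unregister() are the
real API, everything else here is made up and the pmu callbacks are
elided.]

	#include <linux/module.h>
	#include <linux/perf_event.h>

	/* sketch only: a real pmu needs .event_init, .add, .del, ... */
	static struct pmu dummy_pmu;

	static int __init dummy_init(void)
	{
		/* type -1: ask for a dynamically allocated pmu type id */
		return perf_pmu_register(&dummy_pmu, "dummy", -1);
	}

	static void __exit dummy_exit(void)
	{
		/*
		 * With the patch above, unregister fails (-EBUSY) only for
		 * a pmu with an ->event_unmapped() callback that still has
		 * live events; otherwise the core detaches and revokes the
		 * remaining events and returns 0.
		 */
		WARN_ON(perf_pmu_unregister(&dummy_pmu));
	}

	module_init(dummy_init);
	module_exit(dummy_exit);
	MODULE_LICENSE("GPL");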