On Wed, Oct 16, 2024 at 02:03:02PM +0200, Peter Zijlstra wrote:
On Mon, Oct 14, 2024 at 09:25:19PM +0200, Peter Zijlstra wrote:
Let me ponder that a little bit.
So I did the thing on top of the get/put thing that would allow you to
get rid of the ->closed thing, and before I was finished I already hated
all of it :-(
The thing is, if you're going to the effort of adding get/put and
putting the responsibility on the implementation, then the ->closed
thing is only a little extra ask.
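(Roughly, that pattern puts something like this in every driver
callback -- hypothetical sketch, all names invented, loosely modeled on
the i915 ->closed flag:

	struct dummy_pmu {
		struct pmu base;
		struct kref ref;	/* the get/put thing */
		bool closed;		/* the ->closed thing */
	};

	static void dummy_pmu_release(struct kref *ref)
	{
		kfree(container_of(ref, struct dummy_pmu, ref));
	}

	static void dummy_pmu_event_start(struct perf_event *event, int flags)
	{
		struct dummy_pmu *d =
			container_of(event->pmu, struct dummy_pmu, base);

		/* Every callback must first check for unregister... */
		if (READ_ONCE(d->closed))
			return;

		/* ...and pin the backing object while it runs. */
		kref_get(&d->ref);
		/* program the hardware here */
		kref_put(&d->ref, dummy_pmu_release);
	}

and even that check-then-get is racy against unregister, which is part
of why it gets ugly.)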
So... I wondered how hard it would be for perf_pmu_unregister() to do
what you want.
Time passed, hacks were done...
and while what I have is still riddled with holes (more work is
definitely required), it does pass your dummy_pmu test script.
I'll poke at this a little longer. Afaict it should be possible to make
this good enough for what you want. Just needs more holes filled.
---
include/linux/perf_event.h | 13 ++-
kernel/events/core.c | 260 ++++++++++++++++++++++++++++++++++++++-------
2 files changed, 228 insertions(+), 45 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fb908843f209..20995d33d27e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -318,6 +318,9 @@ struct perf_output_handle;
struct pmu {
struct list_head entry;
+ spinlock_t events_lock;
+ struct list_head events;
oh... I looked at several lists and was wondering which one would
contain the events of our pmu. Now I see we didn't have that :)
+
struct module *module;
struct device *dev;
struct device *parent;
@@ -612,9 +615,10 @@ struct perf_addr_filter_range {
* enum perf_event_state - the states of an event:
*/
enum perf_event_state {
- PERF_EVENT_STATE_DEAD = -4,
- PERF_EVENT_STATE_EXIT = -3,
- PERF_EVENT_STATE_ERROR = -2,
+ PERF_EVENT_STATE_DEAD = -5,
+ PERF_EVENT_STATE_REVOKED = -4, /* pmu gone, must not touch */
+ PERF_EVENT_STATE_EXIT = -3, /* task died, still inherit */
+ PERF_EVENT_STATE_ERROR = -2, /* scheduling error, can enable */
PERF_EVENT_STATE_OFF = -1,
PERF_EVENT_STATE_INACTIVE = 0,
PERF_EVENT_STATE_ACTIVE = 1,
@@ -853,6 +857,7 @@ struct perf_event {
void *security;
#endif
struct list_head sb_list;
+ struct list_head pmu_list;
/*
* Certain events gets forwarded to another pmu internally by over-
@@ -1103,7 +1108,7 @@ extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-extern void perf_pmu_unregister(struct pmu *pmu);
+extern int perf_pmu_unregister(struct pmu *pmu);
extern void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index cdd09769e6c5..e66827367a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2406,7 +2406,9 @@ ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
-#define DETACH_DEAD 0x04UL
+#define DETACH_EXIT 0x04UL
+#define DETACH_REVOKE 0x08UL
+#define DETACH_DEAD 0x10UL
/*
* Cross CPU call to remove a performance event
@@ -2421,6 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
void *info)
{
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
+ enum perf_event_state state = PERF_EVENT_STATE_OFF;
unsigned long flags = (unsigned long)info;
ctx_time_update(cpuctx, ctx);
@@ -2429,16 +2432,22 @@ __perf_remove_from_context(struct perf_event *event,
* Ensure event_sched_out() switches to OFF, at the very least
* this avoids raising perf_pending_task() at this time.
*/
- if (flags & DETACH_DEAD)
+ if (flags & DETACH_EXIT)
+ state = PERF_EVENT_STATE_EXIT;
+ if (flags & DETACH_REVOKE)
+ state = PERF_EVENT_STATE_REVOKED;
+ if (flags & DETACH_DEAD) {
event->pending_disable = 1;
+ state = PERF_EVENT_STATE_DEAD;
+ }
event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
- if (flags & DETACH_DEAD)
- event->state = PERF_EVENT_STATE_DEAD;
+
+ event->state = state;
if (!pmu_ctx->nr_events) {
pmu_ctx->rotate_necessary = 0;
@@ -4508,7 +4517,8 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
- struct perf_event_context *ctx);
+ struct perf_event_context *ctx,
+ bool revoke);
/*
* Removes all events from the current task that have been marked
@@ -4535,7 +4545,7 @@ static void perf_event_remove_on_exec(struct perf_event_context *ctx)
modified = true;
- perf_event_exit_event(event, ctx);
+ perf_event_exit_event(event, ctx, false);
}
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -5121,6 +5131,7 @@ static bool is_sb_event(struct perf_event *event)
attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
+
return false;
}
@@ -5321,6 +5332,8 @@ static void perf_pending_task_sync(struct perf_event *event)
static void _free_event(struct perf_event *event)
{
+ struct pmu *pmu = event->pmu;
+
irq_work_sync(&event->pending_irq);
irq_work_sync(&event->pending_disable_irq);
perf_pending_task_sync(event);
@@ -5330,6 +5343,7 @@ static void _free_event(struct perf_event *event)
security_perf_event_free(event);
if (event->rb) {
+ WARN_ON_ONCE(!pmu);
/*
* Can happen when we close an event with re-directed output.
*
@@ -5349,12 +5363,16 @@ static void _free_event(struct perf_event *event)
put_callchain_buffers();
}
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filter_ranges);
+ if (pmu) {
+ perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+ kfree(event->addr_filter_ranges);
+ }
- if (event->destroy)
+ if (event->destroy) {
+ WARN_ON_ONCE(!pmu);
event->destroy(event);
+ }
/*
* Must be after ->destroy(), due to uprobe_perf_close() using
@@ -5363,8 +5381,10 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
- if (event->pmu_ctx)
+ if (event->pmu_ctx) {
+ WARN_ON_ONCE(!pmu);
put_pmu_ctx(event->pmu_ctx);
+ }
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
@@ -5373,8 +5393,14 @@ static void _free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (pmu) {
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+ scoped_guard(spinlock, &pmu->events_lock) {
+ list_del(&event->pmu_list);
+ wake_up_var(pmu);
+ }
+ }
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5492,7 +5518,11 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ if (event->state > PERF_EVENT_STATE_REVOKED) {
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
+ } else {
+ event->state = PERF_EVENT_STATE_DEAD;
+ }
perf_event_ctx_unlock(event, ctx);
@@ -5803,7 +5833,7 @@ __perf_read(struct perf_event *event, char __user *buf, size_t count)
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
- if (event->state == PERF_EVENT_STATE_ERROR)
+ if (event->state <= PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
@@ -5842,8 +5872,14 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
struct perf_buffer *rb;
__poll_t events = EPOLLHUP;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
poll_wait(file, &event->waitq, wait);
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return EPOLLERR;
+
if (is_event_hup(event))
return events;
@@ -6013,7 +6049,7 @@ static inline int perf_fget_light(int fd, struct fd *p)
}
static int perf_event_set_output(struct perf_event *event,
- struct perf_event *output_event);
+ struct perf_event *output_event, bool force);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr);
@@ -6023,6 +6059,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
void (*func)(struct perf_event *);
u32 flags = arg;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
@@ -6065,10 +6104,10 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
if (ret)
return ret;
output_event = fd_file(output)->private_data;
- ret = perf_event_set_output(event, output_event);
+ ret = perf_event_set_output(event, output_event, false);
fdput(output);
} else {
- ret = perf_event_set_output(event, NULL);
+ ret = perf_event_set_output(event, NULL, false);
}
return ret;
}
@@ -6472,6 +6511,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
bool detach_rest = false;
+ /* FIXME vs perf_pmu_unregister() */
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
@@ -6591,7 +6631,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ int ret, flags = 0;
+
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
+ /* XXX needs event->mmap_mutex? */
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6604,10 +6652,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
- ret = security_perf_event_read(event);
- if (ret)
- return ret;
-
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -6810,6 +6854,9 @@ static int perf_fasync(int fd, struct file *filp, int on)
struct perf_event *event = filp->private_data;
int retval;
+ if (event->state <= PERF_EVENT_STATE_REVOKED)
+ return -ENODEV;
+
inode_lock(inode);
retval = fasync_helper(fd, filp, on, &event->fasync);
inode_unlock(inode);
@@ -11513,6 +11560,7 @@ static int perf_event_idx_default(struct perf_event *event)
static void free_pmu_context(struct pmu *pmu)
{
+ free_percpu(pmu->pmu_disable_count);
free_percpu(pmu->cpu_pmu_context);
}
@@ -11753,6 +11801,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (type >= 0)
max = type;
+ // XXX broken vs perf_init_event(), this publishes before @pmu is finalized
ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
if (ret < 0)
goto free_pdc;
@@ -11809,8 +11858,14 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
- list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
+ INIT_LIST_HEAD(&pmu->events);
+ spin_lock_init(&pmu->events_lock);
+
+ /*
+ * Publish the pmu after it is fully initialized.
+ */
+ list_add_rcu(&pmu->entry, &pmus);
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
@@ -11832,10 +11887,110 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
-void perf_pmu_unregister(struct pmu *pmu)
+static void __pmu_detach_event(struct pmu *pmu, struct perf_event *event,
+ struct perf_event_context *ctx)
{
+ /*
+ * De-schedule the event and mark it REVOKED.
+ */
+ perf_event_exit_event(event, ctx, true);
+
+ /*
+ * All _free_event() bits that rely on event->pmu:
+ */
+ perf_event_set_output(event, NULL, true);
+
+ perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+ kfree(event->addr_filter_ranges);
+
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+
+ if (event->pmu_ctx) {
+ /*
+ * Make sure pmu->cpu_pmu_context is unused. An alternative is
+ * to have it be pointers and make put_pmu_ctx()'s
+ * epc->embedded case be another RCU free case.
+ */
+ put_pmu_ctx(event->pmu_ctx);
+ event->pmu_ctx = NULL;
+ }
+
+ exclusive_event_destroy(event);
+ module_put(pmu->module);
+
+ event->pmu = NULL; /* force fault instead of UAF */
+}
+
+static void pmu_detach_event(struct pmu *pmu, struct perf_event *event)
+{
+ struct perf_event_context *ctx;
+
+ ctx = perf_event_ctx_lock(event);
+ __pmu_detach_event(pmu, event, ctx);
+ perf_event_ctx_unlock(event, ctx);
+
+ scoped_guard(spinlock, &pmu->events_lock)
+ list_del(&event->pmu_list);
+}
+
+static struct perf_event *pmu_get_event(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ guard(spinlock)(&pmu->events_lock);
+ list_for_each_entry(event, &pmu->events, pmu_list) {
+ if (atomic_long_inc_not_zero(&event->refcount))
+ return event;
+ }
+
+ return NULL;
+}
+
+static bool pmu_empty(struct pmu *pmu)
+{
+ guard(spinlock)(&pmu->events_lock);
+ return list_empty(&pmu->events);
+}
+
+static void pmu_detach_events(struct pmu *pmu)
+{
+ struct perf_event *event;
+
+ for (;;) {
+ event = pmu_get_event(pmu);
+ if (!event)
+ break;
+
+ pmu_detach_event(pmu, event);
+ put_event(event);
+ }
+
+ /*
+ * wait for pending _free_event()s
+ */
+ wait_var_event(pmu, pmu_empty(pmu));
ok... so now we are synchronously moving the events to a revoked state
during unregister, so we wouldn't need the refcount on the driver side
anymore and can just free the objects right after return.
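Concretely, the unbind path should then collapse to something like this
(hypothetical sketch, names invented; assuming a pmu without
->event_unmapped, so unregister cannot fail with -EBUSY):

	static void dummy_driver_unbind(struct dummy_device *dev)
	{
		/*
		 * Synchronously revokes all events; after this returns
		 * no event can call back into the driver.
		 */
		WARN_ON(perf_pmu_unregister(&dev->pmu));

		/* No driver-side refcount needed anymore. */
		kfree(dev);
	}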
I will give this a try with i915 and/or xe.
thanks
Lucas De Marchi
+}
+
+int perf_pmu_unregister(struct pmu *pmu)
+{
+ /*
+ * FIXME!
+ *
+ * perf_mmap_close(); event->pmu->event_unmapped()
+ *
+ * XXX this check is racy vs perf_event_alloc()
+ */
+ if (pmu->event_unmapped && !pmu_empty(pmu))
+ return -EBUSY;
+
mutex_lock(&pmus_lock);
list_del_rcu(&pmu->entry);
+ idr_remove(&pmu_idr, pmu->type);
+ mutex_unlock(&pmus_lock);
/*
* We dereference the pmu list under both SRCU and regular RCU, so
@@ -11844,16 +11999,29 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- idr_remove(&pmu_idr, pmu->type);
+ /*
+ * PMU is removed from the pmus list, so no new events will
+ * be created, now take care of the existing ones.
+ */
+ pmu_detach_events(pmu);
+
+ /*
+ * PMU is unused, make it go away.
+ */
if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
if (pmu->nr_addr_filters)
device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
put_device(pmu->dev);
}
+
+ /*
+ * XXX -- broken? can still contain SW events at this point?
+ * audit more, make sure DETACH_GROUP DTRT
+ */
free_pmu_context(pmu);
- mutex_unlock(&pmus_lock);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12330,6 +12498,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
+ scoped_guard(spinlock, &pmu->events_lock)
+ list_add(&event->pmu_list, &pmu->events);
+
return event;
err_callchain_buffer:
@@ -12493,7 +12664,7 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
}
static int
-perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
+perf_event_set_output(struct perf_event *event, struct perf_event *output_event, bool force)
{
struct perf_buffer *rb = NULL;
int ret = -EINVAL;
@@ -12549,8 +12720,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
- if (atomic_read(&event->mmap_count))
- goto unlock;
+ if (atomic_read(&event->mmap_count)) {
+ if (!force)
+ goto unlock;
+
+ WARN_ON_ONCE(event->pmu->event_unmapped);
+ }
if (output_event) {
/* get the rb we want to redirect to */
@@ -12740,6 +12915,10 @@ SYSCALL_DEFINE5(perf_event_open,
if (err)
goto err_fd;
group_leader = fd_file(group)->private_data;
+ if (group_leader->state <= PERF_EVENT_STATE_REVOKED) {
+ err = -ENODEV;
+ goto err_group_fd;
+ }
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -12916,7 +13095,7 @@ SYSCALL_DEFINE5(perf_event_open,
event->pmu_ctx = pmu_ctx;
if (output_event) {
- err = perf_event_set_output(event, output_event);
+ err = perf_event_set_output(event, output_event, false);
if (err)
goto err_context;
}
@@ -13287,10 +13466,11 @@ static void sync_child_event(struct perf_event *child_event)
}
static void
-perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
+perf_event_exit_event(struct perf_event *event,
+ struct perf_event_context *ctx, bool revoke)
{
struct perf_event *parent_event = event->parent;
- unsigned long detach_flags = 0;
+ unsigned long detach_flags = DETACH_EXIT;
if (parent_event) {
/*
@@ -13305,16 +13485,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- detach_flags = DETACH_GROUP | DETACH_CHILD;
+ detach_flags |= DETACH_GROUP | DETACH_CHILD;
mutex_lock(&parent_event->child_mutex);
}
- perf_remove_from_context(event, detach_flags);
+ if (revoke)
+ detach_flags |= DETACH_GROUP | DETACH_REVOKE;
- raw_spin_lock_irq(&ctx->lock);
- if (event->state > PERF_EVENT_STATE_EXIT)
- perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, detach_flags);
/*
* Child events can be freed.
@@ -13390,7 +13568,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
perf_event_task(child, child_ctx, 0);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- perf_event_exit_event(child_event, child_ctx);
+ perf_event_exit_event(child_event, child_ctx, false);
mutex_unlock(&child_ctx->mutex);