Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> --- include/linux/file.h | 3 kernel/events/core.c | 483 +++++++++++++++++++++++---------------------------- 2 files changed, 222 insertions(+), 264 deletions(-) --- a/include/linux/file.h +++ b/include/linux/file.h @@ -84,6 +84,7 @@ static inline void fdput_pos(struct fd f } DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd) +DEFINE_FREE(fdput, struct fd, fdput(_T)) extern int f_dupfd(unsigned int from, struct file *file, unsigned flags); extern int replace_fd(unsigned fd, struct file *file, unsigned flags); @@ -96,6 +97,8 @@ extern void put_unused_fd(unsigned int f DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +#define no_free_fd(fd) ({ int __fd = (fd); (fd) = -1; __fd; }) + extern void fd_install(unsigned int fd, struct file *file); extern int __receive_fd(struct file *file, int __user *ufd, --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1163,9 +1163,10 @@ static void perf_assert_pmu_disabled(str WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } -static void get_ctx(struct perf_event_context *ctx) +static struct perf_event_context *get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); + return ctx; } static void *alloc_task_ctx_data(struct pmu *pmu) @@ -4672,9 +4673,6 @@ find_lively_task_by_vpid(pid_t vpid) get_task_struct(task); rcu_read_unlock(); - if (!task) - return ERR_PTR(-ESRCH); - return task; } @@ -4754,6 +4752,11 @@ find_get_context(struct task_struct *tas return ERR_PTR(err); } +DEFINE_CLASS(find_get_ctx, struct perf_event_context *, + if (!IS_ERR_OR_NULL(_T)) { perf_unpin_context(_T); put_ctx(_T); }, + find_get_context(task, event), + struct task_struct *task, struct perf_event *event) + /* * Returns a matching perf_event_pmu_context with elevated refcount or NULL. 
*/ @@ -4836,9 +4839,10 @@ find_get_pmu_context(struct pmu *pmu, st return epc; } -static void get_pmu_ctx(struct perf_event_pmu_context *epc) +static struct perf_event_pmu_context *get_pmu_ctx(struct perf_event_pmu_context *epc) { WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); + return epc; } static void free_epc_rcu(struct rcu_head *head) @@ -4881,6 +4885,8 @@ static void put_pmu_ctx(struct perf_even call_rcu(&epc->rcu_head, free_epc_rcu); } +DEFINE_FREE(put_pmu_ctx, struct perf_event_pmu_context *, if (_T) put_pmu_ctx(_T)) + static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) @@ -5190,6 +5196,8 @@ static void free_event(struct perf_event _free_event(event); } +DEFINE_FREE(free_event, struct perf_event *, if (!IS_ERR_OR_NULL(_T)) free_event(_T)) + /* * Remove user event from the owner task. */ @@ -5748,19 +5756,6 @@ EXPORT_SYMBOL_GPL(perf_event_period); static const struct file_operations perf_fops; -static inline struct fd perf_fdget(int fd) -{ - struct fd f = fdget(fd); - if (!f.file) - return fdnull; - - if (f.file->f_op != &perf_fops) { - fdput(f); - return fdnull; - } - return f; -} - static inline bool is_perf_fd(struct fd fd) { return fd.file && fd.file->f_op == &perf_fops; @@ -12189,19 +12184,16 @@ SYSCALL_DEFINE5(perf_event_open, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; - struct perf_event_pmu_context *pmu_ctx; - struct perf_event *event, *sibling; + struct perf_event *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx; struct file *event_file = NULL; - struct fd group = {NULL, 0}; - struct task_struct *task = NULL; + struct task_struct *task __free(put_task) = NULL; + struct fd group __free(fdput) = fdnull; struct pmu *pmu; - int event_fd; int move_group = 0; - int err; int f_flags = O_RDWR; int cgroup_fd = -1; + int err; /* for future expandability... 
*/ if (flags & ~PERF_FLAG_ALL) @@ -12261,16 +12253,14 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & PERF_FLAG_FD_CLOEXEC) f_flags |= O_CLOEXEC; - event_fd = get_unused_fd_flags(f_flags); - if (event_fd < 0) - return event_fd; + CLASS(get_unused_fd, fd)(f_flags); + if (fd < 0) + return fd; if (group_fd != -1) { - group = perf_fdget(group_fd); - if (!group.file) { - err = -EBADF; - goto err_fd; - } + group = fdget(group_fd); + if (!is_perf_fd(group)) + return -EBADF; group_leader = group.file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; @@ -12280,33 +12270,26 @@ SYSCALL_DEFINE5(perf_event_open, if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) { - err = PTR_ERR(task); - goto err_group_fd; - } + if (!task) + return -ESRCH; } if (task && group_leader && - group_leader->attr.inherit != attr.inherit) { - err = -EINVAL; - goto err_task; - } + group_leader->attr.inherit != attr.inherit) + return -EINVAL; if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, + struct perf_event *event __free(free_event) = + perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL, cgroup_fd); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_task; - } + if (IS_ERR(event)) + return PTR_ERR(event); if (is_sampling_event(event)) { - if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -EOPNOTSUPP; - goto err_alloc; - } + if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) + return -EOPNOTSUPP; } /* @@ -12318,266 +12301,238 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.use_clockid) { err = perf_event_set_clock(event, attr.clockid); if (err) - goto err_alloc; + return err; } if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (task) { - err = down_read_interruptible(&task->signal->exec_update_lock); - if (err) - goto err_alloc; + do { + struct rw_semaphore 
*exec_update_lock __free(up_read) = NULL; + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + return err; + + exec_update_lock = &task->signal->exec_update_lock; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + if (!perf_check_permission(&attr, task)) + return -EACCES; + } /* - * We must hold exec_update_lock across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). + * Get the target context (task or percpu): */ - err = -EACCES; - if (!perf_check_permission(&attr, task)) - goto err_cred; - } + CLASS(find_get_ctx, ctx)(task, event); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(task, event); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_cred; - } - - mutex_lock(&ctx->mutex); + guard(mutex)(&ctx->mutex); - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; - goto err_locked; - } + if (ctx->task == TASK_TOMBSTONE) + return -ESRCH; - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + if (!task) { + /* + * Check if the @cpu we're creating an event for is + * online. + * + * We use the perf_cpu_context::ctx::mutex to serialize + * against the hotplug notifiers. See + * perf_event_{init,exit}_cpu(). 
+ */ + struct perf_cpu_context *cpuctx = + per_cpu_ptr(&perf_cpu_context, event->cpu); - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; + if (!cpuctx->online) + return -ENODEV; } - } - if (group_leader) { - err = -EINVAL; + if (group_leader) { + err = -EINVAL; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_locked; - - /* All events in a group should have the same clock */ - if (group_leader->clock != event->clock) - goto err_locked; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling) + */ + if (group_leader->group_leader != group_leader) + return -EINVAL; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_locked; + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + return -EINVAL; - /* - * Make sure we're both on the same context; either task or cpu. - */ - if (group_leader->ctx != ctx) - goto err_locked; + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) + return -EINVAL; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_locked; + /* + * Make sure we're both on the same context; either + * task or cpu. + */ + if (group_leader->ctx != ctx) + return -EINVAL; - if (is_software_event(event) && - !in_software_context(group_leader)) { /* - * If the event is a sw event, but the group_leader - * is on hw context. - * - * Allow the addition of software events to hw - * groups, this is safe because software events - * never fail to schedule. 
- * - * Note the comment that goes with struct - * perf_event_pmu_context. + * Only a group leader can be exclusive or pinned */ - pmu = group_leader->pmu_ctx->pmu; - } else if (!is_software_event(event)) { - if (is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (attr.exclusive || attr.pinned) + return -EINVAL; + + if (is_software_event(event) && + !in_software_context(group_leader)) { + /* + * If the event is a sw event, but the + * group_leader is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. + */ + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software + * group, and we try to add a hardware + * event, move the whole group to the + * hardware context. + */ + move_group = 1; + } + /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. + * Don't allow group of multiple hw events from + * different pmus */ - move_group = 1; + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) + return -EINVAL; } + } + + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + struct perf_event_pmu_context *pmu_ctx __free(put_pmu_ctx) = + find_get_pmu_context(pmu, ctx, event); + if (!pmu_ctx) + return -ENOMEM; - /* Don't allow group of multiple hw events from different pmus */ - if (!in_software_context(group_leader) && - group_leader->pmu_ctx->pmu != pmu) - goto err_locked; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + return err; } - } - /* - * Now that we're certain of the pmu; find the pmu_ctx. 
- */ - pmu_ctx = find_get_pmu_context(pmu, ctx, event); - if (IS_ERR(pmu_ctx)) { - err = PTR_ERR(pmu_ctx); - goto err_locked; - } - event->pmu_ctx = pmu_ctx; + if (!perf_event_validate_size(event)) + return -E2BIG; - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } + if (perf_need_aux_event(event) && + !perf_get_aux_event(event, group_leader)) + return -EINVAL; - if (!perf_event_validate_size(event)) { - err = -E2BIG; - goto err_context; - } + /* + * Must be under the same ctx::mutex as perf_install_in_context(), + * because we need to serialize with concurrent event creation. + */ + if (!exclusive_event_installable(event, ctx)) + return -EBUSY; - if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { - err = -EINVAL; - goto err_context; - } + WARN_ON_ONCE(ctx->parent_ctx); - /* - * Must be under the same ctx::mutex as perf_install_in_context(), - * because we need to serialize with concurrent event creation. - */ - if (!exclusive_event_installable(event, ctx)) { - err = -EBUSY; - goto err_context; - } + event_file = anon_inode_getfile("[perf_event]", &perf_fops, + event, f_flags); + if (IS_ERR(event_file)) + return PTR_ERR(event_file); - WARN_ON_ONCE(ctx->parent_ctx); + /* + * The event is now owned by event_file and will be cleaned up + * through perf_fops::release(). Similarly the fd will be linked + * to event_file and should not be put_unused_fd(). + */ - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } + /* + * This is the point on no return; we cannot fail hereafter. This is + * where we start modifying current state. + */ - /* - * This is the point on no return; we cannot fail hereafter. This is - * where we start modifying current state. 
- */ + if (move_group) { + /* + * Moves the events from one pmu to another, hence we need + * to update the pmu_ctx, but through all this the ctx + * stays the same. + */ + perf_remove_from_context(group_leader, 0); + put_pmu_ctx(group_leader->pmu_ctx); - if (move_group) { - perf_remove_from_context(group_leader, 0); - put_pmu_ctx(group_leader->pmu_ctx); + for_each_sibling_event(sibling, group_leader) { + perf_remove_from_context(sibling, 0); + put_pmu_ctx(sibling->pmu_ctx); + } - for_each_sibling_event(sibling, group_leader) { - perf_remove_from_context(sibling, 0); - put_pmu_ctx(sibling->pmu_ctx); - } + /* + * Install the group siblings before the group leader. + * + * Because a group leader will try and install the entire group + * (through the sibling list, which is still in-tact), we can + * end up with siblings installed in the wrong context. + * + * By installing siblings first we NO-OP because they're not + * reachable through the group lists. + */ + for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = get_pmu_ctx(pmu_ctx); + perf_event__state_init(sibling); + perf_install_in_context(ctx, sibling, sibling->cpu); + } - /* - * Install the group siblings before the group leader. - * - * Because a group leader will try and install the entire group - * (through the sibling list, which is still in-tact), we can - * end up with siblings installed in the wrong context. - * - * By installing siblings first we NO-OP because they're not - * reachable through the group lists. - */ - for_each_sibling_event(sibling, group_leader) { - sibling->pmu_ctx = pmu_ctx; - get_pmu_ctx(pmu_ctx); - perf_event__state_init(sibling); - perf_install_in_context(ctx, sibling, sibling->cpu); + /* + * Removing from the context ends up with disabled + * event. What we want here is event in the initial + * startup state, ready to be add into new context. 
+ */ + group_leader->pmu_ctx = get_pmu_ctx(pmu_ctx); + perf_event__state_init(group_leader); + perf_install_in_context(ctx, group_leader, group_leader->cpu); } /* - * Removing from the context ends up with disabled - * event. What we want here is event in the initial - * startup state, ready to be add into new context. + * Precalculate sample_data sizes; do while holding ctx::mutex such + * that we're serialized against further additions and before + * perf_install_in_context() which is the point the event is active and + * can use these values. */ - group_leader->pmu_ctx = pmu_ctx; - get_pmu_ctx(pmu_ctx); - perf_event__state_init(group_leader); - perf_install_in_context(ctx, group_leader, group_leader->cpu); - } + perf_event__header_size(event); + perf_event__id_header_size(event); - /* - * Precalculate sample_data sizes; do while holding ctx::mutex such - * that we're serialized against further additions and before - * perf_install_in_context() which is the point the event is active and - * can use these values. - */ - perf_event__header_size(event); - perf_event__id_header_size(event); + event->owner = current; - event->owner = current; + event->pmu_ctx = no_free_ptr(pmu_ctx); + perf_install_in_context(get_ctx(ctx), event, event->cpu); + } while (0); - perf_install_in_context(ctx, event, event->cpu); - perf_unpin_context(ctx); + scoped_guard (mutex, &current->perf_event_mutex) + list_add_tail(&event->owner_entry, &current->perf_event_list); - mutex_unlock(&ctx->mutex); + fd_install(fd, event_file); - if (task) { - up_read(&task->signal->exec_update_lock); - put_task_struct(task); - } - - mutex_lock(&current->perf_event_mutex); - list_add_tail(&event->owner_entry, &current->perf_event_list); - mutex_unlock(&current->perf_event_mutex); - - /* - * Drop the reference on the group_event after placing the - * new event on the sibling_list. This ensures destruction - * of the group leader will find the pointer to itself in - * perf_group_detach(). 
- */ - fdput(group); - fd_install(event_fd, event_file); - return event_fd; - -err_context: - put_pmu_ctx(event->pmu_ctx); - event->pmu_ctx = NULL; /* _free_event() */ -err_locked: - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); -err_cred: - if (task) - up_read(&task->signal->exec_update_lock); -err_alloc: - free_event(event); -err_task: - if (task) - put_task_struct(task); -err_group_fd: - fdput(group); -err_fd: - put_unused_fd(event_fd); - return err; + no_free_ptr(event); + return no_free_fd(fd); } /**