On Fri, Jan 26, 2024, Xiong Zhang wrote:
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 683dc086ef10..59471eeec7e4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -3803,6 +3803,8 @@ static inline void group_update_userpage(struct perf_event *group_event)
>  		event_update_userpage(event);
>  }
>  
> +static DEFINE_PER_CPU(bool, __perf_force_exclude_guest);
> +
>  static int merge_sched_in(struct perf_event *event, void *data)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> @@ -3814,6 +3816,14 @@ static int merge_sched_in(struct perf_event *event, void *data)
>  	if (!event_filter_match(event))
>  		return 0;
>  
> +	/*
> +	 * The __perf_force_exclude_guest indicates entering the guest.
> +	 * No events of the passthrough PMU should be scheduled.
> +	 */
> +	if (__this_cpu_read(__perf_force_exclude_guest) &&
> +	    has_vpmu_passthrough_cap(event->pmu))

As mentioned in the previous reply, I think perf should WARN and reject any
attempt to trigger a "passthrough" context switch if such a switch isn't
supported by perf, not silently let it go through and then skip things later.

> +		return 0;
> +
>  	if (group_can_go_on(event, *can_add_hw)) {
>  		if (!group_sched_in(event, ctx))
>  			list_add_tail(&event->active_list, get_event_list(event));

...

> +/*
> + * When a guest enters, force all active events of the PMU, which supports
> + * the VPMU_PASSTHROUGH feature, to be scheduled out. The events of other
> + * PMUs, such as uncore PMU, should not be impacted. The guest can
> + * temporarily own all counters of the PMU.
> + * During the period, all the creation of the new event of the PMU with
> + * !exclude_guest are error out.
> + */
> +void perf_guest_enter(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	if (__this_cpu_read(__perf_force_exclude_guest))

This should be a WARN_ON_ONCE, no?

> +		return;
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	perf_force_exclude_guest_enter(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_force_exclude_guest_enter(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +
> +	__this_cpu_write(__perf_force_exclude_guest, true);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_enter);
> +
> +static void perf_force_exclude_guest_exit(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +	struct pmu *pmu;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		pmu = pmu_ctx->pmu;
> +		if (!has_vpmu_passthrough_cap(pmu))
> +			continue;

I don't see how we can sanely support a CPU that doesn't support writable
PERF_GLOBAL_STATUS across all PMUs.

> +
> +		perf_pmu_disable(pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	if (!__this_cpu_read(__perf_force_exclude_guest))

WARN_ON_ONCE here too?
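E.g. something like this (completely untested, and reusing the names this
patch proposes):

	if (WARN_ON_ONCE(!__this_cpu_read(__perf_force_exclude_guest)))
		return;

so that an unbalanced perf_guest_exit() screams instead of silently bailing.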
> +		return;
> +
> +	__this_cpu_write(__perf_force_exclude_guest, false);
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	perf_force_exclude_guest_exit(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_force_exclude_guest_exit(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +EXPORT_SYMBOL_GPL(perf_guest_exit);
> +
> +static inline int perf_force_exclude_guest_check(struct perf_event *event,
> +						 int cpu, struct task_struct *task)
> +{
> +	bool *force_exclude_guest = NULL;
> +
> +	if (!has_vpmu_passthrough_cap(event->pmu))
> +		return 0;
> +
> +	if (event->attr.exclude_guest)
> +		return 0;
> +
> +	if (cpu != -1) {
> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, cpu);
> +	} else if (task && (task->flags & PF_VCPU)) {
> +		/*
> +		 * Just need to check the running CPU in the event creation. If the
> +		 * task is moved to another CPU which supports the force_exclude_guest.
> +		 * The event will filtered out and be moved to the error stage. See
> +		 * merge_sched_in().
> +		 */
> +		force_exclude_guest = per_cpu_ptr(&__perf_force_exclude_guest, task_cpu(task));
> +	}

These checks are extremely racy; I don't see how this can possibly do the
right thing.  PF_VCPU isn't a "this is a vCPU task", it's a "this task is
about to do VM-Enter, or just took a VM-Exit" (the "I'm a virtual CPU"
comment in include/linux/sched.h is wildly misleading, as it's _only_ valid
when accounting time slices).

Digging deeper, I think __perf_force_exclude_guest has similar problems,
e.g. perf_event_create_kernel_counter() calls perf_event_alloc() before
acquiring the per-CPU context mutex.

> +	if (force_exclude_guest && *force_exclude_guest)
> +		return -EBUSY;
> +	return 0;
> +}
> +
>  /*
>   * Holding the top-level event's child_mutex means that any
>   * descendant process that has inherited this event will block
> @@ -11973,6 +12142,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  		goto err_ns;
>  	}
>  
> +	if (perf_force_exclude_guest_check(event, cpu, task)) {

This should be:

	err = perf_force_exclude_guest_check(event, cpu, task);
	if (err)
		goto err_pmu;

i.e. shouldn't effectively ignore/override the return result.

> +		err = -EBUSY;
> +		goto err_pmu;
> +	}
> +
>  	/*
>  	 * Disallow uncore-task events. Similarly, disallow uncore-cgroup
>  	 * events (they don't make sense as the cgroup will be different
> --
> 2.34.1
> 
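To restate my first comment in code form, the rough shape I have in mind for
the entry side is something like this (completely untested, names taken from
this patch, and glossing over how the error gets plumbed back to KVM):

	int perf_guest_enter(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		lockdep_assert_irqs_disabled();

		/* Reject a nested/unsupported switch, don't skip events later. */
		if (WARN_ON_ONCE(__this_cpu_read(__perf_force_exclude_guest)))
			return -EBUSY;

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_force_exclude_guest_enter(&cpuctx->ctx);
		if (cpuctx->task_ctx)
			perf_force_exclude_guest_enter(cpuctx->task_ctx);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

		__this_cpu_write(__perf_force_exclude_guest, true);
		return 0;
	}

i.e. let KVM refuse to enter the guest instead of running with a
half-switched PMU.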