Current perf doesn't explicitly schedule out all exclude_guest events while
a guest is running. That is not a problem for the current emulated vPMU,
because perf owns all the PMU counters. It can mask a counter which is
assigned to an exclude_guest event when a guest is running (the Intel way),
or set the corresponding HOSTONLY bit in the eventsel (the AMD way), so the
counter doesn't count while the guest is running.

However, neither way works with the introduced passthrough vPMU. The guest
owns all the PMU counters while it is running, so the host must not mask any
counter: it may be in use by the guest, and the eventsel may be overwritten.
Perf should explicitly schedule out all exclude_guest events to release the
PMU resources when entering a guest, and resume counting when exiting the
guest. Expose two interfaces to KVM, so that KVM can notify perf when
entering/exiting a guest.

Introduce a new event type, EVENT_GUEST, to indicate that perf should check
and skip PMUs which don't support the passthrough mode.

It's possible that an exclude_guest event is created while a guest is
running. Such a new event should not be scheduled in either.

ctx->time is used to calculate the running/enabled time of an event, and it
is shared among PMUs. ctx_sched_in/out() with EVENT_GUEST doesn't stop
ctx->time. A new timeguest field is introduced to track the start time of a
guest. For an exclude_guest event, the time spent in guest mode is deducted.

Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
---
 include/linux/perf_event.h |   5 ++
 kernel/events/core.c       | 119 +++++++++++++++++++++++++++++++++++--
 2 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dd4920bf3d1b..68c8b93c4e5c 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -945,6 +945,7 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				timeguest;
 
 	/*
 	 * These fields let us detect when two contexts have both
@@ -1734,6 +1735,8 @@ extern int perf_event_period(struct perf_event *event, u64 value);
 extern u64 perf_event_pause(struct perf_event *event, bool reset);
 extern int perf_get_mediated_pmu(void);
 extern void perf_put_mediated_pmu(void);
+void perf_guest_enter(void);
+void perf_guest_exit(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
@@ -1826,6 +1829,8 @@ static inline int perf_get_mediated_pmu(void)
 }
 
 static inline void perf_put_mediated_pmu(void)			{ }
+static inline void perf_guest_enter(void)			{ }
+static inline void perf_guest_exit(void)			{ }
 #endif
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95d1d5a5addc..cd3a89672b14 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -376,7 +376,8 @@ enum event_type_t {
 	/* see ctx_resched() for details */
 	EVENT_CPU = 0x8,
 	EVENT_CGROUP = 0x10,
-	EVENT_FLAGS = EVENT_CGROUP,
+	EVENT_GUEST = 0x20,
+	EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly;
 
 static atomic_t nr_mediated_pmu_vms;
 static DEFINE_MUTEX(perf_mediated_pmu_mutex);
+static DEFINE_PER_CPU(bool, perf_in_guest);
 
 /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
 static inline bool is_include_guest_event(struct perf_event *event)
@@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin
 
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now = perf_event_time(event);
+	u64 now;
+
+	/* Never count the time of an active guest into an exclude_guest event. */
+	if (event->ctx->timeguest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
+		/*
+		 * If a guest is running, use the timestamp while entering the guest.
+		 * If the guest is leaving, reset the event timestamp.
+		 */
+		if (__this_cpu_read(perf_in_guest))
+			event->tstamp = event->ctx->timeguest;
+		else
+			event->tstamp = event->ctx->time;
+		return;
+	}
 
+	now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 					&event->total_time_running);
+
 	event->tstamp = now;
 }
 
@@ -706,6 +724,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
 	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
 		return true;
 
+	if ((event_type & EVENT_GUEST) &&
+	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
+		return true;
+
 	return false;
 }
 
@@ -3350,7 +3372,14 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 		cpuctx->task_ctx = NULL;
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3860,6 +3889,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
 	if (!event_filter_match(event))
 		return 0;
 
+	/*
+	 * Don't schedule in any exclude_guest events of PMU with
+	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
+	 */
+	if (__this_cpu_read(perf_in_guest) &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
+	    event->attr.exclude_guest)
+		return 0;
+
 	if (group_can_go_on(event, *can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
@@ -3941,7 +3979,20 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -5788,6 +5839,66 @@ void perf_put_mediated_pmu(void)
 }
 EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
 
+/* When entering a guest, schedule out all exclude_guest events. */
+void perf_guest_enter(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	/* Set the guest start time */
+	cpuctx->ctx.timeguest = cpuctx->ctx.time;
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	__this_cpu_write(perf_in_guest, true);
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
+void perf_guest_exit(void)
+{
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
+	lockdep_assert_irqs_disabled();
+
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
+		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		return;
+	}
+
+	__this_cpu_write(perf_in_guest, false);
+
+	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	cpuctx->ctx.timeguest = 0;
+	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
+	if (cpuctx->task_ctx) {
+		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = 0;
+		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
+	}
+
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+}
+
 /*
  * Holding the top-level event's child_mutex means that any
  * descendant process that has inherited this event will block

Thanks,
Kan
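
For reference, the intended call sequence on the hypervisor side is sketched
below. This is illustrative only and not part of the patch: example_vcpu_run()
and the PMU save/restore placeholders are assumptions about how a KVM-style
caller would bracket VM entry/exit; only perf_guest_enter()/perf_guest_exit()
come from this patch.

#include <linux/lockdep.h>
#include <linux/perf_event.h>

/* Hypothetical caller; the real KVM wiring is outside this patch. */
static void example_vcpu_run(void)
{
	/* Both hooks expect interrupts to be disabled by the caller. */
	lockdep_assert_irqs_disabled();

	/*
	 * Hand the PMU to the guest: exclude_guest events on PMUs with
	 * PERF_PMU_CAP_PASSTHROUGH_VPMU are scheduled out and stop
	 * accruing enabled/running time.
	 */
	perf_guest_enter();

	/* ... restore guest PMU state and run the vCPU here ... */

	/*
	 * Take the PMU back: the exclude_guest events are scheduled in
	 * again and resume counting from the current ctx->time.
	 */
	perf_guest_exit();
}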
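
To make the timeguest accounting concrete, here is a small userspace model of
what an exclude_guest event's running time sees across one guest section
(made-up numbers, a single event, no locking; a simplified sketch, not kernel
code):

#include <stdio.h>

int main(void)
{
	/* ctx_time stands in for ctx->time, in microseconds. */
	unsigned long long ctx_time = 0, tstamp = 0, running = 0;

	/* 100us of host time before VM entry: the event accrues it. */
	ctx_time += 100;
	running += ctx_time - tstamp;
	tstamp = ctx_time;

	/* perf_guest_enter(): timeguest snapshots ctx->time. */
	unsigned long long timeguest = ctx_time;

	/* The guest runs for 50us; ctx->time keeps advancing... */
	ctx_time += 50;
	/*
	 * ...but while perf_in_guest is set, updates pin the event's
	 * timestamp to timeguest instead of accruing the delta.
	 */
	tstamp = timeguest;

	/* perf_guest_exit(): the event restarts at the current ctx->time. */
	tstamp = ctx_time;

	/* Another 30us of host time accrues normally. */
	ctx_time += 30;
	running += ctx_time - tstamp;
	tstamp = ctx_time;

	/* Prints 130: the 50us guest section is deducted. */
	printf("total_time_running = %llu us\n", running);
	return 0;
}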