On Wed, Jun 12, 2024 at 09:38:06AM -0400, Liang, Kan wrote:
> On 2024-06-12 7:17 a.m., Peter Zijlstra wrote:
> > On Tue, Jun 11, 2024 at 09:27:46AM -0400, Liang, Kan wrote:
> >> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> >> index dd4920bf3d1b..68c8b93c4e5c 100644
> >> --- a/include/linux/perf_event.h
> >> +++ b/include/linux/perf_event.h
> >> @@ -945,6 +945,7 @@ struct perf_event_context {
> >>  	u64				time;
> >>  	u64				timestamp;
> >>  	u64				timeoffset;
> >> +	u64				timeguest;
> >>  
> >>  	/*
> >>  	 * These fields let us detect when two contexts have both
> > 
> >> @@ -651,10 +653,26 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin
> >>  
> >>  static void perf_event_update_time(struct perf_event *event)
> >>  {
> >> -	u64 now = perf_event_time(event);
> >> +	u64 now;
> >> +
> >> +	/* Never count the time of an active guest into an exclude_guest event. */
> >> +	if (event->ctx->timeguest &&
> >> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
> >> +		/*
> >> +		 * If a guest is running, use the timestamp while entering the guest.
> >> +		 * If the guest is leaving, reset the event timestamp.
> >> +		 */
> >> +		if (__this_cpu_read(perf_in_guest))
> >> +			event->tstamp = event->ctx->timeguest;
> >> +		else
> >> +			event->tstamp = event->ctx->time;
> >> +		return;
> >> +	}
> >>  
> >> +	now = perf_event_time(event);
> >>  	__perf_update_times(event, now, &event->total_time_enabled,
> >>  			    &event->total_time_running);
> >> +
> >>  	event->tstamp = now;
> >>  }
> > 
> > So I really don't like this much, 
> 
> An alternative way I can imagine is to maintain a dedicated timeline
> for the PASSTHROUGH PMUs. For that, we would probably need two new
> timelines: one for the normal events and one for the cgroup events.
> That sounds too complex.

I'm afraid we might have to. Specifically, the below:

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 019c237dd456..6c46699c6752 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -665,7 +665,7 @@ static void perf_event_update_time(struct perf_event *event)
>  		if (__this_cpu_read(perf_in_guest))
>  			event->tstamp = event->ctx->timeguest;
>  		else
> -			event->tstamp = event->ctx->time;
> +			event->tstamp = perf_event_time(event);
>  		return;
>  	}

is still broken in that it (ab)uses event state to track time, and this
goes sideways in case of event overcommit, because then
ctx_sched_{out,in}() will not visit all events and their timestamps go
stale.

We've run into that before. Time-keeping really should be per context
or we'll get a ton of pain.

I've ended up with the (uncompiled) below. Yes, it is unfortunate, but
aside from a few cleanups (we could introduce a struct time_ctx { u64
time, stamp, offset; } and fold a bunch of code), this is more or less
the best we can do, I'm afraid.
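
Something like this (uncompiled sketch; the helper name
update_time_ctx() is made up here, only the struct time_ctx shape comes
from the above):

	struct time_ctx {
		u64 time;	/* accumulated time, advanced while active */
		u64 stamp;	/* timestamp of the last advance */
		u64 offset;	/* time - stamp; lets NMI readers compute now + offset */
	};

	static void update_time_ctx(struct time_ctx *T, u64 now, bool adv)
	{
		if (adv)
			T->time += now - T->stamp;
		T->stamp = now;
		WRITE_ONCE(T->offset, T->time - T->stamp);
	}

__update_context_time(), __update_context_guest_time() and the two
cgroup helpers below would then all become calls on their respective
struct time_ctx instance.
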
---
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -947,7 +947,9 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
-	u64				timeguest;
+	u64				guest_time;
+	u64				guest_timestamp;
+	u64				guest_timeoffset;
 
 	/*
 	 * These fields let us detect when two contexts have both
@@ -1043,6 +1045,9 @@ struct perf_cgroup_info {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				guest_time;
+	u64				guest_timestamp;
+	u64				guest_timeoffset;
 	int				active;
 };
 
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -638,26 +638,9 @@ __perf_update_times(struct perf_event *e
 
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now;
-
-	/* Never count the time of an active guest into an exclude_guest event. */
-	if (event->ctx->timeguest &&
-	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
-		/*
-		 * If a guest is running, use the timestamp while entering the guest.
-		 * If the guest is leaving, reset the event timestamp.
-		 */
-		if (__this_cpu_read(perf_in_guest))
-			event->tstamp = event->ctx->timeguest;
-		else
-			event->tstamp = event->ctx->time;
-		return;
-	}
-
-	now = perf_event_time(event);
+	u64 now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 			    &event->total_time_running);
-
 	event->tstamp = now;
 }
 
@@ -780,19 +763,33 @@ static inline int is_cgroup_event(struct
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	struct perf_cgroup_info *t;
+	u64 time;
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	return t->time;
+	time = t->time;
+	if (event->attr.exclude_guest)
+		time -= t->guest_time;
+	return time;
 }
 
 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
 {
 	struct perf_cgroup_info *t;
+	u64 time, guest_time;
 
 	t = per_cpu_ptr(event->cgrp->info, event->cpu);
-	if (!__load_acquire(&t->active))
-		return t->time;
-	now += READ_ONCE(t->timeoffset);
-	return now;
+	if (!__load_acquire(&t->active)) {
+		time = t->time;
+		if (event->attr.exclude_guest)
+			time -= t->guest_time;
+		return time;
+	}
+
+	time = now + READ_ONCE(t->timeoffset);
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
+		guest_time = now + READ_ONCE(t->guest_timeoffset);
+		time -= guest_time;
+	}
+	return time;
 }
@@ -807,6 +804,17 @@ static inline void __update_cgrp_time(st
 	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
 }
 
+static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+	if (adv)
+		info->guest_time += now - info->guest_timestamp;
+	info->guest_timestamp = now;
+	/*
+	 * see update_context_time()
+	 */
+	WRITE_ONCE(info->guest_timeoffset, info->guest_time - info->guest_timestamp);
+}
+
 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 						bool final)
 {
 	struct perf_cgroup *cgrp = cpuctx->cgrp;
@@ -821,6 +829,8 @@ static inline void update_cgrp_time_from
 
 	info = this_cpu_ptr(cgrp->info);
 	__update_cgrp_time(info, now, true);
+	if (__this_cpu_read(perf_in_guest))
+		__update_cgrp_guest_time(info, now, true);
 	if (final)
 		__store_release(&info->active, 0);
 }
@@ -1501,14 +1511,39 @@ static void __update_context_time(struct
 	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
 }
 
+static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
+{
+	u64 now = ctx->timestamp; /* must be called after __update_context_time(); */
+
+	lockdep_assert_held(&ctx->lock);
+
+	if (adv)
+		ctx->guest_time += now - ctx->guest_timestamp;
+	ctx->guest_timestamp = now;
+
+	/*
+	 * The above: time' = time + (now - timestamp), can be re-arranged
+	 * into: time' = now + (time - timestamp), which gives a single value
+	 * offset to compute future time without locks on.
+	 *
+	 * See perf_event_time_now(), which can be used from NMI context where
+	 * it's (obviously) not possible to acquire ctx->lock in order to read
+	 * both the above values in a consistent manner.
+	 */
+	WRITE_ONCE(ctx->guest_timeoffset, ctx->guest_time - ctx->guest_timestamp);
+}
+
 static void update_context_time(struct perf_event_context *ctx)
 {
 	__update_context_time(ctx, true);
+	if (__this_cpu_read(perf_in_guest))
+		__update_context_guest_time(ctx, true);
 }
 
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+	u64 time;
 
 	if (unlikely(!ctx))
 		return 0;
@@ -1516,12 +1551,17 @@ static u64 perf_event_time(struct perf_e
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time(event);
 
-	return ctx->time;
+	time = ctx->time;
+	if (event->attr.exclude_guest)
+		time -= ctx->guest_time;
+
+	return time;
 }
 
 static u64 perf_event_time_now(struct perf_event *event, u64 now)
 {
 	struct perf_event_context *ctx = event->ctx;
+	u64 time, guest_time;
 
 	if (unlikely(!ctx))
 		return 0;
@@ -1529,11 +1569,19 @@ static u64 perf_event_time_now(struct pe
 	if (is_cgroup_event(event))
 		return perf_cgroup_event_time_now(event, now);
 
-	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
-		return ctx->time;
+	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) {
+		time = ctx->time;
+		if (event->attr.exclude_guest)
+			time -= ctx->guest_time;
+		return time;
+	}
 
-	now += READ_ONCE(ctx->timeoffset);
-	return now;
+	time = now + READ_ONCE(ctx->timeoffset);
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
+		guest_time = now + READ_ONCE(ctx->guest_timeoffset);
+		time -= guest_time;
+	}
+	return time;
 }
 
 static enum event_type_t get_event_type(struct perf_event *event)
@@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
 	 * would only update time for the pinned events.
 	 */
 	if (is_active & EVENT_TIME) {
+		bool stop;
+
+		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
+		       ctx == &cpuctx->ctx;
+
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
-		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+		update_cgrp_time_from_cpuctx(cpuctx, stop);
 		/*
 		 * CPU-release for the below ->is_active store,
 		 * see __load_acquire() in perf_event_time_now()
@@ -3366,8 +3419,12 @@ ctx_sched_out(struct perf_event_context
 		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
 		 */
 		is_active = EVENT_ALL;
-	} else
+		__update_context_guest_time(ctx, false);
+		perf_cgroup_set_guest_timestamp(cpuctx);
+		barrier();
+	} else {
 		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3866,10 +3923,15 @@ static inline void group_update_userpage
 		event_update_userpage(event);
 }
 
+struct merge_sched_data {
+	int can_add_hw;
+	enum event_type_t event_type;
+};
+
 static int merge_sched_in(struct perf_event *event, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
-	int *can_add_hw = data;
+	struct merge_sched_data *msd = data;
 
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
@@ -3881,18 +3943,18 @@ static int merge_sched_in(struct perf_ev
 	 * Don't schedule in any exclude_guest events of PMU with
 	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
 	 */
-	if (__this_cpu_read(perf_in_guest) &&
-	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
-	    event->attr.exclude_guest)
+	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest) &&
+	    (event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
+	    !(msd->event_type & EVENT_GUEST))
 		return 0;
 
-	if (group_can_go_on(event, *can_add_hw)) {
+	if (group_can_go_on(event, msd->can_add_hw)) {
 		if (!group_sched_in(event, ctx))
 			list_add_tail(&event->active_list, get_event_list(event));
 	}
 
 	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		*can_add_hw = 0;
+		msd->can_add_hw = 0;
 		if (event->attr.pinned) {
 			perf_cgroup_event_disable(event, ctx);
 			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
@@ -3911,11 +3973,15 @@ static int merge_sched_in(struct perf_ev
 
 static void pmu_groups_sched_in(struct perf_event_context *ctx,
 				struct perf_event_groups *groups,
-				struct pmu *pmu)
+				struct pmu *pmu,
+				enum event_type_t event_type)
 {
-	int can_add_hw = 1;
+	struct merge_sched_data msd = {
+		.can_add_hw = 1,
+		.event_type = event_type,
+	};
 	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
-			   merge_sched_in, &can_add_hw);
+			   merge_sched_in, &msd);
 }
 
 static void ctx_groups_sched_in(struct perf_event_context *ctx,
@@ -3927,14 +3993,14 @@ static void ctx_groups_sched_in(struct p
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
 			continue;
-		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
+		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
 	}
 }
 
 static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
 			       struct pmu *pmu)
 {
-	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
 }
 
 static void
@@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
 		return;
 
 	if (!(is_active & EVENT_TIME)) {
+		/* EVENT_TIME should be active while the guest runs */
+		WARN_ON_ONCE(event_type & EVENT_GUEST);
 		/* start ctx time */
 		__update_context_time(ctx, false);
 		perf_cgroup_set_timestamp(cpuctx);
@@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
 		 * the exclude_guest events.
 		 */
 		update_context_time(ctx);
-	} else
+		update_cgrp_time_from_cpuctx(cpuctx, false);
+		barrier();
+	} else {
 		is_active ^= ctx->is_active; /* changed bits */
+	}
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -5832,25 +5903,20 @@ void perf_guest_enter(void)
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
-	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		return;
-	}
+	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
+		goto unlock;
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
 	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
-	/* Set the guest start time */
-	cpuctx->ctx.timeguest = cpuctx->ctx.time;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
 		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
-		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
 
 	__this_cpu_write(perf_in_guest, true);
-
+unlock:
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
@@ -5862,24 +5928,21 @@ void perf_guest_exit(void)
 
 	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 
-	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
-		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		return;
-	}
-
-	__this_cpu_write(perf_in_guest, false);
+	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
+		goto unlock;
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
 	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
-	cpuctx->ctx.timeguest = 0;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
 		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
-		cpuctx->task_ctx->timeguest = 0;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
+
+	__this_cpu_write(perf_in_guest, false);
+
+unlock:
 	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
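
Just to spell out the arithmetic the above implements: the context
clock always advances, the guest clock only advances between
perf_guest_enter() and perf_guest_exit(), and an exclude_guest event
reads the difference of the two. A standalone userspace sketch of just
that arithmetic (illustration only, not kernel code; all names are
made up):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct two_clocks {
		uint64_t time, timestamp;		/* context clock */
		uint64_t guest_time, guest_timestamp;	/* guest clock */
		bool in_guest;
	};

	static void clocks_update(struct two_clocks *c, uint64_t now)
	{
		c->time += now - c->timestamp;
		c->timestamp = now;
		if (c->in_guest) {
			c->guest_time += now - c->guest_timestamp;
			c->guest_timestamp = now;
		}
	}

	static void guest_enter(struct two_clocks *c, uint64_t now)
	{
		clocks_update(c, now);
		c->guest_timestamp = now;	/* guest clock starts ticking */
		c->in_guest = true;
	}

	static void guest_exit(struct two_clocks *c, uint64_t now)
	{
		clocks_update(c, now);		/* fold the guest run into guest_time */
		c->in_guest = false;
	}

	int main(void)
	{
		struct two_clocks c = { 0 };

		guest_enter(&c, 100);	/* host ran 0..100 */
		guest_exit(&c, 130);	/* guest ran 100..130 */
		clocks_update(&c, 150);	/* host ran 130..150 */

		/* an exclude_guest reader sees 150 - 30 = 120 */
		printf("total=%llu guest=%llu exclude_guest=%llu\n",
		       (unsigned long long)c.time,
		       (unsigned long long)c.guest_time,
		       (unsigned long long)(c.time - c.guest_time));
		return 0;
	}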