On 2024-05-07 4:58 a.m., Peter Zijlstra wrote:
> On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:
>
>> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>
>> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct perf_event *event, *tmp;
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +
>> +		/*
>> +		 * All active events must be exclude_guest events.
>> +		 * See perf_get_mediated_pmu().
>> +		 * Unconditionally remove all active events.
>> +		 */
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		pmu_ctx->rotate_necessary = 0;
>> +
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +/* When entering a guest, schedule out all exclude_guest events. */
>> +void perf_guest_enter(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
>> +
>> +	__this_cpu_write(perf_in_guest, true);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>> +
>> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +void perf_guest_exit(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	__this_cpu_write(perf_in_guest, false);
>> +
>> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>
> Bah, this is a ton of copy-paste from the normal scheduling code with
> random changes. Why ?
>
> Why can't this use ctx_sched_{in,out}() ? Surely the whole
> CAP_PASSTHROUGHT thing is but a flag away.
>

Not just a flag. The time has to be updated as well, since ctx->time is
shared among PMUs. Perf shouldn't stop it while other PMUs are still
running.

A timeguest will be introduced to track the start time of a guest.
The event->tstamp of an exclude_guest event should always keep
ctx->timeguest while a guest is running.
When a guest is leaving, update the event->tstamp to now, so the guest
time can be deducted.

The below patch demonstrates how the timeguest works.
(It's an incomplete patch, just to show the idea.)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 22d3e56682c9..2134e6886e22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -953,6 +953,7 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				timeguest;
 
 	/*
 	 * These fields let us detect when two contexts have both
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 14fd881e3e1d..2aed56671a24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -690,12 +690,31 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin
 	*running += delta;
 }
 
+static void perf_event_update_time_guest(struct perf_event *event)
+{
+	/*
+	 * If a guest is running, use the timestamp while entering the guest.
+	 * If the guest is leaving, reset the event timestamp.
+	 */
+	if (!__this_cpu_read(perf_in_guest))
+		event->tstamp = event->ctx->time;
+	else
+		event->tstamp = event->ctx->timeguest;
+}
+
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now = perf_event_time(event);
+	u64 now;
+
+	/* Never count the time of an active guest into an exclude_guest event. */
+	if (event->ctx->timeguest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)
+		return perf_event_update_time_guest(event);
 
+	now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 			    &event->total_time_running);
+
 	event->tstamp = now;
 }
 
@@ -3398,7 +3417,14 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 		cpuctx->task_ctx = NULL;
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3998,7 +4024,20 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -5894,14 +5933,18 @@ void perf_guest_enter(u32 guest_lvtpc)
 	}
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	/* Set the guest start time */
+	cpuctx->ctx.timeguest = cpuctx->ctx.time;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		task_ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
 
 	__this_cpu_write(perf_in_guest, true);
@@ -5925,14 +5968,17 @@ void perf_guest_exit(void)
 
 	__this_cpu_write(perf_in_guest, false);
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	cpuctx->ctx.timeguest = 0;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = 0;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
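
To make the intended accounting concrete, here is a small stand-alone
user-space sketch of the same bookkeeping (the sim_* structs and helpers
below are made up purely for illustration; they are not the kernel
interfaces). It models the point of timeguest: ctx time keeps running
across the guest, but an exclude_guest event's tstamp is pinned to the
guest entry time, so the guest interval is never folded into its running
time.

/*
 * Minimal user-space model of the timeguest idea (simplified; not kernel
 * code).
 */
#include <stdbool.h>
#include <stdio.h>

struct sim_ctx   { unsigned long long time, timeguest; };
struct sim_event { unsigned long long tstamp, total_time_running; };

static bool in_guest;

/* Fold elapsed host time into the event; skip any time spent in a guest. */
static void sim_event_update_time(struct sim_ctx *c, struct sim_event *e)
{
	if (c->timeguest) {
		/* Guest running: pin to entry time. Guest left: reset to now. */
		e->tstamp = in_guest ? c->timeguest : c->time;
		return;
	}
	e->total_time_running += c->time - e->tstamp;
	e->tstamp = c->time;
}

static void sim_guest_enter(struct sim_ctx *c, struct sim_event *e)
{
	sim_event_update_time(c, e);	/* account host time up to guest entry */
	c->timeguest = c->time;		/* remember when the guest started */
	in_guest = true;
}

static void sim_guest_exit(struct sim_ctx *c, struct sim_event *e)
{
	in_guest = false;
	sim_event_update_time(c, e);	/* reset tstamp to now, skipping the guest */
	c->timeguest = 0;
}

int main(void)
{
	struct sim_ctx c = { 0, 0 };
	struct sim_event e = { 0, 0 };

	c.time = 100; sim_guest_enter(&c, &e);		/* 100 units on the host */
	c.time = 300; sim_guest_exit(&c, &e);		/* 200 units in the guest */
	c.time = 350; sim_event_update_time(&c, &e);	/* 50 more units on the host */

	printf("%llu\n", e.total_time_running);		/* prints 150, not 350 */
	return 0;
}

Running it prints 150: the 100 units before the guest plus the 50 units
after it, with the 200 units spent inside the guest excluded.

Thanks,
Kan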