On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:
> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>
> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct perf_event *event, *tmp;
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +
> +		/*
> +		 * All active events must be exclude_guest events.
> +		 * See perf_get_mediated_pmu().
> +		 * Unconditionally remove all active events.
> +		 */
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
> +			group_sched_out(event, pmu_ctx->ctx);
> +
> +		pmu_ctx->rotate_necessary = 0;
> +
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +/* When entering a guest, schedule out all exclude_guest events. */
> +void perf_guest_enter(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
> +
> +	__this_cpu_write(perf_in_guest, true);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}
> +
> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
> +{
> +	struct perf_event_pmu_context *pmu_ctx;
> +
> +	update_context_time(ctx);
> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
> +		struct pmu *pmu = pmu_ctx->pmu;
> +
> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +			continue;
> +
> +		perf_pmu_disable(pmu);
> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +		perf_pmu_enable(pmu);
> +	}
> +}
> +
> +void perf_guest_exit(void)
> +{
> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
> +
> +	lockdep_assert_irqs_disabled();
> +
> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
> +
> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +		return;
> +	}
> +
> +	__this_cpu_write(perf_in_guest, false);
> +
> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
> +	if (cpuctx->task_ctx)
> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
> +
> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
> +}

Bah, this is a ton of copy-paste from the normal scheduling code with
random changes. Why?

Why can't this use ctx_sched_{in,out}()? Surely the whole
CAP_PASSTHROUGH thing is but a flag away.
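
Something along these lines, say. This is only a rough sketch, not a
patch: the EVENT_GUEST bit does not exist today, and the assumption is
that teaching ctx_sched_out()/ctx_sched_in() about such a bit would
restrict both paths to PMUs with PERF_PMU_CAP_PASSTHROUGH_VPMU, so the
guest enter/exit hooks shrink to:

	/*
	 * Sketch only. EVENT_GUEST is a hypothetical event_type_t bit;
	 * the assumed semantics are that ctx_sched_out()/ctx_sched_in()
	 * then only touch PMUs with PERF_PMU_CAP_PASSTHROUGH_VPMU.
	 */
	void perf_guest_enter(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		lockdep_assert_irqs_disabled();

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);

		if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
			goto unlock;

		/* Only exclude_guest events can be active on these PMUs. */
		ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
		if (cpuctx->task_ctx)
			ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);

		__this_cpu_write(perf_in_guest, true);

	unlock:
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	void perf_guest_exit(void)
	{
		struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

		lockdep_assert_irqs_disabled();

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);

		if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
			goto unlock;

		__this_cpu_write(perf_in_guest, false);

		/* Reschedule the events we kicked out on guest entry. */
		ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
		if (cpuctx->task_ctx)
			ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);

	unlock:
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

That way the time accounting, the pinned/flexible ordering and the
rotate_necessary handling all stay in the one place that already gets
them right, instead of being open-coded again here.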