On 2024-05-07 4:58 a.m., Peter Zijlstra wrote:
> On Mon, May 06, 2024 at 05:29:32AM +0000, Mingwei Zhang wrote:
>
>> @@ -5791,6 +5801,100 @@ void perf_put_mediated_pmu(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>>
>> +static void perf_sched_out_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct perf_event *event, *tmp;
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +
>> +		/*
>> +		 * All active events must be exclude_guest events.
>> +		 * See perf_get_mediated_pmu().
>> +		 * Unconditionally remove all active events.
>> +		 */
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list)
>> +			group_sched_out(event, pmu_ctx->ctx);
>> +
>> +		pmu_ctx->rotate_necessary = 0;
>> +
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +/* When entering a guest, schedule out all exclude_guest events. */
>> +void perf_guest_enter(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	perf_sched_out_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_out_exclude_guest(cpuctx->task_ctx);
>> +
>> +	__this_cpu_write(perf_in_guest, true);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>> +
>> +static void perf_sched_in_exclude_guest(struct perf_event_context *ctx)
>> +{
>> +	struct perf_event_pmu_context *pmu_ctx;
>> +
>> +	update_context_time(ctx);
>> +	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>> +		struct pmu *pmu = pmu_ctx->pmu;
>> +
>> +		if (!(pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
>> +			continue;
>> +
>> +		perf_pmu_disable(pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);
>> +		pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +		perf_pmu_enable(pmu);
>> +	}
>> +}
>> +
>> +void perf_guest_exit(void)
>> +{
>> +	struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
>> +
>> +	lockdep_assert_irqs_disabled();
>> +
>> +	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>> +
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
>> +		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +		return;
>> +	}
>> +
>> +	__this_cpu_write(perf_in_guest, false);
>> +
>> +	perf_sched_in_exclude_guest(&cpuctx->ctx);
>> +	if (cpuctx->task_ctx)
>> +		perf_sched_in_exclude_guest(cpuctx->task_ctx);
>> +
>> +	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> +}
>
> Bah, this is a ton of copy-paste from the normal scheduling code with
> random changes. Why ?
>
> Why can't this use ctx_sched_{in,out}() ? Surely the whole
> CAP_PASSTHROUGHT thing is but a flag away.
>

Not just a flag. The time has to be updated as well, since ctx->time is
shared among PMUs. Perf shouldn't stop it while other PMUs are still
running.

A timeguest will be introduced to track the start time of a guest.
The event->tstamp of an exclude_guest event should always keep
ctx->timeguest while a guest is running.
When a guest is leaving, update the event->tstamp to now, so the guest
time can be deducted.

The below patch demonstrates how the timeguest works.
(It's an incomplete patch, just to show the idea.)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 22d3e56682c9..2134e6886e22 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -953,6 +953,7 @@ struct perf_event_context {
 	u64				time;
 	u64				timestamp;
 	u64				timeoffset;
+	u64				timeguest;
 
 	/*
 	 * These fields let us detect when two contexts have both
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 14fd881e3e1d..2aed56671a24 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -690,12 +690,31 @@ __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *runnin
 	*running += delta;
 }
 
+static void perf_event_update_time_guest(struct perf_event *event)
+{
+	/*
+	 * If a guest is running, use the timestamp while entering the guest.
+	 * If the guest is leaving, reset the event timestamp.
+	 */
+	if (!__this_cpu_read(perf_in_guest))
+		event->tstamp = event->ctx->time;
+	else
+		event->tstamp = event->ctx->timeguest;
+}
+
 static void perf_event_update_time(struct perf_event *event)
 {
-	u64 now = perf_event_time(event);
+	u64 now;
+
+	/* Never count the time of an active guest into an exclude_guest event. */
+	if (event->ctx->timeguest &&
+	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU)
+		return perf_event_update_time_guest(event);
 
+	now = perf_event_time(event);
 	__perf_update_times(event, now, &event->total_time_enabled,
 			    &event->total_time_running);
+
 	event->tstamp = now;
 }
 
@@ -3398,7 +3417,14 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
 		cpuctx->task_ctx = NULL;
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule out all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
 		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
@@ -3998,7 +4024,20 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
+	if (event_type & EVENT_GUEST) {
+		/*
+		 * Schedule in all !exclude_guest events of PMU
+		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
+		 */
+		is_active = EVENT_ALL;
+
+		/*
+		 * Update ctx time to set the new start time for
+		 * the exclude_guest events.
+		 */
+		update_context_time(ctx);
+	} else
+		is_active ^= ctx->is_active; /* changed bits */
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -5894,14 +5933,18 @@ void perf_guest_enter(u32 guest_lvtpc)
 	}
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_out(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
+	/* Set the guest start time */
+	cpuctx->ctx.timeguest = cpuctx->ctx.time;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		task_ctx_sched_out(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
 
 	__this_cpu_write(perf_in_guest, true);
@@ -5925,14 +5968,17 @@ void perf_guest_exit(void)
 
 	__this_cpu_write(perf_in_guest, false);
 
 	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
-	ctx_sched_in(&cpuctx->ctx, EVENT_ALL | EVENT_GUEST);
+	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
+	cpuctx->ctx.timeguest = 0;
 	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
 
 	if (cpuctx->task_ctx) {
 		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
-		ctx_sched_in(cpuctx->task_ctx, EVENT_ALL | EVENT_GUEST);
+		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
+		cpuctx->task_ctx->timeguest = 0;
 		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
 	}
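
To make the intended accounting concrete, here is a small stand-alone
user-space sketch of the same bookkeeping (the sim_* structs and helpers
below are made up purely for illustration; they are not the kernel
interfaces). It models the point of timeguest: ctx time keeps running
across the guest, but an exclude_guest event's tstamp is pinned to the
guest entry time, so the guest interval is never folded into its running
time.

/*
 * Minimal user-space model of the timeguest idea (simplified; not kernel
 * code).
 */
#include <stdbool.h>
#include <stdio.h>

struct sim_ctx   { unsigned long long time, timeguest; };
struct sim_event { unsigned long long tstamp, total_time_running; };

static bool in_guest;

/* Fold elapsed host time into the event; skip any time spent in a guest. */
static void sim_event_update_time(struct sim_ctx *c, struct sim_event *e)
{
	if (c->timeguest) {
		/* Guest running: pin to entry time. Guest left: reset to now. */
		e->tstamp = in_guest ? c->timeguest : c->time;
		return;
	}
	e->total_time_running += c->time - e->tstamp;
	e->tstamp = c->time;
}

static void sim_guest_enter(struct sim_ctx *c, struct sim_event *e)
{
	sim_event_update_time(c, e);	/* account host time up to guest entry */
	c->timeguest = c->time;		/* remember when the guest started */
	in_guest = true;
}

static void sim_guest_exit(struct sim_ctx *c, struct sim_event *e)
{
	in_guest = false;
	sim_event_update_time(c, e);	/* reset tstamp to now, skipping the guest */
	c->timeguest = 0;
}

int main(void)
{
	struct sim_ctx c = { 0, 0 };
	struct sim_event e = { 0, 0 };

	c.time = 100; sim_guest_enter(&c, &e);		/* 100 units on the host */
	c.time = 300; sim_guest_exit(&c, &e);		/* 200 units in the guest */
	c.time = 350; sim_event_update_time(&c, &e);	/* 50 more units on the host */

	printf("%llu\n", e.total_time_running);		/* prints 150, not 350 */
	return 0;
}

Running it prints 150: the 100 units before the guest plus the 50 units
after it, with the 200 units spent inside the guest excluded.

Thanks,
Kan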