Re: [PATCH v2 07/54] perf: Add generic exclude_guest support

"Liang, Kan" <kan.liang@xxxxxxxxxxxxxxx> · Thu, 13 Jun 2024 14:04:36 -0400

On 2024-06-13 9:37 a.m., Liang, Kan wrote:
>> ---
>>
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -947,7 +947,9 @@ struct perf_event_context {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> -	u64				timeguest;
>> +	u64				guest_time;
>> +	u64				guest_timestamp;
>> +	u64				guest_timeoffset;
>>  
>>  	/*
>>  	 * These fields let us detect when two contexts have both
>> @@ -1043,6 +1045,9 @@ struct perf_cgroup_info {
>>  	u64				time;
>>  	u64				timestamp;
>>  	u64				timeoffset;
>> +	u64				guest_time;
>> +	u64				guest_timestamp;
>> +	u64				guest_timeoffset;
>>  	int				active;
>>  };
>>  
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -638,26 +638,9 @@ __perf_update_times(struct perf_event *e
>>  
>>  static void perf_event_update_time(struct perf_event *event)
>>  {
>> -	u64 now;
>> -
>> -	/* Never count the time of an active guest into an exclude_guest event. */
>> -	if (event->ctx->timeguest &&
>> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) {
>> -		/*
>> -		 * If a guest is running, use the timestamp while entering the guest.
>> -		 * If the guest is leaving, reset the event timestamp.
>> -		 */
>> -		if (__this_cpu_read(perf_in_guest))
>> -			event->tstamp = event->ctx->timeguest;
>> -		else
>> -			event->tstamp = event->ctx->time;
>> -		return;
>> -	}
>> -
>> -	now = perf_event_time(event);
>> +	u64 now = perf_event_time(event);
>>  	__perf_update_times(event, now, &event->total_time_enabled,
>>  					&event->total_time_running);
>> -
>>  	event->tstamp = now;
>>  }
>>  
>> @@ -780,19 +763,33 @@ static inline int is_cgroup_event(struct
>>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>>  {
>>  	struct perf_cgroup_info *t;
>> +	u64 time;
>>  
>>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>> -	return t->time;
>> +	time = t->time;
>> +	if (event->attr.exclude_guest)
>> +		time -= t->guest_time;
>> +	return time;
>>  }
>>  
>>  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
>>  {
>>  	struct perf_cgroup_info *t;
>> +	u64 time, guest_time;
>>  
>>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>> -	if (!__load_acquire(&t->active))
>> -		return t->time;
>> -	now += READ_ONCE(t->timeoffset);
>> +	if (!__load_acquire(&t->active)) {
>> +		time = t->time;
>> +		if (event->attr.exclude_guest)
>> +			time -= t->guest_time;
>> +		return time;
>> +	}
>> +
>> +	time = now + READ_ONCE(t->timeoffset);
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
>> +		guest_time = now + READ_ONCE(t->guest_offset);
>> +		time -= guest_time;
>> +	}
>>  	return now;
>>  }
>>  
>> @@ -807,6 +804,17 @@ static inline void __update_cgrp_time(st
>>  	WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
>>  }
>>  
>> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
>> +{
>> +	if (adv)
>> +		info->guest_time += now - info->guest_timestamp;
>> +	info->guest_timestamp = now;
>> +	/*
>> +	 * see update_context_time()
>> +	 */
>> +	WRITE_ONCE(info->guest_timeoffset, info->guest_time - info->guest_timestamp);
>> +}
>> +
>>  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
>>  {
>>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
>> @@ -821,6 +829,8 @@ static inline void update_cgrp_time_from
>>  			info = this_cpu_ptr(cgrp->info);
>>  
>>  			__update_cgrp_time(info, now, true);
>> +			if (__this_cpu_read(perf_in_guest))
>> +				__update_cgrp_guest_time(info, now, true);
>>  			if (final)
>>  				__store_release(&info->active, 0);
>>  		}
>> @@ -1501,14 +1511,39 @@ static void __update_context_time(struct
>>  	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
>>  }
>>  
>> +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
>> +{
>> +	u64 now = ctx->timestamp; /* must be called after __update_context_time(); */
>> +
>> +	lockdep_assert_held(&ctx->lock);
>> +
>> +	if (adv)
>> +		ctx->guest_time += now - ctx->guest_timestamp;
>> +	ctx->guest_timestamp = now;
>> +
>> +	/*
>> +	 * The above: time' = time + (now - timestamp), can be re-arranged
>> +	 * into: time` = now + (time - timestamp), which gives a single value
>> +	 * offset to compute future time without locks on.
>> +	 *
>> +	 * See perf_event_time_now(), which can be used from NMI context where
>> +	 * it's (obviously) not possible to acquire ctx->lock in order to read
>> +	 * both the above values in a consistent manner.
>> +	 */
>> +	WRITE_ONCE(ctx->guest_timeoffset, ctx->guest_time - ctx->guest_timestamp);
>> +}
>> +
>>  static void update_context_time(struct perf_event_context *ctx)
>>  {
>>  	__update_context_time(ctx, true);
>> +	if (__this_cpu_read(perf_in_guest))
>> +		__update_context_guest_time(ctx, true);
>>  }
>>  
>>  static u64 perf_event_time(struct perf_event *event)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> +	u64 time;
>>  
>>  	if (unlikely(!ctx))
>>  		return 0;
>> @@ -1516,12 +1551,17 @@ static u64 perf_event_time(struct perf_e
>>  	if (is_cgroup_event(event))
>>  		return perf_cgroup_event_time(event);
>>  
>> -	return ctx->time;
>> +	time = ctx->time;
>> +	if (event->attr.exclude_guest)
>> +		time -= ctx->guest_time;
>> +
>> +	return time;
>>  }
>>  
>>  static u64 perf_event_time_now(struct perf_event *event, u64 now)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> +	u64 time, guest_time;
>>  
>>  	if (unlikely(!ctx))
>>  		return 0;
>> @@ -1529,11 +1569,19 @@ static u64 perf_event_time_now(struct pe
>>  	if (is_cgroup_event(event))
>>  		return perf_cgroup_event_time_now(event, now);
>>  
>> -	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
>> -		return ctx->time;
>> +	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) {
>> +		time = ctx->time;
>> +		if (event->attr.exclude_guest)
>> +			time -= ctx->guest_time;
>> +		return time;
>> +	}
>>  
>> -	now += READ_ONCE(ctx->timeoffset);
>> -	return now;
>> +	time = now + READ_ONCE(ctx->timeoffset);
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest)) {
>> +		guest_time = now + READ_ONCE(ctx->guest_timeoffset);
>> +		time -= guest_time;
>> +	}
>> +	return time;
>>  }
>>  
>>  static enum event_type_t get_event_type(struct perf_event *event)
>> @@ -3340,9 +3388,14 @@ ctx_sched_out(struct perf_event_context
>>  	 * would only update time for the pinned events.
>>  	 */
>>  	if (is_active & EVENT_TIME) {
>> +		bool stop;
>> +
>> +		stop = !((ctx->is_active & event_type) & EVENT_ALL) &&
>> +		       ctx == &cpuctx->ctx;
>> +			
>>  		/* update (and stop) ctx time */
>>  		update_context_time(ctx);
>> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
>> +		update_cgrp_time_from_cpuctx(cpuctx, stop);

For event_type == EVENT_GUEST, "stop" should always be the same as
"ctx == &cpuctx->ctx", because the EVENT_GUEST bit is never set in
ctx->is_active.
Why is "stop" introduced?

>>  		/*
>>  		 * CPU-release for the below ->is_active store,
>>  		 * see __load_acquire() in perf_event_time_now()
>> @@ -3366,8 +3419,12 @@ ctx_sched_out(struct perf_event_context
>>  		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
>>  		 */
>>  		is_active = EVENT_ALL;
>> -	} else
>> +		__update_context_guest_time(ctx, false);
>> +		perf_cgroup_set_guest_timestamp(cpuctx);
>> +		barrier();
>> +	} else {
>>  		is_active ^= ctx->is_active; /* changed bits */
>> +	}
>>  
>>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>> @@ -3866,10 +3923,15 @@ static inline void group_update_userpage
>>  		event_update_userpage(event);
>>  }
>>  
>> +struct merge_sched_data {
>> +	int can_add_hw;
>> +	enum event_type_t event_type;
>> +};
>> +
>>  static int merge_sched_in(struct perf_event *event, void *data)
>>  {
>>  	struct perf_event_context *ctx = event->ctx;
>> -	int *can_add_hw = data;
>> +	struct merge_sched_data *msd = data;
>>  
>>  	if (event->state <= PERF_EVENT_STATE_OFF)
>>  		return 0;
>> @@ -3881,18 +3943,18 @@ static int merge_sched_in(struct perf_ev
>>  	 * Don't schedule in any exclude_guest events of PMU with
>>  	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
>>  	 */
>> -	if (__this_cpu_read(perf_in_guest) &&
>> -	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
>> -	    event->attr.exclude_guest)
>> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest) &&
>> +	    (event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
>> +	    !(msd->event_type & EVENT_GUEST))
>>  		return 0;
>>  
>> -	if (group_can_go_on(event, *can_add_hw)) {
>> +	if (group_can_go_on(event, msd->can_add_hw)) {
>>  		if (!group_sched_in(event, ctx))
>>  			list_add_tail(&event->active_list, get_event_list(event));
>>  	}
>>  
>>  	if (event->state == PERF_EVENT_STATE_INACTIVE) {
>> -		*can_add_hw = 0;
>> +		msd->can_add_hw = 0;
>>  		if (event->attr.pinned) {
>>  			perf_cgroup_event_disable(event, ctx);
>>  			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
>> @@ -3911,11 +3973,15 @@ static int merge_sched_in(struct perf_ev
>>  
>>  static void pmu_groups_sched_in(struct perf_event_context *ctx,
>>  				struct perf_event_groups *groups,
>> -				struct pmu *pmu)
>> +				struct pmu *pmu,
>> +				enum even_type_t event_type)
>>  {
>> -	int can_add_hw = 1;
>> +	struct merge_sched_data msd = {
>> +		.can_add_hw = 1,
>> +		.event_type = event_type,
>> +	};
>>  	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
>> -			   merge_sched_in, &can_add_hw);
>> +			   merge_sched_in, &msd);
>>  }
>>  
>>  static void ctx_groups_sched_in(struct perf_event_context *ctx,
>> @@ -3927,14 +3993,14 @@ static void ctx_groups_sched_in(struct p
>>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>>  			continue;
>> -		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
>> +		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
>>  	}
>>  }
>>  
>>  static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
>>  			       struct pmu *pmu)
>>  {
>> -	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
>> +	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
>>  }
>>  
>>  static void
>> @@ -3949,6 +4015,8 @@ ctx_sched_in(struct perf_event_context *
>>  		return;
>>  
>>  	if (!(is_active & EVENT_TIME)) {
>> +		/* EVENT_TIME should be active while the guest runs */
>> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>>  		/* start ctx time */
>>  		__update_context_time(ctx, false);
>>  		perf_cgroup_set_timestamp(cpuctx);
>> @@ -3979,8 +4047,11 @@ ctx_sched_in(struct perf_event_context *
>>  		 * the exclude_guest events.
>>  		 */
>>  		update_context_time(ctx);
>> -	} else
>> +		update_cgrp_time_from_cpuctx(cpuctx, false);

In ctx_sched_out() above, the cgrp_time is stopped and the cgrp has
been set to inactive.
I think we need a perf_cgroup_set_timestamp(cpuctx) here to restart the
cgrp_time, right?

Also, I think the cgrp_time is different from the normal ctx->time. When
a guest is running, there must be no cgroup, so it's OK to disable the
cgrp_time. If so, I don't think we need to track guest_time for the
cgrp.

Thanks,
Kan

>> +		barrier();
>> +	} else {
>>  		is_active ^= ctx->is_active; /* changed bits */
>> +	}
>>  
>>  	/*
>>  	 * First go through the list and put on any pinned groups
>> @@ -5832,25 +5903,20 @@ void perf_guest_enter(void)
>>  
>>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>  
>> -	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest))) {
>> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> -		return;
>> -	}
>> +	if (WARN_ON_ONCE(__this_cpu_read(perf_in_guest)))
>> +		goto unlock;
>>  
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>  	ctx_sched_out(&cpuctx->ctx, EVENT_GUEST);
>> -	/* Set the guest start time */
>> -	cpuctx->ctx.timeguest = cpuctx->ctx.time;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>  		task_ctx_sched_out(cpuctx->task_ctx, EVENT_GUEST);
>> -		cpuctx->task_ctx->timeguest = cpuctx->task_ctx->time;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
>>  
>>  	__this_cpu_write(perf_in_guest, true);
>> -
>> +unlock:
>>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>  }
>>  
>> @@ -5862,24 +5928,21 @@ void perf_guest_exit(void)
>>  
>>  	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
>>  
>> -	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest))) {
>> -		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>> -		return;
>> -	}
>> -
>> -	__this_cpu_write(perf_in_guest, false);
>> +	if (WARN_ON_ONCE(!__this_cpu_read(perf_in_guest)))
>> +		goto unlock;
>>  
>>  	perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST);
>>  	ctx_sched_in(&cpuctx->ctx, EVENT_GUEST);
>> -	cpuctx->ctx.timeguest = 0;
>>  	perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST);
>>  	if (cpuctx->task_ctx) {
>>  		perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST);
>>  		ctx_sched_in(cpuctx->task_ctx, EVENT_GUEST);
>> -		cpuctx->task_ctx->timeguest = 0;
>>  		perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST);
>>  	}
>>  
>> +	__this_cpu_write(perf_in_guest, false);
>> +
>> +unlock:
>>  	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
>>  }
>>