Re: [RFC PATCH v3 09/58] perf: Add a EVENT_GUEST flag

Sandipan Das <sandipan.das@xxxxxxx> · Fri, 13 Dec 2024 15:07:38 +0530

On 8/1/2024 10:28 AM, Mingwei Zhang wrote:
> From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
> 
> Current perf doesn't explicitly schedule out all exclude_guest events
> while the guest is running. There is no problem with the current
> emulated vPMU. Because perf owns all the PMU counters. It can mask the
> counter which is assigned to an exclude_guest event when a guest is
> running (Intel way), or set the corresponding HOSTONLY bit in evsentsel
> (AMD way). The counter doesn't count when a guest is running.
> 
> However, either way doesn't work with the introduced passthrough vPMU.
> A guest owns all the PMU counters when it's running. The host should not
> mask any counters. The counter may be used by the guest. The evsentsel
> may be overwritten.
> 
> Perf should explicitly schedule out all exclude_guest events to release
> the PMU resources when entering a guest, and resume the counting when
> exiting the guest.
> 
> It's possible that an exclude_guest event is created when a guest is
> running. The new event should not be scheduled in as well.
> 
> The ctx time is shared among different PMUs. The time cannot be stopped
> when a guest is running. It is required to calculate the time for events
> from other PMUs, e.g., uncore events. Add timeguest to track the guest
> run time. For an exclude_guest event, the elapsed time equals
> the ctx time - guest time.
> Cgroup has dedicated times. Use the same method to deduct the guest time
> from the cgroup time as well.
> 
> Co-developed-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
> Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
> ---
>  include/linux/perf_event.h |   6 ++
>  kernel/events/core.c       | 178 +++++++++++++++++++++++++++++++------
>  2 files changed, 155 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index e22cdb6486e6..81a5f8399cb8 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -952,6 +952,11 @@ struct perf_event_context {
>  	 */
>  	struct perf_time_ctx		time;
>  
> +	/*
> +	 * Context clock, runs when in the guest mode.
> +	 */
> +	struct perf_time_ctx		timeguest;
> +
>  	/*
>  	 * These fields let us detect when two contexts have both
>  	 * been cloned (inherited) from a common ancestor.
> @@ -1044,6 +1049,7 @@ struct bpf_perf_event_data_kern {
>   */
>  struct perf_cgroup_info {
>  	struct perf_time_ctx		time;
> +	struct perf_time_ctx		timeguest;
>  	int				active;
>  };
>  
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index c25e2bf27001..57648736e43e 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -376,7 +376,8 @@ enum event_type_t {
>  	/* see ctx_resched() for details */
>  	EVENT_CPU = 0x8,
>  	EVENT_CGROUP = 0x10,
> -	EVENT_FLAGS = EVENT_CGROUP,
> +	EVENT_GUEST = 0x20,
> +	EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST,
>  	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
>  };
>  
> @@ -407,6 +408,7 @@ static atomic_t nr_include_guest_events __read_mostly;
>  
>  static atomic_t nr_mediated_pmu_vms;
>  static DEFINE_MUTEX(perf_mediated_pmu_mutex);
> +static DEFINE_PER_CPU(bool, perf_in_guest);
>  
>  /* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
>  static inline bool is_include_guest_event(struct perf_event *event)
> @@ -706,6 +708,10 @@ static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx,
>  	if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups)
>  		return true;
>  
> +	if ((event_type & EVENT_GUEST) &&
> +	    !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU))
> +		return true;
> +
>  	return false;
>  }
>  
> @@ -770,12 +776,21 @@ static inline int is_cgroup_event(struct perf_event *event)
>  	return event->cgrp != NULL;
>  }
>  
> +static inline u64 __perf_event_time_ctx(struct perf_event *event,
> +					struct perf_time_ctx *time,
> +					struct perf_time_ctx *timeguest);
> +
> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
> +					    struct perf_time_ctx *time,
> +					    struct perf_time_ctx *timeguest,
> +					    u64 now);
> +
>  static inline u64 perf_cgroup_event_time(struct perf_event *event)
>  {
>  	struct perf_cgroup_info *t;
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
> -	return t->time.time;
> +	return __perf_event_time_ctx(event, &t->time, &t->timeguest);
>  }
>  
>  static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
> @@ -784,9 +799,9 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
>  
>  	t = per_cpu_ptr(event->cgrp->info, event->cpu);
>  	if (!__load_acquire(&t->active))
> -		return t->time.time;
> -	now += READ_ONCE(t->time.offset);
> -	return now;
> +		return __perf_event_time_ctx(event, &t->time, &t->timeguest);
> +
> +	return __perf_event_time_ctx_now(event, &t->time, &t->timeguest, now);
>  }
>  
>  static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv);
> @@ -796,6 +811,18 @@ static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bo
>  	update_perf_time_ctx(&info->time, now, adv);
>  }
>  
> +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv)
> +{
> +	update_perf_time_ctx(&info->timeguest, now, adv);
> +}
> +
> +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now)
> +{
> +	__update_cgrp_time(info, now, true);
> +	if (__this_cpu_read(perf_in_guest))
> +		__update_cgrp_guest_time(info, now, true);
> +}
> +
>  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
>  {
>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
> @@ -809,7 +836,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
>  			cgrp = container_of(css, struct perf_cgroup, css);
>  			info = this_cpu_ptr(cgrp->info);
>  
> -			__update_cgrp_time(info, now, true);
> +			update_cgrp_time(info, now);
>  			if (final)
>  				__store_release(&info->active, 0);
>  		}
> @@ -832,11 +859,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
>  	 * Do not update time when cgroup is not active
>  	 */
>  	if (info->active)
> -		__update_cgrp_time(info, perf_clock(), true);
> +		update_cgrp_time(info, perf_clock());
>  }
>  
>  static inline void
> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
>  {
>  	struct perf_event_context *ctx = &cpuctx->ctx;
>  	struct perf_cgroup *cgrp = cpuctx->cgrp;
> @@ -856,8 +883,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
>  	for (css = &cgrp->css; css; css = css->parent) {
>  		cgrp = container_of(css, struct perf_cgroup, css);
>  		info = this_cpu_ptr(cgrp->info);
> -		__update_cgrp_time(info, ctx->time.stamp, false);
> -		__store_release(&info->active, 1);
> +		if (guest) {
> +			__update_cgrp_guest_time(info, ctx->time.stamp, false);
> +		} else {
> +			__update_cgrp_time(info, ctx->time.stamp, false);
> +			__store_release(&info->active, 1);
> +		}
>  	}
>  }
>  
> @@ -1061,7 +1092,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
>  }
>  
>  static inline void
> -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
> +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest)
>  {
>  }
>  
> @@ -1488,16 +1519,34 @@ static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, boo
>   */
>  static void __update_context_time(struct perf_event_context *ctx, bool adv)
>  {
> -	u64 now = perf_clock();
> +	lockdep_assert_held(&ctx->lock);
> +
> +	update_perf_time_ctx(&ctx->time, perf_clock(), adv);
> +}
>  
> +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv)
> +{
>  	lockdep_assert_held(&ctx->lock);
>  
> -	update_perf_time_ctx(&ctx->time, now, adv);
> +	/* must be called after __update_context_time(); */
> +	update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv);
>  }
>  
>  static void update_context_time(struct perf_event_context *ctx)
>  {
>  	__update_context_time(ctx, true);
> +	if (__this_cpu_read(perf_in_guest))
> +		__update_context_guest_time(ctx, true);
> +}
> +
> +static inline u64 __perf_event_time_ctx(struct perf_event *event,
> +					struct perf_time_ctx *time,
> +					struct perf_time_ctx *timeguest)
> +{
> +	if (event->attr.exclude_guest)
> +		return time->time - timeguest->time;
> +	else
> +		return time->time;
>  }
>  
>  static u64 perf_event_time(struct perf_event *event)
> @@ -1510,7 +1559,26 @@ static u64 perf_event_time(struct perf_event *event)
>  	if (is_cgroup_event(event))
>  		return perf_cgroup_event_time(event);
>  
> -	return ctx->time.time;
> +	return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest);
> +}
> +
> +static inline u64 __perf_event_time_ctx_now(struct perf_event *event,
> +					    struct perf_time_ctx *time,
> +					    struct perf_time_ctx *timeguest,
> +					    u64 now)
> +{
> +	/*
> +	 * The exclude_guest event time should be calculated from
> +	 * the ctx time -  the guest time.
> +	 * The ctx time is now + READ_ONCE(time->offset).
> +	 * The guest time is now + READ_ONCE(timeguest->offset).
> +	 * So the exclude_guest time is
> +	 * READ_ONCE(time->offset) - READ_ONCE(timeguest->offset).
> +	 */
> +	if (event->attr.exclude_guest && __this_cpu_read(perf_in_guest))
> +		return READ_ONCE(time->offset) - READ_ONCE(timeguest->offset);
> +	else
> +		return now + READ_ONCE(time->offset);
>  }
>  
>  static u64 perf_event_time_now(struct perf_event *event, u64 now)
> @@ -1524,10 +1592,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now)
>  		return perf_cgroup_event_time_now(event, now);
>  
>  	if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
> -		return ctx->time.time;
> +		return __perf_event_time_ctx(event, &ctx->time, &ctx->timeguest);
>  
> -	now += READ_ONCE(ctx->time.offset);
> -	return now;
> +	return __perf_event_time_ctx_now(event, &ctx->time, &ctx->timeguest, now);
>  }
>  
>  static enum event_type_t get_event_type(struct perf_event *event)
> @@ -3334,9 +3401,15 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>  	 * would only update time for the pinned events.
>  	 */
>  	if (is_active & EVENT_TIME) {
> +		bool stop;
> +
> +		/* vPMU should not stop time */
> +		stop = !(event_type & EVENT_GUEST) &&
> +		       ctx == &cpuctx->ctx;
> +
>  		/* update (and stop) ctx time */
>  		update_context_time(ctx);
> -		update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
> +		update_cgrp_time_from_cpuctx(cpuctx, stop);
>  		/*
>  		 * CPU-release for the below ->is_active store,
>  		 * see __load_acquire() in perf_event_time_now()
> @@ -3354,7 +3427,18 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
>  			cpuctx->task_ctx = NULL;
>  	}
>  
> -	is_active ^= ctx->is_active; /* changed bits */
> +	if (event_type & EVENT_GUEST) {
> +		/*
> +		 * Schedule out all !exclude_guest events of PMU
> +		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
> +		 */
> +		is_active = EVENT_ALL;
> +		__update_context_guest_time(ctx, false);
> +		perf_cgroup_set_timestamp(cpuctx, true);
> +		barrier();
> +	} else {
> +		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
> @@ -3853,10 +3937,15 @@ static inline void group_update_userpage(struct perf_event *group_event)
>  		event_update_userpage(event);
>  }
>  
> +struct merge_sched_data {
> +	int can_add_hw;
> +	enum event_type_t event_type;
> +};
> +
>  static int merge_sched_in(struct perf_event *event, void *data)
>  {
>  	struct perf_event_context *ctx = event->ctx;
> -	int *can_add_hw = data;
> +	struct merge_sched_data *msd = data;
>  
>  	if (event->state <= PERF_EVENT_STATE_OFF)
>  		return 0;
> @@ -3864,13 +3953,22 @@ static int merge_sched_in(struct perf_event *event, void *data)
>  	if (!event_filter_match(event))
>  		return 0;
>  
> -	if (group_can_go_on(event, *can_add_hw)) {
> +	/*
> +	 * Don't schedule in any exclude_guest events of PMU with
> +	 * PERF_PMU_CAP_PASSTHROUGH_VPMU, while a guest is running.
> +	 */
> +	if (__this_cpu_read(perf_in_guest) && event->attr.exclude_guest &&
> +	    event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU &&
> +	    !(msd->event_type & EVENT_GUEST))
> +		return 0;
> +

It is possible for event groups to have a mix of software and core PMU events.
If the group leader is a software event, event->pmu will point to the software
PMU but event->pmu_ctx->pmu will point to the core PMU. When perf_in_guest is
true for a CPU and the group leader is passed to merge_sched_in(), the condition
above fails as the software PMU does not have PERF_PMU_CAP_PASSTHROUGH_VPMU
capability. This can lead to group_sched_in() getting called later where all
the sibling events, which includes core PMU events that are not supposed to
be scheduled in, to be brought in. So event->pmu_ctx->pmu->capabilities needs
to be looked at instead.

> +	if (group_can_go_on(event, msd->can_add_hw)) {
>  		if (!group_sched_in(event, ctx))
>  			list_add_tail(&event->active_list, get_event_list(event));
>  	}
>  
>  	if (event->state == PERF_EVENT_STATE_INACTIVE) {
> -		*can_add_hw = 0;
> +		msd->can_add_hw = 0;
>  		if (event->attr.pinned) {
>  			perf_cgroup_event_disable(event, ctx);
>  			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
> @@ -3889,11 +3987,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
>  
>  static void pmu_groups_sched_in(struct perf_event_context *ctx,
>  				struct perf_event_groups *groups,
> -				struct pmu *pmu)
> +				struct pmu *pmu,
> +				enum event_type_t event_type)
>  {
> -	int can_add_hw = 1;
> +	struct merge_sched_data msd = {
> +		.can_add_hw = 1,
> +		.event_type = event_type,
> +	};
>  	visit_groups_merge(ctx, groups, smp_processor_id(), pmu,
> -			   merge_sched_in, &can_add_hw);
> +			   merge_sched_in, &msd);
>  }
>  
>  static void ctx_groups_sched_in(struct perf_event_context *ctx,
> @@ -3905,14 +4007,14 @@ static void ctx_groups_sched_in(struct perf_event_context *ctx,
>  	list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
>  		if (perf_skip_pmu_ctx(pmu_ctx, event_type))
>  			continue;
> -		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
> +		pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu, event_type);
>  	}
>  }
>  
>  static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
>  			       struct pmu *pmu)
>  {
> -	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
> +	pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu, 0);
>  }
>  
>  static void
> @@ -3927,9 +4029,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
>  		return;
>  
>  	if (!(is_active & EVENT_TIME)) {
> +		/* EVENT_TIME should be active while the guest runs */
> +		WARN_ON_ONCE(event_type & EVENT_GUEST);
>  		/* start ctx time */
>  		__update_context_time(ctx, false);
> -		perf_cgroup_set_timestamp(cpuctx);
> +		perf_cgroup_set_timestamp(cpuctx, false);
>  		/*
>  		 * CPU-release for the below ->is_active store,
>  		 * see __load_acquire() in perf_event_time_now()
> @@ -3945,7 +4049,23 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
>  			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
>  	}
>  
> -	is_active ^= ctx->is_active; /* changed bits */
> +	if (event_type & EVENT_GUEST) {
> +		/*
> +		 * Schedule in all !exclude_guest events of PMU
> +		 * with PERF_PMU_CAP_PASSTHROUGH_VPMU.
> +		 */
> +		is_active = EVENT_ALL;
> +
> +		/*
> +		 * Update ctx time to set the new start time for
> +		 * the exclude_guest events.
> +		 */
> +		update_context_time(ctx);
> +		update_cgrp_time_from_cpuctx(cpuctx, false);
> +		barrier();
> +	} else {
> +		is_active ^= ctx->is_active; /* changed bits */
> +	}
>  
>  	/*
>  	 * First go through the list and put on any pinned groups