On 8/1/2024 12:58 PM, Mingwei Zhang wrote:
> From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
>
> Currently, the guest and host share the PMU resources when a guest is
> running. KVM has to create an extra virtual event to simulate the
> guest's event, which brings several issues, e.g., high overhead and
> inaccuracy.
>
> A new passthrough PMU method is proposed to address the issue. It
> requires that the PMU resources can be fully occupied by the guest
> while it's running. Two new interfaces are implemented to fulfill the
> requirement. The hypervisor should invoke them when creating a guest
> that wants the passthrough PMU capability.
>
> The PMU resources should only be temporarily occupied as a whole when
> a guest is running. When the guest is not running, the PMU resources
> are still shared among different users.
>
> The exclude_guest event modifier is used to guarantee the exclusive
> occupation of the PMU resources. When creating a guest, the hypervisor
> should check whether there are !exclude_guest events in the system.
> If yes, the creation should fail, because some PMU resources are
> already occupied by other users.
> If no, the PMU resources can be safely accessed by the guest directly.
> Perf guarantees that no new !exclude_guest events are created when a
> guest is running.
>
> Only the passthrough PMU is affected; other PMUs, e.g., uncore and SW
> PMUs, are not. The behavior of those PMUs is not changed. The guest
> enter/exit interfaces should only impact the supported PMUs.
> Add a new PERF_PMU_CAP_PASSTHROUGH_VPMU flag to indicate the PMUs that
> support the feature.
>
> Add nr_include_guest_events to track the !exclude_guest events of PMUs
> with PERF_PMU_CAP_PASSTHROUGH_VPMU.
>
> Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
> Tested-by: Yongwei Ma <yongwei.ma@xxxxxxxxx>
> Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
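
For other readers following along: the intent is that the hypervisor
brackets the lifetime of a passthrough-vPMU guest with the two new
calls. A minimal sketch of a KVM-side caller, assuming made-up names
(kvm_init_pmu_passthrough(), kvm_destroy_pmu_passthrough() and the
enable_passthrough_pmu field are illustrative, not part of this
patch; only the two perf_*_mediated_pmu() interfaces come from the
patch below):

/*
 * Illustrative KVM-side usage only. The function names and the
 * enable_passthrough_pmu field are hypothetical; they are not
 * defined by this patch.
 */
static int kvm_init_pmu_passthrough(struct kvm *kvm)
{
	int ret;

	/*
	 * Fails with -EBUSY if any !exclude_guest event already
	 * exists on a PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU; on
	 * success, perf refuses to create new !exclude_guest events
	 * on such PMUs until the reference is dropped.
	 */
	ret = perf_get_mediated_pmu();
	if (ret)
		return ret;

	kvm->arch.enable_passthrough_pmu = true;
	return 0;
}

static void kvm_destroy_pmu_passthrough(struct kvm *kvm)
{
	/* Release the claim taken at VM creation. */
	if (kvm->arch.enable_passthrough_pmu)
		perf_put_mediated_pmu();
}
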
> ---
>  include/linux/perf_event.h | 10 ++++++
>  kernel/events/core.c       | 66 ++++++++++++++++++++++++++++++++++++++
>  2 files changed, 76 insertions(+)
>
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index a5304ae8c654..45d1ea82aa21 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -291,6 +291,7 @@ struct perf_event_pmu_context;
>  #define PERF_PMU_CAP_NO_EXCLUDE		0x0040
>  #define PERF_PMU_CAP_AUX_OUTPUT		0x0080
>  #define PERF_PMU_CAP_EXTENDED_HW_TYPE	0x0100
> +#define PERF_PMU_CAP_PASSTHROUGH_VPMU	0x0200
>
>  struct perf_output_handle;
>
> @@ -1728,6 +1729,8 @@ extern void perf_event_task_tick(void);
>  extern int perf_event_account_interrupt(struct perf_event *event);
>  extern int perf_event_period(struct perf_event *event, u64 value);
>  extern u64 perf_event_pause(struct perf_event *event, bool reset);
> +int perf_get_mediated_pmu(void);
> +void perf_put_mediated_pmu(void);
>  #else /* !CONFIG_PERF_EVENTS: */
>  static inline void *
>  perf_aux_output_begin(struct perf_output_handle *handle,
> @@ -1814,6 +1817,13 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
>  {
>  	return 0;
>  }
> +
> +static inline int perf_get_mediated_pmu(void)
> +{
> +	return 0;
> +}
> +
> +static inline void perf_put_mediated_pmu(void) { }
>  #endif
>
>  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 8f908f077935..45868d276cde 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -402,6 +402,20 @@ static atomic_t nr_bpf_events __read_mostly;
>  static atomic_t nr_cgroup_events __read_mostly;
>  static atomic_t nr_text_poke_events __read_mostly;
>  static atomic_t nr_build_id_events __read_mostly;
> +static atomic_t nr_include_guest_events __read_mostly;
> +
> +static atomic_t nr_mediated_pmu_vms;
> +static DEFINE_MUTEX(perf_mediated_pmu_mutex);
> +
> +/* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
> +static inline bool is_include_guest_event(struct perf_event *event)
> +{
> +	if ((event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
> +	    !event->attr.exclude_guest)
> +		return true;
> +
> +	return false;
> +}
>
>  static LIST_HEAD(pmus);
>  static DEFINE_MUTEX(pmus_lock);
> @@ -5212,6 +5226,9 @@ static void _free_event(struct perf_event *event)
>
>  	unaccount_event(event);
>
> +	if (is_include_guest_event(event))
> +		atomic_dec(&nr_include_guest_events);
> +
>  	security_perf_event_free(event);
>
>  	if (event->rb) {
> @@ -5769,6 +5786,36 @@ u64 perf_event_pause(struct perf_event *event, bool reset)
>  }
>  EXPORT_SYMBOL_GPL(perf_event_pause);
>
> +/*
> + * Currently invoked at VM creation to
> + * - Check whether there are existing !exclude_guest events of PMU with
> + *   PERF_PMU_CAP_PASSTHROUGH_VPMU
> + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on
> + *   PMUs with PERF_PMU_CAP_PASSTHROUGH_VPMU
> + *
> + * No impact for the PMU without PERF_PMU_CAP_PASSTHROUGH_VPMU. The perf
> + * still owns all the PMU resources.
> + */
> +int perf_get_mediated_pmu(void)
> +{
> +	guard(mutex)(&perf_mediated_pmu_mutex);
> +	if (atomic_inc_not_zero(&nr_mediated_pmu_vms))
> +		return 0;
> +
> +	if (atomic_read(&nr_include_guest_events))
> +		return -EBUSY;
> +
> +	atomic_inc(&nr_mediated_pmu_vms);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(perf_get_mediated_pmu);
> +
> +void perf_put_mediated_pmu(void)
> +{
> +	atomic_dec(&nr_mediated_pmu_vms);
> +}
> +EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
> +
>  /*
>   * Holding the top-level event's child_mutex means that any
>   * descendant process that has inherited this event will block
> @@ -11907,6 +11954,17 @@ static void account_event(struct perf_event *event)
>  	account_pmu_sb_event(event);
>  }
>
> +static int perf_account_include_guest_event(void)
> +{
> +	guard(mutex)(&perf_mediated_pmu_mutex);
> +
> +	if (atomic_read(&nr_mediated_pmu_vms))
> +		return -EACCES;

Kan,

Namhyung posted a patchset
https://lore.kernel.org/all/20240904064131.2377873-1-namhyung@xxxxxxxxxx/
which would stop perf tools from setting the exclude_guest flag by
default. This may impact the current mediated vPMU solution, but
fortunately the patchset provides a fallback mechanism that adds the
exclude_guest flag if the kernel returns EOPNOTSUPP. So we'd better
return EOPNOTSUPP instead of EACCES here (a rough sketch of that
fallback follows at the end of this mail).

BTW, returning EOPNOTSUPP here also looks more reasonable than EACCES
on its own merits.

> +
> +	atomic_inc(&nr_include_guest_events);
> +	return 0;
> +}
> +
>  /*
>   * Allocate and initialize an event structure
>   */
> @@ -12114,11 +12172,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  	if (err)
>  		goto err_callchain_buffer;
>
> +	if (is_include_guest_event(event)) {
> +		err = perf_account_include_guest_event();
> +		if (err)
> +			goto err_security_alloc;
> +	}
> +
>  	/* symmetric to unaccount_event() in _free_event() */
>  	account_event(event);
>
>  	return event;
>
> +err_security_alloc:
> +	security_perf_event_free(event);
>  err_callchain_buffer:
>  	if (!event->parent) {
>  		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
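
To make the EOPNOTSUPP point above concrete, the tools-side fallback
is conceptually something like the following. This is only a rough
sketch of the idea under my own naming; perf_event_open_host_fallback()
and the exact retry policy are made up here, see Namhyung's series for
the real implementation:

#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/*
 * Sketch only: if the kernel rejects a !exclude_guest event because
 * a passthrough-vPMU guest owns the PMU, retry counting host-only.
 */
static int perf_event_open_host_fallback(struct perf_event_attr *attr,
					 pid_t pid, int cpu)
{
	int fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0);

	if (fd < 0 && errno == EOPNOTSUPP && !attr->exclude_guest) {
		/* A mediated vPMU guest holds the PMU; exclude it. */
		attr->exclude_guest = 1;
		fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0);
	}
	return fd;
}

Note the retry only triggers on EOPNOTSUPP; if the kernel kept
returning EACCES, the fallback would never fire and the open would
simply fail.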