On 2024-09-06 6:59 a.m., Mi, Dapeng wrote:
>
> On 8/1/2024 12:58 PM, Mingwei Zhang wrote:
>> From: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
>>
>> Currently, the guest and host share the PMU resources when a guest is
>> running. KVM has to create an extra virtual event to simulate the
>> guest's event, which brings several issues, e.g., high overhead and
>> inaccuracy.
>>
>> A new passthrough PMU method is proposed to address these issues. It
>> requires that the PMU resources can be fully occupied by the guest
>> while it is running. Two new interfaces are implemented to fulfill the
>> requirement. The hypervisor should invoke them when creating a guest
>> that wants the passthrough PMU capability.
>>
>> The PMU resources should only be temporarily occupied as a whole while
>> a guest is running. When the guest is not running, the PMU resources
>> are still shared among different users.
>>
>> The exclude_guest event modifier is used to guarantee the exclusive
>> occupation of the PMU resources. When creating a guest, the hypervisor
>> should check whether there are !exclude_guest events in the system.
>> If there are, the creation should fail, because some PMU resources are
>> already occupied by other users.
>> If there are none, the PMU resources can be safely accessed by the
>> guest directly. Perf guarantees that no new !exclude_guest events are
>> created while a guest is running.
>>
>> Only the passthrough PMU is affected; other PMUs, e.g., uncore and SW
>> PMUs, are not, and their behavior is unchanged. The guest enter/exit
>> interfaces should only impact the supported PMUs. Add a new
>> PERF_PMU_CAP_PASSTHROUGH_VPMU flag to indicate the PMUs that support
>> the feature.
>>
>> Add nr_include_guest_events to track the !exclude_guest events of PMUs
>> with PERF_PMU_CAP_PASSTHROUGH_VPMU.
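As a side note on usage: the expectation is that the hypervisor pairs
the two interfaces around the VM lifetime, roughly as in the sketch
below. The helper names and the kvm->arch.enable_passthrough_pmu flag
here are only illustrative assumptions, not the actual KVM call sites
from this series.

#include <linux/kvm_host.h>
#include <linux/perf_event.h>

/* Illustrative only: claim the PMU for a passthrough-vPMU VM. */
static int kvm_enable_passthrough_pmu(struct kvm *kvm)
{
        int err;

        /* Fails with -EBUSY if any !exclude_guest event already exists. */
        err = perf_get_mediated_pmu();
        if (err)
                return err;

        kvm->arch.enable_passthrough_pmu = true;        /* assumed flag */
        return 0;
}

/* Illustrative only: release the PMU when the VM goes away. */
static void kvm_disable_passthrough_pmu(struct kvm *kvm)
{
        if (kvm->arch.enable_passthrough_pmu) {
                perf_put_mediated_pmu();
                kvm->arch.enable_passthrough_pmu = false;
        }
}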
>>
>> Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
>> Signed-off-by: Kan Liang <kan.liang@xxxxxxxxxxxxxxx>
>> Tested-by: Yongwei Ma <yongwei.ma@xxxxxxxxx>
>> Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx>
>> ---
>>  include/linux/perf_event.h | 10 ++++++
>>  kernel/events/core.c       | 66 ++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 76 insertions(+)
>>
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index a5304ae8c654..45d1ea82aa21 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -291,6 +291,7 @@ struct perf_event_pmu_context;
>>  #define PERF_PMU_CAP_NO_EXCLUDE        0x0040
>>  #define PERF_PMU_CAP_AUX_OUTPUT        0x0080
>>  #define PERF_PMU_CAP_EXTENDED_HW_TYPE  0x0100
>> +#define PERF_PMU_CAP_PASSTHROUGH_VPMU  0x0200
>>
>>  struct perf_output_handle;
>>
>> @@ -1728,6 +1729,8 @@ extern void perf_event_task_tick(void);
>>  extern int perf_event_account_interrupt(struct perf_event *event);
>>  extern int perf_event_period(struct perf_event *event, u64 value);
>>  extern u64 perf_event_pause(struct perf_event *event, bool reset);
>> +int perf_get_mediated_pmu(void);
>> +void perf_put_mediated_pmu(void);
>>  #else /* !CONFIG_PERF_EVENTS: */
>>  static inline void *
>>  perf_aux_output_begin(struct perf_output_handle *handle,
>> @@ -1814,6 +1817,13 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset)
>>  {
>>  	return 0;
>>  }
>> +
>> +static inline int perf_get_mediated_pmu(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline void perf_put_mediated_pmu(void) { }
>>  #endif
>>
>>  #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 8f908f077935..45868d276cde 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -402,6 +402,20 @@ static atomic_t nr_bpf_events __read_mostly;
>>  static atomic_t nr_cgroup_events __read_mostly;
>>  static atomic_t nr_text_poke_events __read_mostly;
>>  static atomic_t nr_build_id_events __read_mostly;
>> +static atomic_t nr_include_guest_events __read_mostly;
>> +
>> +static atomic_t nr_mediated_pmu_vms;
>> +static DEFINE_MUTEX(perf_mediated_pmu_mutex);
>> +
>> +/* !exclude_guest event of PMU with PERF_PMU_CAP_PASSTHROUGH_VPMU */
>> +static inline bool is_include_guest_event(struct perf_event *event)
>> +{
>> +	if ((event->pmu->capabilities & PERF_PMU_CAP_PASSTHROUGH_VPMU) &&
>> +	    !event->attr.exclude_guest)
>> +		return true;
>> +
>> +	return false;
>> +}
>>
>>  static LIST_HEAD(pmus);
>>  static DEFINE_MUTEX(pmus_lock);
>> @@ -5212,6 +5226,9 @@ static void _free_event(struct perf_event *event)
>>
>>  	unaccount_event(event);
>>
>> +	if (is_include_guest_event(event))
>> +		atomic_dec(&nr_include_guest_events);
>> +
>>  	security_perf_event_free(event);
>>
>>  	if (event->rb) {
>> @@ -5769,6 +5786,36 @@ u64 perf_event_pause(struct perf_event *event, bool reset)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_event_pause);
>>
>> +/*
>> + * Currently invoked at VM creation to
>> + * - Check whether there are existing !exclude_guest events of PMU with
>> + *   PERF_PMU_CAP_PASSTHROUGH_VPMU
>> + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on
>> + *   PMUs with PERF_PMU_CAP_PASSTHROUGH_VPMU
>> + *
>> + * No impact for the PMU without PERF_PMU_CAP_PASSTHROUGH_VPMU. The perf
>> + * still owns all the PMU resources.
>> + */
>> +int perf_get_mediated_pmu(void)
>> +{
>> +	guard(mutex)(&perf_mediated_pmu_mutex);
>> +	if (atomic_inc_not_zero(&nr_mediated_pmu_vms))
>> +		return 0;
>> +
>> +	if (atomic_read(&nr_include_guest_events))
>> +		return -EBUSY;
>> +
>> +	atomic_inc(&nr_mediated_pmu_vms);
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(perf_get_mediated_pmu);
>> +
>> +void perf_put_mediated_pmu(void)
>> +{
>> +	atomic_dec(&nr_mediated_pmu_vms);
>> +}
>> +EXPORT_SYMBOL_GPL(perf_put_mediated_pmu);
>> +
>>  /*
>>   * Holding the top-level event's child_mutex means that any
>>   * descendant process that has inherited this event will block
>> @@ -11907,6 +11954,17 @@ static void account_event(struct perf_event *event)
>>  	account_pmu_sb_event(event);
>>  }
>>
>> +static int perf_account_include_guest_event(void)
>> +{
>> +	guard(mutex)(&perf_mediated_pmu_mutex);
>> +
>> +	if (atomic_read(&nr_mediated_pmu_vms))
>> +		return -EACCES;
>
> Kan, Namhyung posted a patchset
> https://lore.kernel.org/all/20240904064131.2377873-1-namhyung@xxxxxxxxxx/
> which would stop setting the exclude_guest flag in perf tools by
> default. This may impact the current mediated vPMU solution, but
> fortunately the patchset provides a fallback mechanism that adds the
> exclude_guest flag if the kernel returns "EOPNOTSUPP".
>
> So we'd better return "EOPNOTSUPP" instead of "EACCES" here. BTW,
> returning "EOPNOTSUPP" here looks more reasonable than "EACCES".

It seems the existing Apple M1 PMU already returns "EOPNOTSUPP" for
!exclude_guest events. Yes, we should use the same error code.

Thanks,
Kan

>
>
>> +
>> +	atomic_inc(&nr_include_guest_events);
>> +	return 0;
>> +}
>> +
>>  /*
>>   * Allocate and initialize an event structure
>>   */
>> @@ -12114,11 +12172,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>>  	if (err)
>>  		goto err_callchain_buffer;
>>
>> +	if (is_include_guest_event(event)) {
>> +		err = perf_account_include_guest_event();
>> +		if (err)
>> +			goto err_security_alloc;
>> +	}
>> +
>>  	/* symmetric to unaccount_event() in _free_event() */
>>  	account_event(event);
>>
>>  	return event;
>>
>> +err_security_alloc:
>> +	security_perf_event_free(event);
>>  err_callchain_buffer:
>>  	if (!event->parent) {
>>  		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
>
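For reference, the tools-side fallback mentioned above would behave
roughly like the sketch below: first open the event counting guest mode
too, then retry host-only if the kernel refuses. This is only an
assumption-level illustration of the retry-on-EOPNOTSUPP behavior, not
the actual perf tools code.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>

static int open_cycles_event(pid_t pid, int cpu)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;

        /* First try to count guest mode too (exclude_guest == 0). */
        fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
        if (fd < 0 && errno == EOPNOTSUPP) {
                /*
                 * The kernel refused !exclude_guest (e.g., a mediated-PMU
                 * VM is running); fall back to a host-only event.
                 */
                attr.exclude_guest = 1;
                fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
        }
        return fd;
}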