Re: [PATCH V1 3/4] KVM: x86/vPMU: Implement AMD PMU support for KVM

2014-10-31 12:05-0400, Wei Huang:
> This patch implemented vPMU for AMD platform. The design piggybacks
> on the existing Intel structs (kvm_pmu and kvm_pmc), but only uses
> the parts of generic counters. The kvm_pmu_ops interface is also
> initialized in this patch.
> 
> Signed-off-by: Wei Huang <wei@xxxxxxxxxx>
> ---
>  arch/x86/kvm/pmu_amd.c | 332 ++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 318 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
> index 6d6e1c3..19cfe26 100644
> --- a/arch/x86/kvm/pmu_amd.c
> +++ b/arch/x86/kvm/pmu_amd.c
> @@ -16,58 +16,362 @@
>  #include <linux/types.h>
>  #include <linux/kvm_host.h>
>  #include <linux/perf_event.h>
> -#include <asm/perf_event.h>
>  #include "x86.h"
>  #include "cpuid.h"
>  #include "lapic.h"
>  
> -void amd_pmu_cpuid_update(struct kvm_vcpu *vcpu)
> +/* duplicated from amd_perfmon_event_map, K7 and above should work */
> +static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
> +	[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
> +	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
> +	[2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
> +	[3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },

> +	[4] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
> +	[5] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },

amd_perfmon_event_map has 0xc2 and 0xc3.
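
(I.e., to match amd_perfmon_event_map, those two entries presumably should
 be:

  	[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
  	[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },

 0xc4/0xc5 look like they were copied from the intel map.)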

> +	[6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
> +	[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
> +};
> +
> +static inline bool pmc_is_gp(struct kvm_pmc *pmc)
>  {
> +	return pmc->type == KVM_PMC_GP;
>  }

What is the benefit of having the same implementation in both files?

I'd prefer to have it in pmu.h, and I'll try to note all the functions
that look identical.  As always, you can ignore anything in parentheses;
there are only a few real issues with this patch.
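
For instance (just a sketch of what I mean), pmu.h could carry the trivial
helpers once and both the intel and amd files would pick them up:

  static inline bool pmc_is_gp(struct kvm_pmc *pmc)
  {
  	return pmc->type == KVM_PMC_GP;
  }

  static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
  {
  	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;

  	return pmu->counter_bitmask[pmc->type];
  }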

>  
> -int amd_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
> +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
> +					 u32 base)

(pmu.h)

>  {
> -	return 1;
> +	if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
> +		return &pmu->gp_counters[msr - base];
> +
> +	return NULL;
>  }
>  
> -int amd_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
> +static inline u64 pmc_bitmask(struct kvm_pmc *pmc)

(pmu.h)

>  {
> -	return 1;
> +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> +
> +	return pmu->counter_bitmask[pmc->type];
>  }
>  
> -int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> +static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
>  {
> -	return 1;
> +	return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + idx, MSR_K7_EVNTSEL0);
>  }
>  
> -int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
> +void amd_deliver_pmi(struct kvm_vcpu *vcpu)

static.

(pmu.c, rename to deliver_pmi and reuse for intel.)
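
Something like this (sketch), with the intel code switched over as well:

  /* pmu.c (shared) */
  void deliver_pmi(struct kvm_vcpu *vcpu)
  {
  	if (vcpu->arch.apic)
  		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
  }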

>  {
> -	return 1;
> +	if (vcpu->arch.apic)
> +		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
>  }
>  
> -bool amd_is_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
> +static void trigger_pmi(struct irq_work *irq_work)

(pmu.c)

>  {
> -	return 0;
> +	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
> +			irq_work);
> +	struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
> +			arch.pmu);
> +
> +	amd_deliver_pmi(vcpu);
>  }
>  
> -void amd_deliver_pmi(struct kvm_vcpu *vcpu)
> +static u64 read_pmc(struct kvm_pmc *pmc)

(pmu.c)

> +{
> +	u64 counter, enabled, running, result, delta, prev;
> +
> +	counter = pmc->counter;
> +	prev = pmc->counter;
> +
> +	if (pmc->perf_event) {
> +		delta = perf_event_read_value(pmc->perf_event,
> +					      &enabled, &running);
> +		counter += delta;
> +	}
> +
> +	result = counter & pmc_bitmask(pmc);
> +
> +	return result;
> +}

No.  The one intel uses is better.
(This one looks like a relic from experimenting.)
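
(For comparison, the body of read_pmc in pmu.c is roughly:

  	u64 counter, enabled, running;

  	counter = pmc->counter;
  	if (pmc->perf_event)
  		counter += perf_event_read_value(pmc->perf_event,
  						 &enabled, &running);

  	return counter & pmc_bitmask(pmc);

 Note that 'prev' above is assigned and never used, and 'delta'/'result'
 are pointless.)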

> +
> +static void stop_counter(struct kvm_pmc *pmc)

(pmu.c)

> +{
> +	if (pmc->perf_event) {
> +		pmc->counter = read_pmc(pmc);
> +		perf_event_release_kernel(pmc->perf_event);
> +		pmc->perf_event = NULL;
> +	}
> +}
> +
> +static unsigned find_hw_type_event(struct kvm_pmu *pmu, u8 event_select,
> +				u8 unit_mask)
> +{

(Intel calls this find_arch_event.)

> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
> +		if (amd_event_mapping[i].eventsel == event_select
> +		    && amd_event_mapping[i].unit_mask == unit_mask)

(And it could be less extensible, but shorter:

  	for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
  		if (amd_event_mapping[i].eventsel == event_select
  		    && amd_event_mapping[i].unit_mask == unit_mask)
  			return amd_event_mapping[i].event_type;

  	return PERF_COUNT_HW_MAX;
  }
)

> +			break;
> +
> +	if (i == ARRAY_SIZE(amd_event_mapping))
> +		return PERF_COUNT_HW_MAX;
> +
> +	return amd_event_mapping[i].event_type;
> +}
> +
> +static void kvm_perf_overflow_intr(struct perf_event *perf_event,
> +		struct perf_sample_data *data, struct pt_regs *regs)
> +{

(pmu.c)

> +	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> +
> +	if (!test_and_set_bit(pmc->idx,
> +			      (unsigned long *)&pmu->reprogram_pmi)) {

(The only real difference from the intel versions is their set_bit of
 global_status, which would be useless for AMD, but that alone is not a
 reason to keep them separate; please mind my radicality when it comes
 to abstracting.)
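
A shared helper could keep the intel body as is, e.g. (sketch):

  	if (!test_and_set_bit(pmc->idx,
  			      (unsigned long *)&pmu->reprogram_pmi)) {
  		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
  		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
  		/* ... PMI injection for the _intr variant ... */
  	}

The global_status update is a no-op as far as an AMD guest is concerned,
because the MSR is never exposed to it.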

> +		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> +
> +		if (!kvm_is_in_guest())
> +			irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
> +		else
> +			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
> +	}
> +}
> +
> +static void kvm_perf_overflow(struct perf_event *perf_event,
> +			      struct perf_sample_data *data,
> +			      struct pt_regs *regs)
> +{

(pmu.c, ditto.)

> +	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> +	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
> +
> +	if (!test_and_set_bit(pmc->idx,
> +			      (unsigned long *)&pmu->reprogram_pmi)) {
> +		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
> +	}
> +}
> +
> +static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
> +		unsigned config, bool exclude_user, bool exclude_kernel,
> +		bool intr)

(Could be merged in my world; I'll comment on the previous patch.)

> +{
> +	struct perf_event *event;
> +	struct perf_event_attr attr = {
> +		.type = type,
> +		.size = sizeof(attr),
> +		.pinned = true,
> +		.exclude_idle = true,
> +		.exclude_host = 1,
> +		.exclude_user = exclude_user,
> +		.exclude_kernel = exclude_kernel,
> +		.config = config,
> +	};
> +
> +	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
> +
> +	event = perf_event_create_kernel_counter(&attr, -1, current,
> +						 intr?kvm_perf_overflow_intr :
> +						 kvm_perf_overflow, pmc);
> +	if (IS_ERR(event)) {
> +		printk_once("kvm: pmu event creation failed %ld\n",
> +			    PTR_ERR(event));
> +		return;
> +	}
> +
> +	pmc->perf_event = event;
> +	clear_bit(pmc->idx, (unsigned long *)&pmc->vcpu->arch.pmu.reprogram_pmi);
> +}
> +
> +
> +static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
> +{

(Ditto.)

> +	unsigned config, type = PERF_TYPE_RAW;
> +	u8 event_select, unit_mask;
> +
> +	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
> +		printk_once("kvm pmu: pin control bit is ignored\n");
> +
> +	pmc->eventsel = eventsel;
> +
> +	stop_counter(pmc);
> +
> +	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
> +		return;
> +
> +	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
> +	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
> +
> +	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
> +			  ARCH_PERFMON_EVENTSEL_INV |
> +			  ARCH_PERFMON_EVENTSEL_CMASK |
> +			  HSW_IN_TX |
> +			  HSW_IN_TX_CHECKPOINTED))) {
> +		config = find_hw_type_event(&pmc->vcpu->arch.pmu, event_select,
> +					 unit_mask);
> +		if (config != PERF_COUNT_HW_MAX)
> +			type = PERF_TYPE_HARDWARE;
> +	}
> +
> +	if (type == PERF_TYPE_RAW)
> +		config = eventsel & X86_RAW_EVENT_MASK;
> +
> +	reprogram_counter(pmc, type, config,
> +			  !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
> +			  !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
> +			  eventsel & ARCH_PERFMON_EVENTSEL_INT);
> +}
> +
> +static void reprogram_idx(struct kvm_pmu *pmu, int idx)
>  {
> +	struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
> +
> +	if (!pmc || !pmc_is_gp(pmc))
> +		return;
> +	reprogram_gp_counter(pmc, pmc->eventsel);
>  }
>  
>  void amd_handle_pmu_event(struct kvm_vcpu *vcpu)
>  {

static.

(pmu.c)

> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	u64 bitmask;
> +	int bit;
> +
> +	bitmask = pmu->reprogram_pmi;
> +
> +	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
> +		struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
> +
> +		if (unlikely(!pmc || !pmc->perf_event)) {
> +			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
> +			continue;
> +		}
> +
> +		reprogram_idx(pmu, bit);
> +	}
>  }
>  
>  void amd_pmu_reset(struct kvm_vcpu *vcpu)

static.

>  {
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	int i;
> +
> +	irq_work_sync(&pmu->irq_work);
> +
> +	for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
> +		struct kvm_pmc *pmc = &pmu->gp_counters[i];
> +
> +		stop_counter(pmc);
> +		pmc->counter = pmc->eventsel = 0;
> +	}
> +}
> +
> +void amd_pmu_destroy(struct kvm_vcpu *vcpu)

static.

> +{
> +	amd_pmu_reset(vcpu);
> +}
> +
> +int amd_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)

static.

> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +	pmc &= ~(3u << 30);

(-1u >> 2 would be simpler; it is the same 0x3fffffff mask.)

> +	return (pmc >= pmu->nr_arch_gp_counters);
> +}
> +
> +int amd_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)

static.

> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	struct kvm_pmc *counters;
> +	u64 ctr;
> +
> +	pmc &= ~(3u << 30);
> +	if (pmc >= pmu->nr_arch_gp_counters)
> +		return 1;
> +
> +	counters = pmu->gp_counters;
> +	ctr = read_pmc(&counters[pmc]);
> +	*data = ctr;
> +
> +	return 0;
> +}
> +
> +int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)

static.

> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	struct kvm_pmc *pmc;
> +	u32 index = msr_info->index;
> +	u64 data = msr_info->data;
> +
> +	if ((pmc = get_gp_pmc(pmu, index, MSR_K7_PERFCTR0))) {
> +		if (!msr_info->host_initiated)
> +			data = (s64)data;
> +		pmc->counter += data - read_pmc(pmc);
> +		return 0;
> +	} else if ((pmc = get_gp_pmc(pmu, index, MSR_K7_EVNTSEL0))) {
> +		if (data == pmc->eventsel)
> +			return 0;
> +		if (!(data & pmu->reserved_bits)) {
> +			reprogram_gp_counter(pmc, data);
> +			return 0;
> +		}
> +	}
> +
> +	return 1;
> +}
> +
> +int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)

static.

> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	struct kvm_pmc *pmc;
> +
> +	if ((pmc = get_gp_pmc(pmu, index, MSR_K7_PERFCTR0))) {
> +		*data = read_pmc(pmc);
> +		return 0;
> +	} else if ((pmc = get_gp_pmc(pmu, index, MSR_K7_EVNTSEL0))) {
> +		*data = pmc->eventsel;
> +		return 0;
> +	}
> +
> +	return 1;
> +}
> +
> +void amd_pmu_cpuid_update(struct kvm_vcpu *vcpu)

static.

> +{
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +	pmu->nr_arch_gp_counters = 0;
> +	pmu->nr_arch_fixed_counters = 0;
> +	pmu->counter_bitmask[KVM_PMC_GP] = 0;
> +	pmu->version = 0;
> +	pmu->reserved_bits = 0xffffffff00200000ull;
> +

> +	/* configuration */
> +	pmu->nr_arch_gp_counters = 4;

AMD64_NUM_COUNTERS?

> +	pmu->nr_arch_fixed_counters = 0;
> +	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;

amd_pmu_cpuid_update doesn't depend on cpuid at all.
(I'd keep this function empty.)
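
I.e. the setup could be done once in amd_pmu_init, something like (sketch):

  	pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
  	pmu->nr_arch_fixed_counters = 0;
  	pmu->version = 0;
  	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
  	pmu->reserved_bits = 0xffffffff00200000ull;

leaving amd_pmu_cpuid_update as a stub.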

>  }
>  
>  void amd_pmu_init(struct kvm_vcpu *vcpu)

static.

>  {
> +	int i;
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +	memset(pmu, 0, sizeof(*pmu));
> +	for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
> +		pmu->gp_counters[i].type = KVM_PMC_GP;
> +		pmu->gp_counters[i].vcpu = vcpu;
> +		pmu->gp_counters[i].idx = i;
> +	}
> +
> +	init_irq_work(&pmu->irq_work, trigger_pmi);
> +	amd_pmu_cpuid_update(vcpu);
>  }
>  
> -void amd_pmu_destroy(struct kvm_vcpu *vcpu)
> +bool amd_is_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)

static.

>  {
> +	struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +	int ret;
> +
> +	ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
> +		get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
> +
> +	return ret;
>  }
>  
>  struct kvm_pmu_ops amd_pmu_ops = {
> -- 
> 1.8.3.1
> 