Use perf_events to emulate an architectural PMU, version 1. Caveats: - counters that have PMI (interrupt) enabled stop counting after the interrupt is signalled. This is because we need one-shot samples that keep counting, which perf doesn't support yet - some combinations of INV and CMASK are not supported - counters keep on counting in the host as well as the guest Signed-off-by: Avi Kivity <avi@xxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 29 +++++ arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/pmu.c | 255 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 16 ++-- 4 files changed, 293 insertions(+), 9 deletions(-) create mode 100644 arch/x86/kvm/pmu.c diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fc38eca..86f49a2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -16,6 +16,7 @@ #include <linux/mmu_notifier.h> #include <linux/tracepoint.h> #include <linux/cpumask.h> +#include <linux/irq_work.h> #include <linux/kvm.h> #include <linux/kvm_para.h> @@ -287,6 +288,24 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; +#define KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS 4 + +struct kvm_pmc { + u64 counter; + u64 eventsel; + struct perf_event *perf_event; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pmu { + unsigned nr_arch_gp_counters; + unsigned available_event_types; + u64 counter_bitmask; + u8 version; + struct kvm_pmc gp_counters[KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS]; + struct irq_work irq_work; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -414,6 +433,8 @@ struct kvm_vcpu_arch { u64 mcg_ctl; u64 *mce_banks; + struct kvm_pmu pmu; + /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; @@ -870,4 +891,12 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +void 
kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..cfca03f 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o + i8254.o timer.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c new file mode 100644 index 0000000..763e763 --- /dev/null +++ b/arch/x86/kvm/pmu.c @@ -0,0 +1,255 @@ +/* + * Kernel-based Virtual Machine -- Performance Monitoring Unit support + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@xxxxxxxxxx> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include "x86.h" +#include "pmu.h" +#include "lapic.h" + +static struct kvm_arch_event_perf_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; + bool inexact; +} arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +}; + +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + return NULL; +} + +static void __kvm_perf_overflow(struct irq_work *irq_work) +{ + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); + struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, arch.pmu); + + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} + +static void kvm_perf_overflow(struct perf_event *perf_event, + int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + + irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); +} + +static u64 read_gp_pmc(struct kvm_pmu *pmu, struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + + /* FIXME: Scaling needed? 
*/ + + return counter & pmu->counter_bitmask; +} + +static int reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc, + u64 eventsel) +{ + struct perf_event_attr attr = { }; + struct perf_event *event; + int i; + u8 event_select, unit_mask, cmask; + perf_overflow_handler_t callback = NULL; + bool inv; + + if (pmc->perf_event) { + pmc->counter = read_gp_pmc(pmu, pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + irq_work_sync(&pmu->irq_work); + pmc->eventsel = eventsel; + } + + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE)) + return 0; + + attr.type = PERF_TYPE_HARDWARE; + attr.size = sizeof(attr); + attr.exclude_idle = true; + + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + + for (i = 0; i < ARRAY_SIZE(arch_events); ++i) { + if (arch_events[i].eventsel == event_select + && arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) { + attr.config = arch_events[i].event_type; + break; + } + } + if (i == ARRAY_SIZE(arch_events)) + return 1; + + attr.exclude_user = !(eventsel & ARCH_PERFMON_EVENTSEL_USR); + attr.exclude_kernel = !(eventsel & ARCH_PERFMON_EVENTSEL_OS); + + if (eventsel & ARCH_PERFMON_EVENTSEL_EDGE) + printk_once("kvm: pmu ignoring edge bit\n"); + + if (eventsel & ARCH_PERFMON_EVENTSEL_INT) { + callback = kvm_perf_overflow; + attr.disabled = true; + } + + inv = eventsel & ARCH_PERFMON_EVENTSEL_INV; + cmask = (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + + pmc->eventsel = eventsel; + + if (inv || cmask > 1) { + printk_once("kvm: pmu ignoring difficult inv/cmask combo\n"); + return 0; + } + + attr.sample_period = (-pmc->counter) & pmu->counter_bitmask; + + event = perf_event_create_kernel_counter(&attr, -1, current, + callback, pmc); + if (IS_ERR(event)) + return PTR_ERR(event); + + if (callback) + perf_event_refresh(event, 1); + + pmc->perf_event = event; + return 0; +} + +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 
msr) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) + || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0); +} + +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) { + *data = read_gp_pmc(pmu, pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + return 1; +} + +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) { + data = (s64)(s32)data; + pmc->counter += data - read_gp_pmc(pmu, pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (data & 0xffffffff00200000ULL) + return 1; + return reprogram_gp_counter(pmu, pmc, data); + } + return 1; +} + +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fast_mode = pmc & (1u << 31); + u64 ctr; + + pmc &= (1u << 31) - 1; + if (pmc >= pmu->nr_arch_gp_counters) + return 1; + ctr = read_gp_pmc(pmu, &pmu->gp_counters[pmc]); + if (fast_mode) + ctr = (u32)ctr; + *data = ctr; + + return 0; +} + +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_cpuid_entry2 *entry; + unsigned bitmap_len; + + pmu->nr_arch_gp_counters = 0; + pmu->version = 0; + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + pmu->version = entry->eax & 0xff; + pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, + KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS); + pmu->counter_bitmask = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; + bitmap_len = (entry->eax >> 24) & 0xff; + pmu->available_event_types = ~entry->ebx & ((1ULL << bitmap_len) - 1); +} + +void kvm_pmu_init(struct 
kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + memset(pmu, 0, sizeof(*pmu)); + for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i) + pmu->gp_counters[i].vcpu = vcpu; + init_irq_work(&pmu->irq_work, __kvm_perf_overflow); + kvm_pmu_cpuid_update(vcpu); +} + +void kvm_pmu_destroy(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + int i; + + irq_work_sync(&pmu->irq_work); + for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i) { + pmc = &pmu->gp_counters[i]; + if (pmc->perf_event) + perf_event_release_kernel(pmc->perf_event); + } +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84f4607..258769f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -602,6 +602,8 @@ static void update_cpuid(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= bit(X86_FEATURE_OSXSAVE); } + + kvm_pmu_cpuid_update(vcpu); } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1571,8 +1573,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) * which we perfectly emulate ;-). Any other value should be at least * reported, some guests depend on them. */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: @@ -1584,8 +1584,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. 
*/ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: @@ -1622,6 +1620,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); @@ -1782,10 +1782,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: @@ -1887,6 +1883,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = 0xbe702111; break; default: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -6290,6 +6288,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); return 0; fail_free_mce_banks: @@ -6308,6 +6307,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; + kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); -- 1.7.5.3 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html