This patch uses the perf_event_create_kernel_counter() function to set up the event in the host OS. Will the event count also include the part spent in qemu-kvm userspace, and will an NMI caused by that part be delivered to the guest OS? Thanks. (A stripped-down sketch of how I read the setup path is at the end of this mail.)

2011/6/13 Avi Kivity <avi@xxxxxxxxxx>:
> Use perf_events to emulate an architectural PMU, version 1.
>
> Caveats:
> - counters that have PMI (interrupt) enabled stop counting after the
>   interrupt is signalled.  This is because we need one-shot samples
>   that keep counting, which perf doesn't support yet
> - some combinations of INV and CMASK are not supported
> - counters keep on counting in the host as well as the guest
>
> Signed-off-by: Avi Kivity <avi@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   29 +++++
>  arch/x86/kvm/Makefile           |    2 +-
>  arch/x86/kvm/pmu.c              |  255 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c              |   16 ++--
>  4 files changed, 293 insertions(+), 9 deletions(-)
>  create mode 100644 arch/x86/kvm/pmu.c
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fc38eca..86f49a2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -16,6 +16,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/tracepoint.h>
>  #include <linux/cpumask.h>
> +#include <linux/irq_work.h>
>
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> @@ -287,6 +288,24 @@ struct kvm_mmu {
>         u64 pdptrs[4]; /* pae */
>  };
>
> +#define KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS 4
> +
> +struct kvm_pmc {
> +       u64 counter;
> +       u64 eventsel;
> +       struct perf_event *perf_event;
> +       struct kvm_vcpu *vcpu;
> +};
> +
> +struct kvm_pmu {
> +       unsigned nr_arch_gp_counters;
> +       unsigned available_event_types;
> +       u64 counter_bitmask;
> +       u8 version;
> +       struct kvm_pmc gp_counters[KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS];
> +       struct irq_work irq_work;
> +};
> +
>  struct kvm_vcpu_arch {
>         /*
>          * rip and regs accesses must go through
> @@ -414,6 +433,8 @@ struct kvm_vcpu_arch {
>         u64 mcg_ctl;
>         u64 *mce_banks;
>
> +       struct kvm_pmu pmu;
> +
>         /* used for guest single stepping over the given code position */
>         unsigned long singlestep_rip;
>
> @@ -870,4 +891,12 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>
>  void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
>
> +void kvm_pmu_init(struct kvm_vcpu *vcpu);
> +void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
> +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
> +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
> +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
> +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..cfca03f 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
>
>  kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
> -                          i8254.o timer.o
> +                          i8254.o timer.o pmu.o
>  kvm-intel-y            += vmx.o
>  kvm-amd-y              += svm.o
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> new file mode 100644
> index 0000000..763e763
> --- /dev/null
> +++ b/arch/x86/kvm/pmu.c
> @@ -0,0 +1,255 @@
> +/*
> + * Kernel-based Virtual Machine -- Performane Monitoring Unit support
> + *
> + * Copyright 2011 Red Hat, Inc. and/or its affiliates.
> + *
> + * Authors:
> + *   Avi Kivity <avi@xxxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <linux/types.h>
> +#include <linux/kvm_host.h>
> +#include <linux/perf_event.h>
> +#include "x86.h"
> +#include "lapic.h"
> +
> +static struct kvm_arch_event_perf_mapping {
> +       u8 eventsel;
> +       u8 unit_mask;
> +       unsigned event_type;
> +       bool inexact;
> +} arch_events[] = {
> +       /* Index must match CPUID 0x0A.EBX bit vector */
> +       [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
> +       [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
> +       [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
> +       [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
> +       [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
> +       [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
> +       [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
> +};
> +
> +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
> +                                        u32 base)
> +{
> +       if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
> +               return &pmu->gp_counters[msr - base];
> +       return NULL;
> +}
> +
> +static void __kvm_perf_overflow(struct irq_work *irq_work)
> +{
> +       struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> +       struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, arch.pmu);
> +
> +       if (vcpu->arch.apic)
> +               kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
> +}
> +
> +static void kvm_perf_overflow(struct perf_event *perf_event,
> +                             int nmi,
> +                             struct perf_sample_data *data,
> +                             struct pt_regs *regs)
> +{
> +       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> +
> +       irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
> +}
> +
> +static u64 read_gp_pmc(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
> +{
> +       u64 counter, enabled, running;
> +
> +       counter = pmc->counter;
> +
> +       if (pmc->perf_event)
> +               counter += perf_event_read_value(pmc->perf_event,
> +                                                &enabled, &running);
> +
> +       /* FIXME: Scaling needed? */
> +
> +       return counter & pmu->counter_bitmask;
> +}
> +
> +static int reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
> +                               u64 eventsel)
> +{
> +       struct perf_event_attr attr = { };
> +       struct perf_event *event;
> +       int i;
> +       u8 event_select, unit_mask, cmask;
> +       perf_overflow_handler_t callback = NULL;
> +       bool inv;
> +
> +       if (pmc->perf_event) {
> +               pmc->counter = read_gp_pmc(pmu, pmc);
> +               perf_event_release_kernel(pmc->perf_event);
> +               pmc->perf_event = NULL;
> +               irq_work_sync(&pmu->irq_work);
> +               pmc->eventsel = eventsel;
> +       }
> +
> +       if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
> +               return 0;
> +
> +       attr.type = PERF_TYPE_HARDWARE;
> +       attr.size = sizeof(attr);
> +       attr.exclude_idle = true;
> +
> +       event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
> +       unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
> +
> +       for (i = 0; i < ARRAY_SIZE(arch_events); ++i) {
> +               if (arch_events[i].eventsel == event_select
> +                   && arch_events[i].unit_mask == unit_mask
> +                   && (pmu->available_event_types & (1 << i))) {
> +                       attr.config = arch_events[i].event_type;
> +                       break;
> +               }
> +       }
> +       if (i == ARRAY_SIZE(arch_events))
> +               return 1;
> +
> +       attr.exclude_user = !(eventsel & ARCH_PERFMON_EVENTSEL_USR);
> +       attr.exclude_kernel = !(eventsel & ARCH_PERFMON_EVENTSEL_OS);
> +
> +       if (eventsel & ARCH_PERFMON_EVENTSEL_EDGE)
> +               printk_once("kvm: pmu ignoring edge bit\n");
> +
> +       if (eventsel & ARCH_PERFMON_EVENTSEL_INT) {
> +               callback = kvm_perf_overflow;
> +               attr.disabled = true;
> +       }
> +
> +       inv = eventsel & ARCH_PERFMON_EVENTSEL_INV;
> +       cmask = (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
> +
> +       pmc->eventsel = eventsel;
> +
> +       if (inv || cmask > 1) {
> +               printk_once("kvm: pmu ignoring difficult inv/cmask combo\n");
> +               return 0;
> +       }
> +
> +       attr.sample_period = (-pmc->counter) & pmu->counter_bitmask;
> +
> +       event = perf_event_create_kernel_counter(&attr, -1, current,
> +                                                callback, pmc);
> +       if (IS_ERR(event))
> +               return PTR_ERR(event);
> +
> +       if (callback)
> +               perf_event_refresh(event, 1);
> +
> +       pmc->perf_event = event;
> +       return 0;
> +}
> +
> +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +       return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
> +               || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
> +}
> +
> +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +
> +       if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) {
> +               *data = read_gp_pmc(pmu, pmc);
> +               return 0;
> +       } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
> +               *data = pmc->eventsel;
> +               return 0;
> +       }
> +       return 1;
> +}
> +
> +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +
> +       if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) {
> +               data = (s64)(s32)data;
> +               pmc->counter += data - read_gp_pmc(pmu, pmc);
> +               return 0;
> +       } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
> +               if (data == pmc->eventsel)
> +                       return 0;
> +               if (data & 0xffffffff00200000ULL)
> +                       return 1;
> +               return reprogram_gp_counter(pmu, pmc, data);
> +       }
> +       return 1;
> +}
> +
> +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       bool fast_mode = pmc & (1u << 31);
> +       u64 ctr;
> +
> +       pmc &= (1u << 31) - 1;
> +       if (pmc >= pmu->nr_arch_gp_counters)
> +               return 1;
> +       ctr = read_gp_pmc(pmu, &pmu->gp_counters[pmc]);
> +       if (fast_mode)
> +               ctr = (u32)ctr;
> +       *data = ctr;
> +
> +       return 0;
> +}
> +
> +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_cpuid_entry2 *entry;
> +       unsigned bitmap_len;
> +
> +       pmu->nr_arch_gp_counters = 0;
> +       pmu->version = 0;
> +       entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
> +       if (!entry)
> +               return;
> +       pmu->version = entry->eax & 0xff;
> +       pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
> +                                      KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS);
> +       pmu->counter_bitmask = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
> +       bitmap_len = (entry->eax >> 24) & 0xff;
> +       pmu->available_event_types = ~entry->ebx & ((1ULL << bitmap_len) - 1);
> +}
> +
> +void kvm_pmu_init(struct kvm_vcpu *vcpu)
> +{
> +       int i;
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +       memset(pmu, 0, sizeof(*pmu));
> +       for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i)
> +               pmu->gp_counters[i].vcpu = vcpu;
> +       init_irq_work(&pmu->irq_work, __kvm_perf_overflow);
> +       kvm_pmu_cpuid_update(vcpu);
> +}
> +
> +void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +       int i;
> +
> +       irq_work_sync(&pmu->irq_work);
> +       for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i) {
> +               pmc = &pmu->gp_counters[i];
> +               if (pmc->perf_event)
> +                       perf_event_release_kernel(pmc->perf_event);
> +       }
> +}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 84f4607..258769f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -602,6 +602,8 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
>                 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
>                         best->ecx |= bit(X86_FEATURE_OSXSAVE);
>         }
> +
> +       kvm_pmu_cpuid_update(vcpu);
>  }
>
>  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> @@ -1571,8 +1573,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>          * which we perfectly emulate ;-). Any other value should be at least
>          * reported, some guests depend on them.
>          */
> -       case MSR_P6_EVNTSEL0:
> -       case MSR_P6_EVNTSEL1:
>         case MSR_K7_EVNTSEL0:
>         case MSR_K7_EVNTSEL1:
>         case MSR_K7_EVNTSEL2:
> @@ -1584,8 +1584,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>         /* at least RHEL 4 unconditionally writes to the perfctr registers,
>          * so we ignore writes to make it happy.
>          */
> -       case MSR_P6_PERFCTR0:
> -       case MSR_P6_PERFCTR1:
>         case MSR_K7_PERFCTR0:
>         case MSR_K7_PERFCTR1:
>         case MSR_K7_PERFCTR2:
> @@ -1622,6 +1620,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>         default:
>                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
>                         return xen_hvm_config(vcpu, data);
> +               if (kvm_pmu_msr(vcpu, msr))
> +                       return kvm_pmu_set_msr(vcpu, msr, data);
>                 if (!ignore_msrs) {
>                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
>                                   msr, data);
> @@ -1782,10 +1782,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>         case MSR_K8_SYSCFG:
>         case MSR_K7_HWCR:
>         case MSR_VM_HSAVE_PA:
> -       case MSR_P6_PERFCTR0:
> -       case MSR_P6_PERFCTR1:
> -       case MSR_P6_EVNTSEL0:
> -       case MSR_P6_EVNTSEL1:
>         case MSR_K7_EVNTSEL0:
>         case MSR_K7_PERFCTR0:
>         case MSR_K8_INT_PENDING_MSG:
> @@ -1887,6 +1883,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>                 data = 0xbe702111;
>                 break;
>         default:
> +               if (kvm_pmu_msr(vcpu, msr))
> +                       return kvm_pmu_get_msr(vcpu, msr, pdata);
>                 if (!ignore_msrs) {
>                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
>                         return 1;
> @@ -6290,6 +6288,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>                 goto fail_free_mce_banks;
>
>         kvm_async_pf_hash_reset(vcpu);
> +       kvm_pmu_init(vcpu);
>
>         return 0;
>  fail_free_mce_banks:
> @@ -6308,6 +6307,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  {
>         int idx;
>
> +       kvm_pmu_destroy(vcpu);
>         kfree(vcpu->arch.mce_banks);
>         kvm_free_lapic(vcpu);
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
> --
> 1.7.5.3
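
For reference, here is how I read the setup path, reduced to the fields that decide what gets counted. This is only my paraphrase of reprogram_gp_counter() above, not part of the patch; the helper name kvm_measure_guest_cycles() and the hard-coded cycles event are made up for illustration:

#include <linux/perf_event.h>
#include <linux/sched.h>

/*
 * My paraphrase of the patch's counter setup, NOT part of the patch.
 * The event is bound to "current" (the qemu-kvm vcpu thread) with
 * cpu == -1, so it follows that thread wherever it is scheduled.
 * The arch_events[] lookup is replaced by a hard-coded cycles event
 * and the overflow/NMI handling is left out to keep it short.
 */
static struct perf_event *kvm_measure_guest_cycles(u64 guest_eventsel)
{
        struct perf_event_attr attr = { };

        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.exclude_idle = true;

        /*
         * Guest asks for ring 0 counting -> host event also counts
         * host kernel mode; guest asks for ring 3 counting -> host
         * event also counts host user mode, i.e. qemu-kvm itself,
         * which is exactly what my question is about.
         */
        attr.exclude_kernel = !(guest_eventsel & ARCH_PERFMON_EVENTSEL_OS);
        attr.exclude_user = !(guest_eventsel & ARCH_PERFMON_EVENTSEL_USR);

        /* task-bound, any cpu, no overflow callback in this sketch */
        return perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
}

If I read this right, with the USR bit set in the guest's EVNTSEL the cycles the vcpu thread spends in qemu-kvm's own user-mode code seem to end up in the same counter, and an overflow taken there would still be forwarded to the guest through the LVTPC path, which is what prompted the question above.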