This patch uses the perf_event_create_kernel_counter() function to set up the event in the host OS. Will the event count also include the part spent in qemu-kvm userspace, and will an NMI caused by that part be delivered to the guest OS? Thanks. (A stripped-down sketch of how I read the setup path is at the end of this mail.)

2011/6/13 Avi Kivity <avi@xxxxxxxxxx>:
> Use perf_events to emulate an architectural PMU, version 1.
>
> Caveats:
> - counters that have PMI (interrupt) enabled stop counting after the
>   interrupt is signalled.  This is because we need one-shot samples
>   that keep counting, which perf doesn't support yet
> - some combinations of INV and CMASK are not supported
> - counters keep on counting in the host as well as the guest
>
> Signed-off-by: Avi Kivity <avi@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   29 +++++
>  arch/x86/kvm/Makefile           |    2 +-
>  arch/x86/kvm/pmu.c              |  255 +++++++++++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c              |   16 ++--
>  4 files changed, 293 insertions(+), 9 deletions(-)
>  create mode 100644 arch/x86/kvm/pmu.c
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fc38eca..86f49a2 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -16,6 +16,7 @@
>  #include <linux/mmu_notifier.h>
>  #include <linux/tracepoint.h>
>  #include <linux/cpumask.h>
> +#include <linux/irq_work.h>
>
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> @@ -287,6 +288,24 @@ struct kvm_mmu {
>         u64 pdptrs[4]; /* pae */
>  };
>
> +#define KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS 4
> +
> +struct kvm_pmc {
> +       u64 counter;
> +       u64 eventsel;
> +       struct perf_event *perf_event;
> +       struct kvm_vcpu *vcpu;
> +};
> +
> +struct kvm_pmu {
> +       unsigned nr_arch_gp_counters;
> +       unsigned available_event_types;
> +       u64 counter_bitmask;
> +       u8 version;
> +       struct kvm_pmc gp_counters[KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS];
> +       struct irq_work irq_work;
> +};
> +
>  struct kvm_vcpu_arch {
>         /*
>          * rip and regs accesses must go through
> @@ -414,6 +433,8 @@ struct kvm_vcpu_arch {
>         u64 mcg_ctl;
>         u64 *mce_banks;
>
> +       struct kvm_pmu pmu;
> +
>         /* used for guest single stepping over the given code position */
>         unsigned long singlestep_rip;
>
> @@ -870,4 +891,12 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>
>  void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
>
> +void kvm_pmu_init(struct kvm_vcpu *vcpu);
> +void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
> +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
> +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
> +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
> +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..cfca03f 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
>
>  kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
> -                          i8254.o timer.o
> +                          i8254.o timer.o pmu.o
>  kvm-intel-y            += vmx.o
>  kvm-amd-y              += svm.o
>
> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
> new file mode 100644
> index 0000000..763e763
> --- /dev/null
> +++ b/arch/x86/kvm/pmu.c
> @@ -0,0 +1,255 @@
> +/*
> + * Kernel-based Virtual Machine -- Performane Monitoring Unit support
> + *
> + * Copyright 2011 Red Hat, Inc. and/or its affiliates.
> + *
> + * Authors:
> + *   Avi Kivity <avi@xxxxxxxxxx>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <linux/types.h>
> +#include <linux/kvm_host.h>
> +#include <linux/perf_event.h>
> +#include "x86.h"
> +#include "lapic.h"
> +
> +static struct kvm_arch_event_perf_mapping {
> +       u8 eventsel;
> +       u8 unit_mask;
> +       unsigned event_type;
> +       bool inexact;
> +} arch_events[] = {
> +       /* Index must match CPUID 0x0A.EBX bit vector */
> +       [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
> +       [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
> +       [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
> +       [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
> +       [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
> +       [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
> +       [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
> +};
> +
> +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
> +                                        u32 base)
> +{
> +       if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
> +               return &pmu->gp_counters[msr - base];
> +       return NULL;
> +}
> +
> +static void __kvm_perf_overflow(struct irq_work *irq_work)
> +{
> +       struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
> +       struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, arch.pmu);
> +
> +       if (vcpu->arch.apic)
> +               kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
> +}
> +
> +static void kvm_perf_overflow(struct perf_event *perf_event,
> +                             int nmi,
> +                             struct perf_sample_data *data,
> +                             struct pt_regs *regs)
> +{
> +       struct kvm_pmc *pmc = perf_event->overflow_handler_context;
> +
> +       irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
> +}
> +
> +static u64 read_gp_pmc(struct kvm_pmu *pmu, struct kvm_pmc *pmc)
> +{
> +       u64 counter, enabled, running;
> +
> +       counter = pmc->counter;
> +
> +       if (pmc->perf_event)
> +               counter += perf_event_read_value(pmc->perf_event,
> +                                                &enabled, &running);
> +
> +       /* FIXME: Scaling needed? */
> +
> +       return counter & pmu->counter_bitmask;
> +}
> +
> +static int reprogram_gp_counter(struct kvm_pmu *pmu, struct kvm_pmc *pmc,
> +                               u64 eventsel)
> +{
> +       struct perf_event_attr attr = { };
> +       struct perf_event *event;
> +       int i;
> +       u8 event_select, unit_mask, cmask;
> +       perf_overflow_handler_t callback = NULL;
> +       bool inv;
> +
> +       if (pmc->perf_event) {
> +               pmc->counter = read_gp_pmc(pmu, pmc);
> +               perf_event_release_kernel(pmc->perf_event);
> +               pmc->perf_event = NULL;
> +               irq_work_sync(&pmu->irq_work);
> +               pmc->eventsel = eventsel;
> +       }
> +
> +       if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE))
> +               return 0;
> +
> +       attr.type = PERF_TYPE_HARDWARE;
> +       attr.size = sizeof(attr);
> +       attr.exclude_idle = true;
> +
> +       event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
> +       unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
> +
> +       for (i = 0; i < ARRAY_SIZE(arch_events); ++i) {
> +               if (arch_events[i].eventsel == event_select
> +                   && arch_events[i].unit_mask == unit_mask
> +                   && (pmu->available_event_types & (1 << i))) {
> +                       attr.config = arch_events[i].event_type;
> +                       break;
> +               }
> +       }
> +       if (i == ARRAY_SIZE(arch_events))
> +               return 1;
> +
> +       attr.exclude_user = !(eventsel & ARCH_PERFMON_EVENTSEL_USR);
> +       attr.exclude_kernel = !(eventsel & ARCH_PERFMON_EVENTSEL_OS);
> +
> +       if (eventsel & ARCH_PERFMON_EVENTSEL_EDGE)
> +               printk_once("kvm: pmu ignoring edge bit\n");
> +
> +       if (eventsel & ARCH_PERFMON_EVENTSEL_INT) {
> +               callback = kvm_perf_overflow;
> +               attr.disabled = true;
> +       }
> +
> +       inv = eventsel & ARCH_PERFMON_EVENTSEL_INV;
> +       cmask = (eventsel & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
> +
> +       pmc->eventsel = eventsel;
> +
> +       if (inv || cmask > 1) {
> +               printk_once("kvm: pmu ignoring difficult inv/cmask combo\n");
> +               return 0;
> +       }
> +
> +       attr.sample_period = (-pmc->counter) & pmu->counter_bitmask;
> +
> +       event = perf_event_create_kernel_counter(&attr, -1, current,
> +                                                callback, pmc);
> +       if (IS_ERR(event))
> +               return PTR_ERR(event);
> +
> +       if (callback)
> +               perf_event_refresh(event, 1);
> +
> +       pmc->perf_event = event;
> +       return 0;
> +}
> +
> +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +       return get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
> +               || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
> +}
> +
> +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +
> +       if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) {
> +               *data = read_gp_pmc(pmu, pmc);
> +               return 0;
> +       } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
> +               *data = pmc->eventsel;
> +               return 0;
> +       }
> +       return 1;
> +}
> +
> +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +
> +       if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0))) {
> +               data = (s64)(s32)data;
> +               pmc->counter += data - read_gp_pmc(pmu, pmc);
> +               return 0;
> +       } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
> +               if (data == pmc->eventsel)
> +                       return 0;
> +               if (data & 0xffffffff00200000ULL)
> +                       return 1;
> +               return reprogram_gp_counter(pmu, pmc, data);
> +       }
> +       return 1;
> +}
> +
> +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       bool fast_mode = pmc & (1u << 31);
> +       u64 ctr;
> +
> +       pmc &= (1u << 31) - 1;
> +       if (pmc >= pmu->nr_arch_gp_counters)
> +               return 1;
> +       ctr = read_gp_pmc(pmu, &pmu->gp_counters[pmc]);
> +       if (fast_mode)
> +               ctr = (u32)ctr;
> +       *data = ctr;
> +
> +       return 0;
> +}
> +
> +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_cpuid_entry2 *entry;
> +       unsigned bitmap_len;
> +
> +       pmu->nr_arch_gp_counters = 0;
> +       pmu->version = 0;
> +       entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
> +       if (!entry)
> +               return;
> +       pmu->version = entry->eax & 0xff;
> +       pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
> +                                      KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS);
> +       pmu->counter_bitmask = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
> +       bitmap_len = (entry->eax >> 24) & 0xff;
> +       pmu->available_event_types = ~entry->ebx & ((1ULL << bitmap_len) - 1);
> +}
> +
> +void kvm_pmu_init(struct kvm_vcpu *vcpu)
> +{
> +       int i;
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +
> +       memset(pmu, 0, sizeof(*pmu));
> +       for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i)
> +               pmu->gp_counters[i].vcpu = vcpu;
> +       init_irq_work(&pmu->irq_work, __kvm_perf_overflow);
> +       kvm_pmu_cpuid_update(vcpu);
> +}
> +
> +void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
> +{
> +       struct kvm_pmu *pmu = &vcpu->arch.pmu;
> +       struct kvm_pmc *pmc;
> +       int i;
> +
> +       irq_work_sync(&pmu->irq_work);
> +       for (i = 0; i < KVM_PMU_MAX_GENERAL_PURPOSE_COUNTERS; ++i) {
> +               pmc = &pmu->gp_counters[i];
> +               if (pmc->perf_event)
> +                       perf_event_release_kernel(pmc->perf_event);
> +       }
> +}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 84f4607..258769f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -602,6 +602,8 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
>                 if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
>                         best->ecx |= bit(X86_FEATURE_OSXSAVE);
>         }
> +
> +       kvm_pmu_cpuid_update(vcpu);
>  }
>
>  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> @@ -1571,8 +1573,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>          * which we perfectly emulate ;-). Any other value should be at least
>          * reported, some guests depend on them.
>          */
> -       case MSR_P6_EVNTSEL0:
> -       case MSR_P6_EVNTSEL1:
>         case MSR_K7_EVNTSEL0:
>         case MSR_K7_EVNTSEL1:
>         case MSR_K7_EVNTSEL2:
> @@ -1584,8 +1584,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>         /* at least RHEL 4 unconditionally writes to the perfctr registers,
>          * so we ignore writes to make it happy.
>          */
> -       case MSR_P6_PERFCTR0:
> -       case MSR_P6_PERFCTR1:
>         case MSR_K7_PERFCTR0:
>         case MSR_K7_PERFCTR1:
>         case MSR_K7_PERFCTR2:
> @@ -1622,6 +1620,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
>         default:
>                 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
>                         return xen_hvm_config(vcpu, data);
> +               if (kvm_pmu_msr(vcpu, msr))
> +                       return kvm_pmu_set_msr(vcpu, msr, data);
>                 if (!ignore_msrs) {
>                         pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
>                                   msr, data);
> @@ -1782,10 +1782,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>         case MSR_K8_SYSCFG:
>         case MSR_K7_HWCR:
>         case MSR_VM_HSAVE_PA:
> -       case MSR_P6_PERFCTR0:
> -       case MSR_P6_PERFCTR1:
> -       case MSR_P6_EVNTSEL0:
> -       case MSR_P6_EVNTSEL1:
>         case MSR_K7_EVNTSEL0:
>         case MSR_K7_PERFCTR0:
>         case MSR_K8_INT_PENDING_MSG:
> @@ -1887,6 +1883,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>                 data = 0xbe702111;
>                 break;
>         default:
> +               if (kvm_pmu_msr(vcpu, msr))
> +                       return kvm_pmu_get_msr(vcpu, msr, pdata);
>                 if (!ignore_msrs) {
>                         pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
>                         return 1;
> @@ -6290,6 +6288,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>                 goto fail_free_mce_banks;
>
>         kvm_async_pf_hash_reset(vcpu);
> +       kvm_pmu_init(vcpu);
>
>         return 0;
>  fail_free_mce_banks:
> @@ -6308,6 +6307,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
>  {
>         int idx;
>
> +       kvm_pmu_destroy(vcpu);
>         kfree(vcpu->arch.mce_banks);
>         kvm_free_lapic(vcpu);
>         idx = srcu_read_lock(&vcpu->kvm->srcu);
> --
> 1.7.5.3
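
For reference, here is how I read the setup path, reduced to the fields that decide what gets counted. This is only my paraphrase of reprogram_gp_counter() above, not part of the patch; the helper name kvm_measure_guest_cycles() and the hard-coded cycles event are made up for illustration:

#include <linux/perf_event.h>
#include <linux/sched.h>

/*
 * My paraphrase of the patch's counter setup, NOT part of the patch.
 * The event is bound to "current" (the qemu-kvm vcpu thread) with
 * cpu == -1, so it follows that thread wherever it is scheduled.
 * The arch_events[] lookup is replaced by a hard-coded cycles event
 * and the overflow/NMI handling is left out to keep it short.
 */
static struct perf_event *kvm_measure_guest_cycles(u64 guest_eventsel)
{
        struct perf_event_attr attr = { };

        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.exclude_idle = true;

        /*
         * Guest asks for ring 0 counting -> host event also counts
         * host kernel mode; guest asks for ring 3 counting -> host
         * event also counts host user mode, i.e. qemu-kvm itself,
         * which is exactly what my question is about.
         */
        attr.exclude_kernel = !(guest_eventsel & ARCH_PERFMON_EVENTSEL_OS);
        attr.exclude_user = !(guest_eventsel & ARCH_PERFMON_EVENTSEL_USR);

        /* task-bound, any cpu, no overflow callback in this sketch */
        return perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
}

If I read this right, with the USR bit set in the guest's EVNTSEL the cycles the vcpu thread spends in qemu-kvm's own user-mode code seem to end up in the same counter, and an overflow taken there would still be forwarded to the guest through the LVTPC path, which is what prompted the question above.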