Hi,

We have some code for performance profiling in KVM; it is the output of a school project. Previous discussions on the KVM, Perfmon2, and Xen mailing lists helped us a lot. The code is NOT in good shape and is only meant to demonstrate the feasibility of performance profiling in KVM. Feel free to use it if you want.

We categorize performance profiling in a virtualized environment into two types: *guest-wide profiling* and *system-wide profiling*. With guest-wide profiling, only the guest is profiled: KVM virtualizes the PMU and the user runs a profiler directly in the guest. It requires no modifications to the guest OS or to the profiler running in the guest. With system-wide profiling, both KVM and the guest OS are profiled, and the results are similar to what XenOprof outputs. In this case, one profiler runs in the host and one profiler runs in the guest. Again, it requires no modifications to the guest or to the profiler running in it.

For guest-wide profiling, there are two possible places to save and restore the related MSRs. One is where the CPU switches between guest mode and host mode; we call this *CPU-switch*. Profiling with this enabled reflects how the guest behaves on the physical CPU, plus on other virtualized (not emulated) devices. The other place is where the CPU switches between the KVM context and everything else, where "KVM context" means the CPU is executing guest code or KVM code, in both kernel space and user space; we call this *domain-switch*. Profiling with this enabled discloses how the guest behaves on the physical CPU and inside KVM. (Some emulated operations are really expensive in a virtualized environment.)

More details can be found at http://jiaqing.org/download/profiling_kvm.tgz
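
As a reading aid before the patches: the sketch below shows, in heavily simplified form, what a sampling profiler inside the guest effectively does once the P6 PMU MSRs are backed by real hardware. It programs an event into MSR_P6_EVNTSEL0, preloads MSR_P6_PERFCTR0 with a negative sampling period, and asks the local APIC to deliver counter overflows as NMIs, which is exactly the traffic the patches below have to pass through and reflect back into the guest. This is an illustration only and is not part of the patches; the event encoding (0x3c, the architectural "unhalted core cycles" event) and the period are example values, and a real profiler such as OProfile or perf does the equivalent through its own driver.

/*
 * Illustration only: roughly what a guest-side sampling profiler does once
 * MSR_P6_EVNTSEL0/MSR_P6_PERFCTR0 reach the real PMU. Event 0x3c and the
 * sampling period are example values, not something the patches require.
 */
#define EVNTSEL_ENABLE  (1ULL << 22)    /* EN: start counting              */
#define EVNTSEL_INT     (1ULL << 20)    /* raise an interrupt on overflow  */
#define EVNTSEL_OS      (1ULL << 17)    /* count in ring 0                 */
#define EVNTSEL_USR     (1ULL << 16)    /* count in ring 3                 */
#define CYCLES_EVENT    0x3cULL         /* unhalted core cycles            */
#define SAMPLE_PERIOD   100000ULL

static void guest_start_sampling(void)
{
        /* deliver counter overflows as NMIs via the local APIC */
        apic_write(APIC_LVTPC, APIC_DM_NMI);

        /* the counter counts up and overflows after SAMPLE_PERIOD events */
        wrmsrl(MSR_P6_PERFCTR0, (u64)-(s64)SAMPLE_PERIOD);

        wrmsrl(MSR_P6_EVNTSEL0, CYCLES_EVENT | EVNTSEL_USR | EVNTSEL_OS |
                                EVNTSEL_INT | EVNTSEL_ENABLE);
}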
=============Guest-wide profiling with domain-switch, for Linux-2.6.32==================

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2..b749b5d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -96,6 +96,7 @@ struct thread_info {
 #define TIF_DS_AREA_MSR         26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES    27      /* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT  28      /* syscall tracepoint instrumentation */
+#define TIF_VPMU_CTXSW          29      /* KVM thread tag */
 
 #define _TIF_SYSCALL_TRACE      (1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME      (1 << TIF_NOTIFY_RESUME)
@@ -119,6 +120,7 @@ struct thread_info {
 #define _TIF_DS_AREA_MSR        (1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES   (1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VPMU_CTXSW         (1 << TIF_VPMU_CTXSW)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY \
@@ -146,8 +148,9 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW \
-        (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
-
+        (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC| \
+         _TIF_VPMU_CTXSW)
+
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2..d5269d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -178,6 +178,53 @@ int set_tsc_mode(unsigned int val)
         return 0;
 }
 
+static const u32 vmx_pmu_msr_index[] = {
+        MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+static u64 vpmu_msr_list[NR_VMX_PMU_MSR];
+
+static void vpmu_load_msrs(u64 *msr_list)
+{
+        u64 *p = msr_list;
+        int i;
+
+        for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+                wrmsrl(vmx_pmu_msr_index[i], *p);
+                p++;
+        }
+}
+
+static void vpmu_save_msrs(u64 *msr_list)
+{
+        u64 *p = msr_list;
+        int i;
+
+        for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+                rdmsrl(vmx_pmu_msr_index[i], *p);
+                p++;
+        }
+}
+
+#define P6_EVENTSEL0_ENABLE (1 << 22)
+static void enable_perf(void)
+{
+        u64 val;
+
+        rdmsrl(MSR_P6_EVNTSEL0, val);
+        val |= P6_EVENTSEL0_ENABLE;
+        wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
+static void disable_perf(void)
+{
+        u64 val;
+
+        rdmsrl(MSR_P6_EVNTSEL0, val);
+        val &= ~P6_EVENTSEL0_ENABLE;
+        wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                       struct tss_struct *tss)
 {
@@ -186,6 +233,21 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
         prev = &prev_p->thread;
         next = &next_p->thread;
 
+        if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW) &&
+            test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+                /* do nothing, still in KVM context */
+        } else {
+                if (test_tsk_thread_flag(prev_p, TIF_VPMU_CTXSW)) {
+                        disable_perf();
+                        vpmu_save_msrs(vpmu_msr_list);
+                }
+
+                if (test_tsk_thread_flag(next_p, TIF_VPMU_CTXSW)) {
+                        vpmu_load_msrs(vpmu_msr_list);
+                        enable_perf();
+                }
+        }
+
         if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
             test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
                 ds_switch_to(prev_p, next_p);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42..4f4ff86 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <linux/kdebug.h>
 
 #include "trace.h"
 
@@ -127,6 +128,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
@@ -3603,6 +3605,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+        int cpu = raw_smp_processor_id();
 
         if (enable_ept && is_paging(vcpu)) {
                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3642,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         if (vcpu->arch.switch_db_regs)
                 set_debugreg(vcpu->arch.dr6, 6);
 
+        /* record the exited vcpu */
+        per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
         asm(
                 /* Store host registers */
                 "push %%"R"dx; push %%"R"bp;"
@@ -3985,6 +3991,43 @@ static struct kvm_x86_ops vmx_x86_ops = {
         .gb_page_enable = vmx_gb_page_enable,
 };
 
+static void guest_set_apic(void *info)
+{
+        unsigned int v;
+
+        v = apic_read(APIC_LVTERR);
+        apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+        apic_write(APIC_LVTPC, APIC_DM_NMI);
+        apic_write(APIC_LVTERR, v);
+}
+
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+        int cpu = raw_smp_processor_id();
+        struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, cpu);
+        int ret = NOTIFY_DONE;
+
+        switch (val) {
+        case DIE_NMI:
+        case DIE_NMI_IPI:
+                guest_set_apic(NULL);
+                vcpu->cntr_overflow = 1;
+                vcpu->nmi_nr++;
+                ret = NOTIFY_STOP;
+                break;
+        default:
+                break;
+        }
+        return ret;
+}
+
+static struct notifier_block vmx_vcpu_nb = {
+        .notifier_call = vmx_vcpu_nmi_notify,
+        .next = NULL,
+        .priority = 3
+};
+
 static int __init vmx_init(void)
 {
         int r;
@@ -4036,6 +4079,17 @@ static int __init vmx_init(void)
         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
+        vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+        vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+        vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+        vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+        if (register_die_notifier(&vmx_vcpu_nb)) {
+                printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+        } else {
+                printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+        }
+
         if (enable_ept) {
                 bypass_guest_pf = 0;
                 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4125,9 @@ static void __exit vmx_exit(void)
         free_page((unsigned long)vmx_io_bitmap_b);
         free_page((unsigned long)vmx_io_bitmap_a);
 
+        unregister_die_notifier(&vmx_vcpu_nb);
+        printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
         kvm_exit();
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d26..1abedb4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 goto out;
         }
 
+        if (vcpu->cntr_overflow) {
+                vcpu->arch.nmi_pending = 1;
+                vcpu->cntr_overflow = 0;
+        }
+
         inject_pending_event(vcpu, kvm_run);
 
         /* enable NMI/IRQ window open exits if needed */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5d..96d63d1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -99,6 +99,9 @@ struct kvm_vcpu {
         gpa_t mmio_phys_addr;
 #endif
 
+        int cntr_overflow;
+        int nmi_nr;
+
         struct kvm_vcpu_arch arch;
 };
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index cf24c20..b0942c1 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -225,6 +225,9 @@ extern int flush_work(struct work_struct *work);
 
 extern int cancel_work_sync(struct work_struct *work);
 
+extern struct task_struct *thread_of_workqueue(struct workqueue_struct *wq,
+                                               int cpu);
+
 /*
  * Kill off a pending schedule_delayed_work().  Note that the work callback
  * function may still be running on return from cancel_delayed_work(), unless
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b..5eb9503 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -150,6 +150,15 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
         spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
+struct task_struct *thread_of_workqueue(struct workqueue_struct *wq,
+                                        int cpu)
+{
+        struct cpu_workqueue_struct *cwq = wq_per_cpu(wq, cpu);
+
+        return cwq->thread;
+}
+EXPORT_SYMBOL_GPL(thread_of_workqueue);
+
 /**
  * queue_work - queue work on a workqueue
  * @wq: workqueue to use
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index bb4ebd8..33b5da8 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -318,10 +318,18 @@ kvm_irqfd_release(struct kvm *kvm)
  */
 static int __init irqfd_module_init(void)
 {
+        int cpu = raw_smp_processor_id();
+        struct task_struct *thread;
+
         irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
         if (!irqfd_cleanup_wq)
                 return -ENOMEM;
 
+        thread = thread_of_workqueue(irqfd_cleanup_wq, cpu);
+        set_tsk_thread_flag(thread, TIF_VPMU_CTXSW);
+        printk(KERN_ALERT "[hw_vpmu]: monitored irqfd thread id = %d\n",
+               (int)thread->pid);
+
         return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7495ce3..355bff5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1809,6 +1809,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
         kvm->bsp_vcpu = vcpu;
 #endif
         mutex_unlock(&kvm->lock);
+
+        set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+        printk(KERN_ALERT "[hw_vpmu]: monitored vcpu thread id = %d\n",
+               (int)current->pid);
+
         return r;
 
 vcpu_destroy:
@@ -2360,6 +2365,10 @@ static int kvm_dev_ioctl_create_vm(void)
         if (fd < 0)
                 kvm_put_kvm(kvm);
 
+        set_tsk_thread_flag(current, TIF_VPMU_CTXSW);
+        printk(KERN_ALERT "[hw_vpmu]: monitored main thread id = %d\n",
+               (int)current->pid);
+
         return fd;
 }

=============Guest-wide profiling with cpu-switch, for Linux-2.6.32==================
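
Unlike the domain-switch patch above, the cpu-switch variant does not touch the context-switch path at all. It points the VMCS VM-entry MSR-load area and the VM-exit MSR-store/MSR-load areas at two per-vCPU pages, so the hardware itself stores the guest counters and loads the host counters on every VM exit, and reloads the guest counters on VM entry. The patch fills those pages by stepping a u32 pointer two slots at a time; each 16-byte slot has the layout sketched below (taken from the Intel SDM description of the MSR areas; the struct is only for illustration and does not appear in the patch).

/*
 * Layout of one entry in the VMX MSR-load/store areas, i.e. what the
 * "p += 2" steps in vmx_create_vpmu_msrs()/save_host_msrs() walk over.
 */
struct vmx_msr_area_entry {
        u32 index;      /* MSR number, e.g. MSR_P6_EVNTSEL0                   */
        u32 reserved;   /* must be zero                                       */
        u64 value;      /* stored here on VM exit / loaded from here on entry */
};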

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42..970b5ab 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -34,6 +34,7 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <linux/kdebug.h>
 
 #include "trace.h"
 
@@ -114,6 +115,9 @@ struct vcpu_vmx {
         ktime_t entry_time;
         s64 vnmi_blocked_time;
         u32 exit_reason;
+
+        unsigned long *msr_host_load_store;
+        unsigned long *msr_guest_load_store;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -127,12 +131,18 @@ static u64 construct_eptp(unsigned long root_hpa);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct kvm_vcpu *, cur_exit_vcpu);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 
+static const u32 vmx_pmu_msr_index[] = {
+        MSR_P6_EVNTSEL0, MSR_P6_EVNTSEL1, MSR_P6_PERFCTR0, MSR_P6_PERFCTR1,
+};
+#define NR_VMX_PMU_MSR ARRAY_SIZE(vmx_pmu_msr_index)
+
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
 
@@ -2272,6 +2282,14 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
+        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+        vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR,
+                     __pa(vmx->msr_guest_load_store));
+
+        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, NR_VMX_PMU_MSR);
+        vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_guest_load_store));
+        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, NR_VMX_PMU_MSR);
+        vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_host_load_store));
+
         if (cpu_has_vmx_msr_bitmap())
                 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
 
@@ -2340,9 +2358,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
         asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
         vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 
-        vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
-        vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
-        vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 
         rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -3600,9 +3615,34 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 #define Q "l"
 #endif
 
+static void guest_set_apic(void *info)
+{
+        unsigned int v;
+
+        v = apic_read(APIC_LVTERR);
+        apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
+        apic_write(APIC_LVTPC, APIC_DM_NMI);
+        apic_write(APIC_LVTERR, v);
+}
+
+static void save_host_msrs(struct vcpu_vmx *vmx)
+{
+        u32 *p;
+        int i;
+
+        p = (u32 *)vmx->msr_host_load_store;
+        for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+                *p = vmx_pmu_msr_index[i];
+                p += 2;
+                rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+                p += 2;
+        }
+}
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
+        int cpu = raw_smp_processor_id();
 
         if (enable_ept && is_paging(vcpu)) {
                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
@@ -3639,6 +3679,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         if (vcpu->arch.switch_db_regs)
                 set_debugreg(vcpu->arch.dr6, 6);
 
+        /* record the exited vcpu */
+        per_cpu(cur_exit_vcpu, cpu) = vcpu;
+
+        /* The guest counters are reloaded by the hardware later. */
+        save_host_msrs(vmx);
+
         asm(
                 /* Store host registers */
                 "push %%"R"dx; push %%"R"bp;"
@@ -3750,6 +3796,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
         vmx->launched = 1;
 
         vmx_complete_interrupts(vmx);
+
+        /* always clear LVTPC bit */
+        guest_set_apic(NULL);
+
 }
 
 #undef R
@@ -3766,6 +3816,59 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
         }
 }
 
+static int vmx_create_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+        int i, r = 0;
+        u32 *p;
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        vmx->msr_host_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+        if (!vmx->msr_host_load_store)
+                return -ENOMEM;
+
+        vmx->msr_guest_load_store = (unsigned long *)__get_free_page(GFP_KERNEL);
+        if (!vmx->msr_guest_load_store) {
+                r = -ENOMEM;
+                goto free_msr_host;
+        }
+
+        memset(vmx->msr_host_load_store, 0x00, PAGE_SIZE);
+        memset(vmx->msr_guest_load_store, 0x00, PAGE_SIZE);
+
+        /* Initialize the load/store memory areas. Use the contents of the
+         * host MSRs as initial values.
+         */
+        p = (u32 *)vmx->msr_host_load_store;
+        for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+                *p = vmx_pmu_msr_index[i];
+                p += 2;
+                rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+                p += 2;
+        }
+
+        p = (u32 *)vmx->msr_guest_load_store;
+        for (i = 0; i < NR_VMX_PMU_MSR; ++i) {
+                *p = vmx_pmu_msr_index[i];
+                p += 2;
+                rdmsrl(vmx_pmu_msr_index[i], *((u64 *)p));
+                p += 2;
+        }
+
+        return r;
+
+free_msr_host:
+        free_page((unsigned long)vmx->msr_host_load_store);
+        return r;
+}
+
+static void vmx_free_vpmu_msrs(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        free_page((unsigned long)vmx->msr_host_load_store);
+        free_page((unsigned long)vmx->msr_guest_load_store);
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
         struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3777,6 +3880,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
         vmx_free_vmcs(vcpu);
         kfree(vmx->host_msrs);
         kfree(vmx->guest_msrs);
+        vmx_free_vpmu_msrs(vcpu);
         kvm_vcpu_uninit(vcpu);
         kmem_cache_free(kvm_vcpu_cache, vmx);
 }
@@ -3812,6 +3916,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 
         vmcs_clear(vmx->vmcs);
 
+        if (vmx_create_vpmu_msrs(&vmx->vcpu))
+                goto free_vmcs;
+
         cpu = get_cpu();
         vmx_vcpu_load(&vmx->vcpu, cpu);
         err = vmx_vcpu_setup(vmx);
@@ -3985,6 +4092,33 @@ static struct kvm_x86_ops vmx_x86_ops = {
         .gb_page_enable = vmx_gb_page_enable,
 };
 
+static int vmx_vcpu_nmi_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+        int cpu = raw_smp_processor_id();
+        struct kvm_vcpu *vcpu = per_cpu(cur_exit_vcpu, cpu);
+        int ret = NOTIFY_DONE;
+
+        switch (val) {
+        case DIE_NMI:
+        case DIE_NMI_IPI:
+                guest_set_apic(NULL);
+                vcpu->cntr_overflow = 1;
+                vcpu->nmi_nr++;
+                ret = NOTIFY_STOP;
+                break;
+        default:
+                break;
+        }
+        return ret;
+}
+
+static struct notifier_block vmx_vcpu_nb = {
+        .notifier_call = vmx_vcpu_nmi_notify,
+        .next = NULL,
+        .priority = 3
+};
+
 static int __init vmx_init(void)
 {
         int r;
@@ -4036,6 +4170,17 @@ static int __init vmx_init(void)
         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
+        vmx_disable_intercept_for_msr(MSR_P6_PERFCTR0, false);
+        vmx_disable_intercept_for_msr(MSR_P6_PERFCTR1, false);
+        vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL0, false);
+        vmx_disable_intercept_for_msr(MSR_P6_EVNTSEL1, false);
+
+        if (register_die_notifier(&vmx_vcpu_nb)) {
+                printk(KERN_ALERT "[hw_vpmu]: Register NMI handler failed..\n");
+        } else {
+                printk(KERN_ALERT "[hw_vpmu]: Register NMI handler succeeded..\n");
+        }
+
         if (enable_ept) {
                 bypass_guest_pf = 0;
                 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
@@ -4071,6 +4216,9 @@ static void __exit vmx_exit(void)
         free_page((unsigned long)vmx_io_bitmap_b);
         free_page((unsigned long)vmx_io_bitmap_a);
 
+        unregister_die_notifier(&vmx_vcpu_nb);
+        printk(KERN_ALERT "[hw_vpmu]: Remove NMI handler module..\n");
+
         kvm_exit();
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d26..1abedb4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3615,6 +3615,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                 goto out;
         }
 
+        if (vcpu->cntr_overflow) {
+                vcpu->arch.nmi_pending = 1;
+                vcpu->cntr_overflow = 0;
+        }
+
         inject_pending_event(vcpu, kvm_run);
 
         /* enable NMI/IRQ window open exits if needed */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b7bbb5d..96d63d1 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -99,6 +99,9 @@ struct kvm_vcpu {
         gpa_t mmio_phys_addr;
 #endif
 
+        int cntr_overflow;
+        int nmi_nr;
+
         struct kvm_vcpu_arch arch;
 };

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html