Regards,

Anthony Liguori
Subject: [PATCH] KVM paravirt_ops core infrastructure
Author: Anthony Liguori <aliguori@xxxxxxxxxx>

This patch implements paravirt_ops support for KVM and updates the current
paravirtualization support in KVM to match.

Changes from the previous paravirtualization support in KVM:

1) Theoretical support for SMP guests
2) Use CPUID to discover paravirtualization
3) Use a feature bitmap instead of versioning

Signed-off-by: Anthony Liguori <aliguori@xxxxxxxxxx>

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8770a5d..97ad1e1 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,13 @@ config VMI
 	  at the moment), by linking the kernel to a GPL-ed ROM module
 	  provided by the hypervisor.
 
+config KVM_GUEST
+	bool "KVM paravirt-ops support"
+	depends on PARAVIRT
+	help
+	  This option enables various optimizations for running under the KVM
+	  hypervisor.
+
 config ACPI_SRAT
 	bool
 	default y
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 06da59f..12a4201 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
 obj-$(CONFIG_VMI)		+= vmi.o vmiclock.o
+obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 obj-y				+= pcspeaker.o
 
diff --git a/arch/i386/kernel/kvm.c b/arch/i386/kernel/kvm.c
new file mode 100644
index 0000000..04d564e
--- /dev/null
+++ b/arch/i386/kernel/kvm.c
@@ -0,0 +1,222 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx>
+ * Copyright IBM Corporation, 2007
+ *   Authors: Anthony Liguori <aliguori@xxxxxxxxxx>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+
+struct kvm_paravirt_state
+{
+	struct kvm_vmca *vmca;
+	struct kvm_hypercall_entry *queue;
+	void (*hypercall)(void);
+
+	u64 vmca_gpa;
+};
+
+static DEFINE_PER_CPU(struct kvm_paravirt_state *, paravirt_state);
+
+static int do_nop_io_delay;
+static u64 msr_set_vmca;
+
+static long kvm_hypercall(unsigned int nr, unsigned long p1,
+			  unsigned long p2, unsigned long p3,
+			  unsigned long p4)
+{
+	struct kvm_paravirt_state *state
+		= per_cpu(paravirt_state, smp_processor_id());
+	long ret;
+
+	asm volatile("call *(%6) \n\t"
+		     : "=a"(ret)
+		     : "a" (nr),
+		       "b" (p1),
+		       "c" (p2),
+		       "d" (p3),
+		       "S" (p4),
+		       "r" (&state->hypercall)
+		     : "memory", "cc"
+	);
+
+	return ret;
+}
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static void paravirt_ops_setup(void)
+{
+	paravirt_ops.name = "KVM";
+
+	if (do_nop_io_delay)
+		paravirt_ops.io_delay = kvm_io_delay;
+
+	paravirt_ops.paravirt_enabled = 1;
+
+	/*
+	 * We call apply_paravirt again even though it's already been called
+	 * for native.
+	 */
+	apply_paravirt(__parainstructions, __parainstructions_end);
+}
+
+static void paravirt_activate(void *unused)
+{
+	struct kvm_paravirt_state *state
+		= per_cpu(paravirt_state, raw_smp_processor_id());
+
+	wrmsrl(msr_set_vmca, state->vmca_gpa);
+}
+
+static int paravirt_initialize(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	char signature[13];
+
+	/* verify that we're running on KVM */
+	cpuid(CPUID_HYPE_IDENT, &eax, &ebx, &ecx, &edx);
+	memcpy(signature, &ebx, 4);
+	memcpy(signature + 4, &ecx, 4);
+	memcpy(signature + 8, &edx, 4);
+	signature[12] = 0;
+
+	if (strcmp(signature, "KVMKVMKVMKVM"))
+		return -EINVAL;
+
+	/* check what features are supported */
+	cpuid(CPUID_HYPE_KVM_FEATURES, &eax, &ebx, &ecx, &edx);
+	msr_set_vmca = eax;
+
+	/* no paravirtualization is supported */
+	if (!(edx & KVM_FEATURE_VMCA))
+		return -ENOSYS;
+
+	if ((edx & KVM_FEATURE_NOP_IO_DELAY))
+		do_nop_io_delay = 1;
+
+	on_each_cpu(paravirt_activate, NULL, 0, 1);
+
+	return 0;
+}
+
+static __init void paravirt_free_state(struct kvm_paravirt_state *state)
+{
+	if (!state)
+		return;
+
+	if (state->hypercall)
+		__free_page(pfn_to_page(__pa(state->hypercall) >> PAGE_SHIFT));
+
+	if (state->vmca)
+		__free_page(pfn_to_page(__pa(state->vmca) >> PAGE_SHIFT));
+
+	__free_page(pfn_to_page(__pa(state) >> PAGE_SHIFT));
+}
+
+static __init struct kvm_paravirt_state *paravirt_alloc_state(void)
+{
+	struct kvm_paravirt_state *state;
+
+	state = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!state)
+		goto err;
+
+	state->vmca = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!state->vmca)
+		goto err;
+
+	/* FIXME: what do I need for this to be executable on 64 bit? */
+	state->hypercall = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!state->hypercall)
+		goto err;
+
+	state->vmca_gpa = __pa(state->vmca);
+	state->vmca->hypercall_gpa = __pa(state->hypercall);
+
+	return state;
+
+ err:
+	paravirt_free_state(state);
+	return NULL;
+}
+
+/* FIXME: hotplug hooks whenever KVM supports CPU hotplug */
+
+static __init void paravirt_free_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kvm_paravirt_state *state;
+		state = per_cpu(paravirt_state, cpu);
+		paravirt_free_state(state);
+	}
+}
+
+static __init int paravirt_alloc_area(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kvm_paravirt_state *state;
+
+		state = paravirt_alloc_state();
+		if (!state)
+			goto err;
+
+		per_cpu(paravirt_state, cpu) = state;
+	}
+
+	return 0;
+
+ err:
+	paravirt_free_area();
+	return -ENOMEM;
+}
+
+static int __init kvm_guest_init(void)
+{
+	int rc;
+
+	rc = paravirt_alloc_area();
+	if (rc)
+		return rc;
+
+	rc = paravirt_initialize();
+	if (rc)
+		goto err;
+
+	paravirt_ops_setup();
+
+	return rc;
+
+ err:
+	paravirt_free_area();
+	return rc;
+}
+
+core_initcall(kvm_guest_init);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 633c2ed..1369310 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -43,6 +43,7 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
+#include <linux/kvm_para.h>
 
 #include "x86_emulate.h"
 #include "segment_descriptor.h"
@@ -91,6 +92,11 @@ struct vfsmount *kvmfs_mnt;
 #define CR8_RESEVED_BITS (~0x0fULL)
 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
 
+#define KVM_PARAVIRT_FEATURES \
+	(KVM_FEATURE_VMCA | KVM_FEATURE_NOP_IO_DELAY)
+
+#define KVM_MSR_SET_VMCA 0x87655678
+
 #ifdef CONFIG_X86_64
 // LDT or TSS descriptor in the GDT. 16 bytes.
 struct segment_descriptor_64 {
@@ -1340,12 +1346,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
+static int dispatch_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			      unsigned long p1, unsigned long p2,
+			      unsigned long p3, unsigned long p4)
+{
+	return -KVM_ENOSYS;
+}
+
 int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
 	unsigned long nr, a0, a1, a2, a3, a4, a5, ret;
 
 	kvm_arch_ops->cache_regs(vcpu);
-	ret = -KVM_EINVAL;
+
 #ifdef CONFIG_X86_64
 	if (is_long_mode(vcpu)) {
 		nr = vcpu->regs[VCPU_REGS_RAX];
@@ -1358,16 +1371,17 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	} else
 #endif
 	{
-		nr = vcpu->regs[VCPU_REGS_RBX] & -1u;
-		a0 = vcpu->regs[VCPU_REGS_RAX] & -1u;
+		nr = vcpu->regs[VCPU_REGS_RAX] & -1u;
+		a0 = vcpu->regs[VCPU_REGS_RBX] & -1u;
 		a1 = vcpu->regs[VCPU_REGS_RCX] & -1u;
 		a2 = vcpu->regs[VCPU_REGS_RDX] & -1u;
 		a3 = vcpu->regs[VCPU_REGS_RSI] & -1u;
 		a4 = vcpu->regs[VCPU_REGS_RDI] & -1u;
 		a5 = vcpu->regs[VCPU_REGS_RBP] & -1u;
 	}
-	switch (nr) {
-	default:
+
+	ret = dispatch_hypercall(vcpu, nr, a0, a1, a2, a3);
+	if (ret == -KVM_ENOSYS) {
 		run->hypercall.args[0] = a0;
 		run->hypercall.args[1] = a1;
 		run->hypercall.args[2] = a2;
@@ -1456,7 +1470,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
  */
 static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
 {
-	struct kvm_vcpu_para_state *para_state;
+	struct kvm_vmca *para_state;
 	hpa_t para_state_hpa, hypercall_hpa;
 	struct page *para_state_page;
 	unsigned char *hypercall;
@@ -1476,30 +1490,14 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
 	if (is_error_hpa(para_state_hpa))
 		goto err_gp;
 
-	mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT);
 	para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT);
 	para_state = kmap_atomic(para_state_page, KM_USER0);
 
-	printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version);
-	printk(KERN_DEBUG ".... size: %d\n", para_state->size);
-
-	para_state->host_version = KVM_PARA_API_VERSION;
-	/*
-	 * We cannot support guests that try to register themselves
-	 * with a newer API version than the host supports:
-	 */
-	if (para_state->guest_version > KVM_PARA_API_VERSION) {
-		para_state->ret = -KVM_EINVAL;
-		goto err_kunmap_skip;
-	}
-
 	hypercall_gpa = para_state->hypercall_gpa;
 	hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa);
 	printk(KERN_DEBUG ".... hypercall_hpa: %08Lx\n", hypercall_hpa);
-	if (is_error_hpa(hypercall_hpa)) {
-		para_state->ret = -KVM_EINVAL;
+	if (is_error_hpa(hypercall_hpa))
 		goto err_kunmap_skip;
-	}
 
 	printk(KERN_DEBUG "kvm: para guest successfully registered.\n");
 	vcpu->para_state_page = para_state_page;
@@ -1512,7 +1510,6 @@ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa)
 	kvm_arch_ops->patch_hypercall(vcpu, hypercall);
 	kunmap_atomic(hypercall, KM_USER1);
 
-	para_state->ret = 0;
 err_kunmap_skip:
 	kunmap_atomic(para_state, KM_USER0);
 	return 0;
@@ -1633,12 +1630,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->ia32_misc_enable_msr = data;
 		break;
-	/*
-	 * This is the 'probe whether the host is KVM' logic:
-	 */
-	case MSR_KVM_API_MAGIC:
-		return vcpu_register_para(vcpu, data);
-
+	case KVM_MSR_SET_VMCA:
+		vcpu_register_para(vcpu, data);
+		break;
 	default:
 		printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
 		return 1;
@@ -1693,6 +1687,20 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 
 	kvm_arch_ops->cache_regs(vcpu);
 	function = vcpu->regs[VCPU_REGS_RAX];
+
+	if (function == CPUID_HYPE_IDENT) {
+		vcpu->regs[VCPU_REGS_RAX] = 0;
+		/* KVMKVMKVMKVM */
+		vcpu->regs[VCPU_REGS_RBX] = 0x4b4d564b;
+		vcpu->regs[VCPU_REGS_RCX] = 0x564b4d56;
+		vcpu->regs[VCPU_REGS_RDX] = 0x4d564b4d;
+		goto out;
+	} else if (function == CPUID_HYPE_KVM_FEATURES) {
+		vcpu->regs[VCPU_REGS_RAX] = KVM_MSR_SET_VMCA;
+		vcpu->regs[VCPU_REGS_RDX] = KVM_PARAVIRT_FEATURES;
+		goto out;
+	}
+
 	vcpu->regs[VCPU_REGS_RAX] = 0;
 	vcpu->regs[VCPU_REGS_RBX] = 0;
 	vcpu->regs[VCPU_REGS_RCX] = 0;
@@ -1717,6 +1725,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 		vcpu->regs[VCPU_REGS_RCX] = best->ecx;
 		vcpu->regs[VCPU_REGS_RDX] = best->edx;
 	}
+out:
 	kvm_arch_ops->decache_regs(vcpu);
 	kvm_arch_ops->skip_emulated_instruction(vcpu);
 }
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3b29256..11ebad8 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_KVM_PARA_H
 #define __LINUX_KVM_PARA_H
 
+#include <linux/errno.h>
+
 /*
  * Guest OS interface for KVM paravirtualization
  *
@@ -8,66 +10,28 @@
  * as we make progress.
  */
 
-/*
- * Per-VCPU descriptor area shared between guest and host. Writable to
- * both guest and host. Registered with the host by the guest when
- * a guest acknowledges paravirtual mode.
- *
- * NOTE: all addresses are guest-physical addresses (gpa), to make it
- * easier for the hypervisor to map between the various addresses.
- */
-struct kvm_vcpu_para_state {
-	/*
-	 * API version information for compatibility. If there's any support
-	 * mismatch (too old host trying to execute too new guest) then
-	 * the host will deny entry into paravirtual mode. Any other
-	 * combination (new host + old guest and new host + new guest)
-	 * is supposed to work - new host versions will support all old
-	 * guest API versions.
-	 */
-	u32 guest_version;
-	u32 host_version;
-	u32 size;
-	u32 ret;
-
-	/*
-	 * The address of the vm exit instruction (VMCALL or VMMCALL),
-	 * which the host will patch according to the CPU model the
-	 * VM runs on:
-	 */
-	u64 hypercall_gpa;
-
-} __attribute__ ((aligned(PAGE_SIZE)));
+#define CPUID_HYPE_IDENT		0x40000000
+#define CPUID_HYPE_KVM_FEATURES		0x40000001
 
-#define KVM_PARA_API_VERSION 1
+#define KVM_FEATURE_VMCA		(1UL << 0)
+#define KVM_FEATURE_NOP_IO_DELAY	(1UL << 1)
 
-/*
- * This is used for an RDMSR's ECX parameter to probe for a KVM host.
- * Hopefully no CPU vendor will use up this number. This is placed well
- * out of way of the typical space occupied by CPU vendors' MSR indices,
- * and we think (or at least hope) it wont be occupied in the future
- * either.
- */
-#define MSR_KVM_API_MAGIC 0x87655678
-
-#define KVM_EINVAL 1
+struct kvm_vmca
+{
+	u64 hypercall_gpa;
+};
 
 /*
  * Hypercall calling convention:
  *
- * Each hypercall may have 0-6 parameters.
+ * Each hypercall may have 0-4 parameters.
  *
- * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1
- *
- * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention
- * order: RDI, RSI, RDX, RCX, R8, R9.
- *
- * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP.
- * (the first 3 are according to the gcc regparm calling convention)
+ * 32-bit index is EAX, parameters are: EBX, ECX, EDX, ESI.
 *
 * No registers are clobbered by the hypercall, except that the
 * return value is in RAX.
 */
-#define __NR_hypercalls 0
+
+#define KVM_ENOSYS ENOSYS
 
 #endif
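
---

Not part of the patch: for anyone who wants to poke at the new guest-visible
interface, below is a minimal, hypothetical user-space sketch of the discovery
sequence it defines. It mirrors what paravirt_initialize() does in the guest
kernel and what kvm_emulate_cpuid() answers on the host side. The leaf numbers
and feature bits are copied from kvm_para.h above; the probe program itself
(main(), the printf reporting, GCC's <cpuid.h>) is illustrative only.

/*
 * Illustrative only -- not part of the patch.  Mirrors the CPUID-based
 * discovery that paravirt_initialize() performs in the guest kernel.
 */
#include <stdio.h>
#include <string.h>
#include <cpuid.h>

#define CPUID_HYPE_IDENT		0x40000000
#define CPUID_HYPE_KVM_FEATURES		0x40000001

#define KVM_FEATURE_VMCA		(1UL << 0)
#define KVM_FEATURE_NOP_IO_DELAY	(1UL << 1)

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char signature[13];

	/* the hypervisor returns its signature in EBX, ECX and EDX */
	__cpuid(CPUID_HYPE_IDENT, eax, ebx, ecx, edx);
	memcpy(signature + 0, &ebx, 4);
	memcpy(signature + 4, &ecx, 4);
	memcpy(signature + 8, &edx, 4);
	signature[12] = '\0';

	if (strcmp(signature, "KVMKVMKVMKVM") != 0) {
		printf("not running under KVM\n");
		return 1;
	}

	/* EAX carries the set-VMCA MSR index, EDX the feature bitmap */
	__cpuid(CPUID_HYPE_KVM_FEATURES, eax, ebx, ecx, edx);
	printf("KVM detected, set-VMCA MSR index: 0x%x\n", eax);
	printf("  VMCA/hypercall page: %s\n",
	       (edx & KVM_FEATURE_VMCA) ? "yes" : "no");
	printf("  nop IO delay:        %s\n",
	       (edx & KVM_FEATURE_NOP_IO_DELAY) ? "yes" : "no");

	return 0;
}

Under this scheme a guest that finds the signature reads the feature bitmap
from EDX and the VMCA-registration MSR index from EAX, instead of comparing
API version numbers as the old interface did.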