On Tue, Jul 3, 2012 at 12:07 PM, Avi Kivity <avi@xxxxxxxxxx> wrote:
> On 07/03/2012 12:01 PM, Christoffer Dall wrote:
>> Provides a complete world-switch implementation to switch to other guests
>> running in non-secure modes. Includes Hyp exception handlers that
>> capture necessary exception information and store the information on
>> the VCPU and KVM structures.
>>
>> The following Hyp-ABI is also documented in the code:
>>
>> Hyp-ABI: Switching from host kernel to Hyp-mode:
>> Switching to Hyp mode is done through a simple HVC instruction. The
>> exception vector code will check that the HVC comes from VMID==0 and if
>> so will store the necessary state on the Hyp stack, which will look like
>> this (growing downwards, see the hyp_hvc handler):
>> ...
>> stack_page + 4: spsr (Host-SVC cpsr)
>> stack_page    : lr_usr
>> --------------: stack bottom
>>
>> Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
>> When returning from Hyp mode to SVC mode, another HVC instruction is
>> executed from Hyp mode, which is taken in the hyp_svc handler. The
>> bottom of the Hyp stack is derived from the Hyp stack pointer (only a
>> single page aligned stack is used per CPU) and the initial SVC registers
>> are used to restore the host state.
>>
>> Otherwise, the world-switch is pretty straightforward. All state that
>> can be modified by the guest is first backed up on the Hyp stack and the
>> VCPU values are loaded onto the hardware. State which is not loaded, but
>> theoretically modifiable by the guest, is protected through the
>> virtualization features to generate a trap and cause software emulation.
>> Upon guest return, all state is restored from hardware onto the VCPU
>> struct and the original state is restored from the Hyp stack onto the
>> hardware.
>>
>> One controversy may be the back-door call to __irq_svc (the host
>> kernel's own physical IRQ handler) which is called when a physical IRQ
>> exception is taken in Hyp mode while running in the guest.
>>
>> SMP support, using the VMPIDR calculated on the basis of the host MPIDR
>> and overriding the low bits with the KVM vcpu_id, was contributed by
>> Marc Zyngier.
>
> He should sign off on this patch then.
>
>>
>> Reuse of VMIDs has been implemented by Antonios Motakis and adapted from
>> a separate patch into the appropriate patches introducing the
>> functionality. Note that the VMIDs are stored per VM as required by the ARM
>> architecture reference manual.
>
> Ditto.
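
(As an aside, to make the Hyp-ABI description above a bit more concrete:
from the host's point of view the whole transition is a single HVC issued
from SVC mode, which the hyp_hvc vector accepts because it comes from
VMID==0. A purely hypothetical helper could look like the sketch below;
the call_hyp name, the #0 immediate, the r0/r1 register usage and the
.arch_extension directive are illustrative assumptions, not necessarily
what the patch implements.)

/* Hypothetical sketch only - see the caveats above. */
static inline unsigned long call_hyp(void *hyp_fn, unsigned long arg)
{
        register unsigned long r0 asm("r0") = (unsigned long)hyp_fn;
        register unsigned long r1 asm("r1") = arg;

        asm volatile(
                ".arch_extension virt\n"  /* assembler must accept HVC */
                "hvc    #0\n"             /* trap to Hyp; hyp_hvc saves state on the Hyp stack */
                : "+r" (r0)
                : "r" (r1)
                : "memory");

        return r0;                        /* whatever the Hyp code left in r0 */
}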
>
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index 220f241..232117c 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -105,6 +105,17 @@
>>  #define TTBCR_T0SZ   3
>>  #define HTCR_MASK    (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
>>
>> +/* Hyp System Trap Register */
>> +#define HSTR_T(x)    (1 << x)
>> +#define HSTR_TTEE    (1 << 16)
>> +#define HSTR_TJDBX   (1 << 17)
>> +
>> +/* Hyp Coprocessor Trap Register */
>> +#define HCPTR_TCP(x) (1 << x)
>> +#define HCPTR_TCP_MASK (0x3fff)
>> +#define HCPTR_TASE   (1 << 15)
>> +#define HCPTR_TTA    (1 << 20)
>> +#define HCPTR_TCPAC  (1 << 31)
>>
>>  /* Virtualization Translation Control Register (VTCR) bits */
>>  #define VTCR_SH0     (3 << 12)
>> @@ -126,5 +137,31 @@
>>  #define VTTBR_X      (5 - VTCR_GUEST_T0SZ)
>>  #endif
>>
>> +/* Hyp Syndrome Register (HSR) bits */
>> +#define HSR_EC_SHIFT (26)
>> +#define HSR_EC       (0x3fU << HSR_EC_SHIFT)
>> +#define HSR_IL       (1U << 25)
>> +#define HSR_ISS      (HSR_IL - 1)
>> +#define HSR_ISV_SHIFT (24)
>> +#define HSR_ISV      (1U << HSR_ISV_SHIFT)
>> +
>> +#define HSR_EC_UNKNOWN  (0x00)
>> +#define HSR_EC_WFI      (0x01)
>> +#define HSR_EC_CP15_32  (0x03)
>> +#define HSR_EC_CP15_64  (0x04)
>> +#define HSR_EC_CP14_MR  (0x05)
>> +#define HSR_EC_CP14_LS  (0x06)
>> +#define HSR_EC_CP_0_13  (0x07)
>> +#define HSR_EC_CP10_ID  (0x08)
>> +#define HSR_EC_JAZELLE  (0x09)
>> +#define HSR_EC_BXJ      (0x0A)
>> +#define HSR_EC_CP14_64  (0x0C)
>> +#define HSR_EC_SVC_HYP  (0x11)
>> +#define HSR_EC_HVC      (0x12)
>> +#define HSR_EC_SMC      (0x13)
>> +#define HSR_EC_IABT     (0x20)
>> +#define HSR_EC_IABT_HYP (0x21)
>> +#define HSR_EC_DABT     (0x24)
>> +#define HSR_EC_DABT_HYP (0x25)
>>
>>  #endif /* __ARM_KVM_ARM_H__ */
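
(Side note for reviewers who have not stared at the HSR before: the
exception class lives in bits [31:26], the ISS sits below the IL bit, and
ISV says whether the ISS is valid. With the masks above the decode boils
down to helpers like the ones below; the names are illustrative only and
are not part of the patch.)

static inline u32 hsr_ec(u32 hsr)
{
        return (hsr & HSR_EC) >> HSR_EC_SHIFT;  /* exception class, bits [31:26] */
}

static inline u32 hsr_iss(u32 hsr)
{
        return hsr & HSR_ISS;                   /* syndrome bits below IL */
}

static inline bool hsr_iss_valid(u32 hsr)
{
        return hsr & HSR_ISV;                   /* ISS only meaningful when ISV is set */
}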
>> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
>> index b57c75e..38d3a12 100644
>> --- a/arch/arm/kernel/armksyms.c
>> +++ b/arch/arm/kernel/armksyms.c
>> @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void);
>>
>>  extern void fpundefinstr(void);
>>
>> +#ifdef CONFIG_KVM_ARM_HOST
>> +/* This is needed for KVM */
>> +extern void __irq_svc(void);
>> +
>> +EXPORT_SYMBOL_GPL(__irq_svc);
>> +#endif
>> +
>>  /* platform dependent support */
>>  EXPORT_SYMBOL(__udelay);
>>  EXPORT_SYMBOL(__const_udelay);
>> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
>> index 1429d89..9c76b53 100644
>> --- a/arch/arm/kernel/asm-offsets.c
>> +++ b/arch/arm/kernel/asm-offsets.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/sched.h>
>>  #include <linux/mm.h>
>>  #include <linux/dma-mapping.h>
>> +#include <linux/kvm_host.h>
>>  #include <asm/cacheflush.h>
>>  #include <asm/glue-df.h>
>>  #include <asm/glue-pf.h>
>> @@ -144,5 +145,47 @@ int main(void)
>>  DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL);
>>  DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
>>  DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
>> +#ifdef CONFIG_KVM_ARM_HOST
>> + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
>> + DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
>> + DEFINE(VCPU_MPIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
>> + DEFINE(VCPU_SCTLR, offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
>> + DEFINE(VCPU_CPACR, offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
>> + DEFINE(VCPU_TTBR0, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
>> + DEFINE(VCPU_TTBR1, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
>> + DEFINE(VCPU_TTBCR, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
>> + DEFINE(VCPU_DACR, offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
>> + DEFINE(VCPU_DFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
>> + DEFINE(VCPU_IFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
>> + DEFINE(VCPU_ADFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
>> + DEFINE(VCPU_AIFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
>> + DEFINE(VCPU_DFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
>> + DEFINE(VCPU_IFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
>> + DEFINE(VCPU_PRRR, offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
>> + DEFINE(VCPU_NMRR, offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
>> + DEFINE(VCPU_VBAR, offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
>> + DEFINE(VCPU_CID, offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
>> + DEFINE(VCPU_TID_URW, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
>> + DEFINE(VCPU_TID_URO, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
>> + DEFINE(VCPU_TID_PRIV, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
>> + DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs));
>> + DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs));
>> + DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs));
>> + DEFINE(VCPU_ABT_REGS, offsetof(struct kvm_vcpu, arch.regs.abt_regs));
>> + DEFINE(VCPU_UND_REGS, offsetof(struct kvm_vcpu, arch.regs.und_regs));
>> + DEFINE(VCPU_IRQ_REGS, offsetof(struct kvm_vcpu, arch.regs.irq_regs));
>> + DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
>> + DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.pc));
>> + DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.cpsr));
>> + DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines));
>> + DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.hsr));
>> + DEFINE(VCPU_HDFAR, offsetof(struct kvm_vcpu, arch.hdfar));
>> + DEFINE(VCPU_HIFAR, offsetof(struct kvm_vcpu, arch.hifar));
>> + DEFINE(VCPU_HPFAR, offsetof(struct kvm_vcpu, arch.hpfar));
>> + DEFINE(VCPU_PC_IPA, offsetof(struct kvm_vcpu, arch.pc_ipa));
>> + DEFINE(VCPU_PC_IPA2, offsetof(struct kvm_vcpu, arch.pc_ipa2));
>> + DEFINE(VCPU_HYP_PC, offsetof(struct kvm_vcpu, arch.hyp_pc));
>> + DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr));
>> +#endif
>>  return 0;
>>  }
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index 437f0c4..db029bb 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -209,6 +209,7 @@ __dabt_svc:
>>  ENDPROC(__dabt_svc)
>>
>>  .align 5
>> + .globl __irq_svc
>> __irq_svc:
>>  svc_entry
>>  irq_handler
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 8b024ee..4687690 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -37,12 +37,19 @@
>>  #include <asm/mman.h>
>>  #include <asm/idmap.h>
>>  #include <asm/tlbflush.h>
>> +#include <asm/cputype.h>
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_asm.h>
>>  #include <asm/kvm_mmu.h>
>> +#include <asm/kvm_emulate.h>
>>
>>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>>
>> +/* The VMID used in the VTTBR */
>> +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
>> +static u8 kvm_next_vmid;
>> +DEFINE_SPINLOCK(kvm_vmid_lock);
>
> static, too.
>
>> +
>> +
>> +/**
>> + * check_new_vmid_gen - check that the VMID is still valid
>> + * @kvm: The VM's VMID to check
>> + *
>> + * return true if there is a new generation of VMIDs being used
>> + *
>> + * The hardware supports only 256 values with the value zero reserved for the
>> + * host, so we check if an assigned value belongs to a previous generation,
>> + * which requires us to assign a new value. If we're the first to use a
>> + * VMID for the new generation, we must flush necessary caches and TLBs on all
>> + * CPUs.
>> + */
>> +static bool check_new_vmid_gen(struct kvm *kvm)
>> +{
>> +        return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
>> +}
>
> Better have the name indicate what a true return value means, like
> 'need_new_vmid_gen()'.
>

agreed

>> +
>> +/**
>> + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>> + * @kvm The guest that we are about to run
>> + *
>> + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
>> + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
>> + * caches and TLBs.
>> + */
>> +static void update_vttbr(struct kvm *kvm)
>> +{
>> +        phys_addr_t pgd_phys;
>> +
>> +        if (!check_new_vmid_gen(kvm))
>> +                return;
>> +
>> +        spin_lock(&kvm_vmid_lock);
>> +
>> +        /* First user of a new VMID generation? */
>> +        if (unlikely(kvm_next_vmid == 0)) {
>> +                atomic64_inc(&kvm_vmid_gen);
>> +                kvm_next_vmid = 1;
>> +
>> +                /* This does nothing on UP */
>> +                smp_call_function(reset_vm_context, NULL, 1);
>> +
>> +                /*
>> +                 * On SMP we know no other CPUs can use this CPU's or
>> +                 * each other's VMID since the kvm_vmid_lock blocks
>> +                 * them from reentry to the guest.
>> +                 */
>> +
>> +                reset_vm_context(NULL);
>
> on_each_cpu() will combine the two lines above.
>

thanks

>> +        }
>> +
>> +        kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
>> +        kvm->arch.vmid = kvm_next_vmid;
>> +        kvm_next_vmid++;
>> +
>> +        /* update vttbr to be used with the new vmid */
>> +        pgd_phys = virt_to_phys(kvm->arch.pgd);
>> +        kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
>> +                          & ~((2 << VTTBR_X) - 1);
>> +        kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
>> +
>> +        spin_unlock(&kvm_vmid_lock);
>> +}
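
(To make the on_each_cpu() suggestion concrete: the first-user-of-a-new-
generation block would collapse to something like the sketch below. Treat
it as illustrative only; the real change, together with the
need_new_vmid_gen() rename, will be in the next version.)

        /* First user of a new VMID generation? */
        if (unlikely(kvm_next_vmid == 0)) {
                atomic64_inc(&kvm_vmid_gen);
                kvm_next_vmid = 1;

                /*
                 * on_each_cpu() runs reset_vm_context() on every CPU,
                 * including this one; kvm_vmid_lock keeps the other CPUs
                 * from re-entering the guest with a stale VMID meanwhile.
                 */
                on_each_cpu(reset_vm_context, NULL, 1);
        }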
>> +
>> +/*
>> + * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper
>> + * exit to QEMU.
>> + */
>> +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>> +                       int exception_index)
>> +{
>> +        return -EINVAL;
>
> x86 returns KVM_EXIT_INTERNAL_ERROR when it encounters an unhandlable
> exit.  -EINVAL indicates that the user has done something wrong, which
> isn't the case here.
>

ok, fair enough, this has been reworked as per your comments.

>> +}
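
To give an idea of the rework, it goes in the direction of the sketch
below: report the unhandled exit to userspace as KVM_EXIT_INTERNAL_ERROR
and return 0 ("go back to userspace" in the x86-style convention). This
is a sketch, not the exact code of the next series.

static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                       int exception_index)
{
        /* Sketch: unhandled exit, let userspace see an internal error */
        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        return 0;
}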
>> +
>> +/*
>> + * Return 0 to proceed with guest entry
>> + */
>> +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
>> +{
>> +        if (signal_pending(current)) {
>> +                *exit_reason = KVM_EXIT_INTR;
>> +                return -EINTR;
>> +        }
>> +
>> +        if (check_new_vmid_gen(vcpu->kvm))
>> +                return 1;
>> +
>> +        BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
>> +
>>  return 0;
>>  }
>>
>> +/**
>> + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
>> + * @vcpu: The VCPU pointer
>> + * @run: The kvm_run structure pointer used for userspace state exchange
>> + *
>> + * This function is called through the VCPU_RUN ioctl called from user space. It
>> + * will execute VM code in a loop until the time slice for the process is used
>> + * or some emulation is needed from user space in which case the function will
>> + * return with return value 0 and with the kvm_run structure filled in with the
>> + * required data for the requested emulation.
>> + */
>>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>  {
>> -        return -EINVAL;
>> +        int ret = 0;
>> +        int exit_reason;
>> +        sigset_t sigsaved;
>> +
>> +        if (vcpu->sigset_active)
>> +                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
>> +
>
> We should move this to common code.  But I don't mind if this is done
> post merge.
>
>> +        exit_reason = KVM_EXIT_UNKNOWN;
>> +        while (exit_reason == KVM_EXIT_UNKNOWN) {
>
> Looping over 'ret' is more in line with x86 and clearer IMO.  x86 uses
> the convention: < 0 -> return to userspace with error, 0 -> return to
> userspace, 1 -> loop.
>

I duplicated this scheme now for next patch series.

>> +                /*
>> +                 * Check conditions before entering the guest
>> +                 */
>> +                cond_resched();
>> +
>> +                update_vttbr(vcpu->kvm);
>> +
>> +                local_irq_disable();
>> +
>> +                /* Re-check atomic conditions */
>> +                ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
>> +                if (ret != 0) {
>> +                        local_irq_enable();
>> +                        preempt_enable();
>> +                        continue;
>
> See - you continue, only to break out of the loop due to a side effect
> on exit_reason.
>

yep, I was never in love with this.

>> +                }
>> +
>> +                /**************************************************************
>> +                 * Enter the guest
>> +                 */
>> +                trace_kvm_entry(vcpu->arch.regs.pc);
>> +                kvm_guest_enter();
>> +                vcpu->mode = IN_GUEST_MODE;
>> +
>> +                ret = __kvm_vcpu_run(vcpu);
>> +
>> +                vcpu->mode = OUTSIDE_GUEST_MODE;
>> +                vcpu->stat.exits++;
>
> The tracepoint above should be sufficient for statistics.
>

Marc added this one, so I assume there's a valid need for the separate
counter. Marc?

>> +                kvm_guest_exit();
>> +                trace_kvm_exit(vcpu->arch.regs.pc);
>> +                local_irq_enable();
>> +
>> +                /*
>> +                 * Back from guest
>> +                 *************************************************************/
>> +
>> +                ret = handle_exit(vcpu, run, ret);
>> +                if (ret < 0) {
>> +                        kvm_err("Error in handle_exit\n");
>> +                        break;
>> +                } else {
>> +                        exit_reason = ret;  /* 0 == KVM_EXIT_UNKNOWN */
>> +                }
>> +        }
>> +
>> +        if (vcpu->sigset_active)
>> +                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
>> +
>> +        run->exit_reason = exit_reason;
>> +        return ret;
>>  }
>>
>

Thanks!
-Christoffer
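
P.S.: for reference, the ret-based run loop I am converging on looks
roughly like the sketch below. It is illustrative only, it assumes the
need_new_vmid_gen() rename discussed above, and the exact code will be in
the next series.

        /* ret < 0: error to userspace, ret == 0: to userspace, ret > 0: re-enter guest */
        ret = 1;
        while (ret > 0) {
                cond_resched();
                update_vttbr(vcpu->kvm);

                local_irq_disable();

                if (signal_pending(current)) {
                        ret = -EINTR;
                        run->exit_reason = KVM_EXIT_INTR;
                }

                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
                        continue;
                }

                /* Enter the guest */
                trace_kvm_entry(vcpu->arch.regs.pc);
                kvm_guest_enter();
                ret = __kvm_vcpu_run(vcpu);
                kvm_guest_exit();
                trace_kvm_exit(vcpu->arch.regs.pc);
                local_irq_enable();

                ret = handle_exit(vcpu, run, ret);
        }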