On Tue, Jul 3, 2012 at 12:07 PM, Avi Kivity <avi@xxxxxxxxxx> wrote:
> On 07/03/2012 12:01 PM, Christoffer Dall wrote:
>> Provides a complete world-switch implementation to switch to other guests
>> running in non-secure modes. Includes Hyp exception handlers that
>> capture necessary exception information and store the information on
>> the VCPU and KVM structures.
>>
>> The following Hyp-ABI is also documented in the code:
>>
>> Hyp-ABI: Switching from host kernel to Hyp-mode:
>> Switching to Hyp mode is done through a simple HVC instruction. The
>> exception vector code will check that the HVC comes from VMID==0 and if
>> so will store the necessary state on the Hyp stack, which will look like
>> this (growing downwards, see the hyp_hvc handler):
>> ...
>> stack_page + 4: spsr (Host-SVC cpsr)
>> stack_page    : lr_usr
>> --------------: stack bottom
>>
>> Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
>> When returning from Hyp mode to SVC mode, another HVC instruction is
>> executed from Hyp mode, which is taken in the hyp_svc handler. The
>> bottom of the Hyp stack is derived from the Hyp stack pointer (only a
>> single page aligned stack is used per CPU) and the initial SVC registers
>> are used to restore the host state.
>>
>> Otherwise, the world-switch is pretty straightforward. All state that
>> can be modified by the guest is first backed up on the Hyp stack and the
>> VCPU values are loaded onto the hardware. State which is not loaded, but
>> theoretically modifiable by the guest, is protected through the
>> virtualization features to generate a trap and cause software emulation.
>> Upon guest return, all state is restored from hardware onto the VCPU
>> struct and the original state is restored from the Hyp stack onto the
>> hardware.
>>
>> One controversy may be the back-door call to __irq_svc (the host
>> kernel's own physical IRQ handler) which is called when a physical IRQ
>> exception is taken in Hyp mode while running in the guest.
>>
>> SMP support, using the VMPIDR calculated on the basis of the host MPIDR
>> and overriding the low bits with the KVM vcpu_id, was contributed by
>> Marc Zyngier.
>
> He should sign off on this patch then.
>
>>
>> Reuse of VMIDs has been implemented by Antonios Motakis and adapted from
>> a separate patch into the appropriate patches introducing the
>> functionality. Note that the VMIDs are stored per VM as required by the ARM
>> architecture reference manual.
>
> Ditto.
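
(As an aside, to make the Hyp-ABI description above a bit more concrete:
from the host's point of view the whole transition is a single HVC issued
from SVC mode, which the hyp_hvc vector accepts because it comes from
VMID==0. A purely hypothetical helper could look like the sketch below;
the call_hyp name, the #0 immediate, the r0/r1 register usage and the
.arch_extension directive are illustrative assumptions, not necessarily
what the patch implements.)

/* Hypothetical sketch only - see the caveats above. */
static inline unsigned long call_hyp(void *hyp_fn, unsigned long arg)
{
        register unsigned long r0 asm("r0") = (unsigned long)hyp_fn;
        register unsigned long r1 asm("r1") = arg;

        asm volatile(
                ".arch_extension virt\n"  /* assembler must accept HVC */
                "hvc    #0\n"             /* trap to Hyp; hyp_hvc saves state on the Hyp stack */
                : "+r" (r0)
                : "r" (r1)
                : "memory");

        return r0;                        /* whatever the Hyp code left in r0 */
}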
>
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index 220f241..232117c 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -105,6 +105,17 @@
>>  #define TTBCR_T0SZ   3
>>  #define HTCR_MASK    (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
>>
>> +/* Hyp System Trap Register */
>> +#define HSTR_T(x)    (1 << x)
>> +#define HSTR_TTEE    (1 << 16)
>> +#define HSTR_TJDBX   (1 << 17)
>> +
>> +/* Hyp Coprocessor Trap Register */
>> +#define HCPTR_TCP(x) (1 << x)
>> +#define HCPTR_TCP_MASK (0x3fff)
>> +#define HCPTR_TASE   (1 << 15)
>> +#define HCPTR_TTA    (1 << 20)
>> +#define HCPTR_TCPAC  (1 << 31)
>>
>>  /* Virtualization Translation Control Register (VTCR) bits */
>>  #define VTCR_SH0     (3 << 12)
>> @@ -126,5 +137,31 @@
>>  #define VTTBR_X      (5 - VTCR_GUEST_T0SZ)
>>  #endif
>>
>> +/* Hyp Syndrome Register (HSR) bits */
>> +#define HSR_EC_SHIFT (26)
>> +#define HSR_EC       (0x3fU << HSR_EC_SHIFT)
>> +#define HSR_IL       (1U << 25)
>> +#define HSR_ISS      (HSR_IL - 1)
>> +#define HSR_ISV_SHIFT (24)
>> +#define HSR_ISV      (1U << HSR_ISV_SHIFT)
>> +
>> +#define HSR_EC_UNKNOWN  (0x00)
>> +#define HSR_EC_WFI      (0x01)
>> +#define HSR_EC_CP15_32  (0x03)
>> +#define HSR_EC_CP15_64  (0x04)
>> +#define HSR_EC_CP14_MR  (0x05)
>> +#define HSR_EC_CP14_LS  (0x06)
>> +#define HSR_EC_CP_0_13  (0x07)
>> +#define HSR_EC_CP10_ID  (0x08)
>> +#define HSR_EC_JAZELLE  (0x09)
>> +#define HSR_EC_BXJ      (0x0A)
>> +#define HSR_EC_CP14_64  (0x0C)
>> +#define HSR_EC_SVC_HYP  (0x11)
>> +#define HSR_EC_HVC      (0x12)
>> +#define HSR_EC_SMC      (0x13)
>> +#define HSR_EC_IABT     (0x20)
>> +#define HSR_EC_IABT_HYP (0x21)
>> +#define HSR_EC_DABT     (0x24)
>> +#define HSR_EC_DABT_HYP (0x25)
>>
>>  #endif /* __ARM_KVM_ARM_H__ */
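
(Side note for reviewers who have not stared at the HSR before: the
exception class lives in bits [31:26], the ISS sits below the IL bit, and
ISV says whether the ISS is valid. With the masks above the decode boils
down to helpers like the ones below; the names are illustrative only and
are not part of the patch.)

static inline u32 hsr_ec(u32 hsr)
{
        return (hsr & HSR_EC) >> HSR_EC_SHIFT;  /* exception class, bits [31:26] */
}

static inline u32 hsr_iss(u32 hsr)
{
        return hsr & HSR_ISS;                   /* syndrome bits below IL */
}

static inline bool hsr_iss_valid(u32 hsr)
{
        return hsr & HSR_ISV;                   /* ISS only meaningful when ISV is set */
}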
>> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
>> index b57c75e..38d3a12 100644
>> --- a/arch/arm/kernel/armksyms.c
>> +++ b/arch/arm/kernel/armksyms.c
>> @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void);
>>
>>  extern void fpundefinstr(void);
>>
>> +#ifdef CONFIG_KVM_ARM_HOST
>> +/* This is needed for KVM */
>> +extern void __irq_svc(void);
>> +
>> +EXPORT_SYMBOL_GPL(__irq_svc);
>> +#endif
>> +
>>  /* platform dependent support */
>>  EXPORT_SYMBOL(__udelay);
>>  EXPORT_SYMBOL(__const_udelay);
>> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
>> index 1429d89..9c76b53 100644
>> --- a/arch/arm/kernel/asm-offsets.c
>> +++ b/arch/arm/kernel/asm-offsets.c
>> @@ -13,6 +13,7 @@
>>  #include <linux/sched.h>
>>  #include <linux/mm.h>
>>  #include <linux/dma-mapping.h>
>> +#include <linux/kvm_host.h>
>>  #include <asm/cacheflush.h>
>>  #include <asm/glue-df.h>
>>  #include <asm/glue-pf.h>
>> @@ -144,5 +145,47 @@ int main(void)
>>  DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL);
>>  DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
>>  DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
>> +#ifdef CONFIG_KVM_ARM_HOST
>> + DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
>> + DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
>> + DEFINE(VCPU_MPIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
>> + DEFINE(VCPU_SCTLR, offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
>> + DEFINE(VCPU_CPACR, offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
>> + DEFINE(VCPU_TTBR0, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
>> + DEFINE(VCPU_TTBR1, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
>> + DEFINE(VCPU_TTBCR, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
>> + DEFINE(VCPU_DACR, offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
>> + DEFINE(VCPU_DFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
>> + DEFINE(VCPU_IFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
>> + DEFINE(VCPU_ADFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
>> + DEFINE(VCPU_AIFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
>> + DEFINE(VCPU_DFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
>> + DEFINE(VCPU_IFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
>> + DEFINE(VCPU_PRRR, offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
>> + DEFINE(VCPU_NMRR, offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
>> + DEFINE(VCPU_VBAR, offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
>> + DEFINE(VCPU_CID, offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
>> + DEFINE(VCPU_TID_URW, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
>> + DEFINE(VCPU_TID_URO, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
>> + DEFINE(VCPU_TID_PRIV, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
>> + DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs));
>> + DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs));
>> + DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs));
>> + DEFINE(VCPU_ABT_REGS, offsetof(struct kvm_vcpu, arch.regs.abt_regs));
>> + DEFINE(VCPU_UND_REGS, offsetof(struct kvm_vcpu, arch.regs.und_regs));
>> + DEFINE(VCPU_IRQ_REGS, offsetof(struct kvm_vcpu, arch.regs.irq_regs));
>> + DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
>> + DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.pc));
>> + DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.cpsr));
>> + DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines));
>> + DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.hsr));
>> + DEFINE(VCPU_HDFAR, offsetof(struct kvm_vcpu, arch.hdfar));
>> + DEFINE(VCPU_HIFAR, offsetof(struct kvm_vcpu, arch.hifar));
>> + DEFINE(VCPU_HPFAR, offsetof(struct kvm_vcpu, arch.hpfar));
>> + DEFINE(VCPU_PC_IPA, offsetof(struct kvm_vcpu, arch.pc_ipa));
>> + DEFINE(VCPU_PC_IPA2, offsetof(struct kvm_vcpu, arch.pc_ipa2));
>> + DEFINE(VCPU_HYP_PC, offsetof(struct kvm_vcpu, arch.hyp_pc));
>> + DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr));
>> +#endif
>>  return 0;
>>  }
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index 437f0c4..db029bb 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -209,6 +209,7 @@ __dabt_svc:
>>  ENDPROC(__dabt_svc)
>>
>>  .align 5
>> + .globl __irq_svc
>> __irq_svc:
>>  svc_entry
>>  irq_handler
>> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
>> index 8b024ee..4687690 100644
>> --- a/arch/arm/kvm/arm.c
>> +++ b/arch/arm/kvm/arm.c
>> @@ -37,12 +37,19 @@
>>  #include <asm/mman.h>
>>  #include <asm/idmap.h>
>>  #include <asm/tlbflush.h>
>> +#include <asm/cputype.h>
>>  #include <asm/kvm_arm.h>
>>  #include <asm/kvm_asm.h>
>>  #include <asm/kvm_mmu.h>
>> +#include <asm/kvm_emulate.h>
>>
>>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>>
>> +/* The VMID used in the VTTBR */
>> +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
>> +static u8 kvm_next_vmid;
>> +DEFINE_SPINLOCK(kvm_vmid_lock);
>
> static, too.
>
>> +
>> +
>> +/**
>> + * check_new_vmid_gen - check that the VMID is still valid
>> + * @kvm: The VM's VMID to check
>> + *
>> + * return true if there is a new generation of VMIDs being used
>> + *
>> + * The hardware supports only 256 values with the value zero reserved for the
>> + * host, so we check if an assigned value belongs to a previous generation,
>> + * which requires us to assign a new value. If we're the first to use a
>> + * VMID for the new generation, we must flush necessary caches and TLBs on all
>> + * CPUs.
>> + */
>> +static bool check_new_vmid_gen(struct kvm *kvm)
>> +{
>> +        return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
>> +}
>
> Better have the name indicate what a true return value means, like
> 'need_new_vmid_gen()'.
>

agreed

>> +
>> +/**
>> + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
>> + * @kvm The guest that we are about to run
>> + *
>> + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
>> + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
>> + * caches and TLBs.
>> + */
>> +static void update_vttbr(struct kvm *kvm)
>> +{
>> +        phys_addr_t pgd_phys;
>> +
>> +        if (!check_new_vmid_gen(kvm))
>> +                return;
>> +
>> +        spin_lock(&kvm_vmid_lock);
>> +
>> +        /* First user of a new VMID generation? */
>> +        if (unlikely(kvm_next_vmid == 0)) {
>> +                atomic64_inc(&kvm_vmid_gen);
>> +                kvm_next_vmid = 1;
>> +
>> +                /* This does nothing on UP */
>> +                smp_call_function(reset_vm_context, NULL, 1);
>> +
>> +                /*
>> +                 * On SMP we know no other CPUs can use this CPU's or
>> +                 * each other's VMID since the kvm_vmid_lock blocks
>> +                 * them from reentry to the guest.
>> +                 */
>> +
>> +                reset_vm_context(NULL);
>
> on_each_cpu() will combine the two lines above.
>

thanks

>> +        }
>> +
>> +        kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
>> +        kvm->arch.vmid = kvm_next_vmid;
>> +        kvm_next_vmid++;
>> +
>> +        /* update vttbr to be used with the new vmid */
>> +        pgd_phys = virt_to_phys(kvm->arch.pgd);
>> +        kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
>> +                          & ~((2 << VTTBR_X) - 1);
>> +        kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
>> +
>> +        spin_unlock(&kvm_vmid_lock);
>> +}
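
(To make the on_each_cpu() suggestion concrete: the first-user-of-a-new-
generation block would collapse to something like the sketch below. Treat
it as illustrative only; the real change, together with the
need_new_vmid_gen() rename, will be in the next version.)

        /* First user of a new VMID generation? */
        if (unlikely(kvm_next_vmid == 0)) {
                atomic64_inc(&kvm_vmid_gen);
                kvm_next_vmid = 1;

                /*
                 * on_each_cpu() runs reset_vm_context() on every CPU,
                 * including this one; kvm_vmid_lock keeps the other CPUs
                 * from re-entering the guest with a stale VMID meanwhile.
                 */
                on_each_cpu(reset_vm_context, NULL, 1);
        }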
>> +
>> +/*
>> + * Return 0 to return to guest, < 0 on error, exit_reason ( > 0) on proper
>> + * exit to QEMU.
>> + */
>> +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
>> +                       int exception_index)
>> +{
>> +        return -EINVAL;
>
> x86 returns KVM_EXIT_INTERNAL_ERROR when it encounters an unhandlable
> exit.  -EINVAL indicates that the user has done something wrong, which
> isn't the case here.
>

ok, fair enough, this has been reworked as per your comments.

>> +}
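
To give an idea of the rework, it goes in the direction of the sketch
below: report the unhandled exit to userspace as KVM_EXIT_INTERNAL_ERROR
and return 0 ("go back to userspace" in the x86-style convention). This
is a sketch, not the exact code of the next series.

static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                       int exception_index)
{
        /* Sketch: unhandled exit, let userspace see an internal error */
        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        return 0;
}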
>> +
>> +/*
>> + * Return 0 to proceed with guest entry
>> + */
>> +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
>> +{
>> +        if (signal_pending(current)) {
>> +                *exit_reason = KVM_EXIT_INTR;
>> +                return -EINTR;
>> +        }
>> +
>> +        if (check_new_vmid_gen(vcpu->kvm))
>> +                return 1;
>> +
>> +        BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
>> +
>>  return 0;
>>  }
>>
>> +/**
>> + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
>> + * @vcpu: The VCPU pointer
>> + * @run: The kvm_run structure pointer used for userspace state exchange
>> + *
>> + * This function is called through the VCPU_RUN ioctl called from user space. It
>> + * will execute VM code in a loop until the time slice for the process is used
>> + * or some emulation is needed from user space in which case the function will
>> + * return with return value 0 and with the kvm_run structure filled in with the
>> + * required data for the requested emulation.
>> + */
>>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>  {
>> -        return -EINVAL;
>> +        int ret = 0;
>> +        int exit_reason;
>> +        sigset_t sigsaved;
>> +
>> +        if (vcpu->sigset_active)
>> +                sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
>> +
>
> We should move this to common code.  But I don't mind if this is done
> post merge.
>
>> +        exit_reason = KVM_EXIT_UNKNOWN;
>> +        while (exit_reason == KVM_EXIT_UNKNOWN) {
>
> Looping over 'ret' is more in line with x86 and clearer IMO.  x86 uses
> the convention: < 0 -> return to userspace with error, 0 -> return to
> userspace, 1 -> loop.
>

I duplicated this scheme now for next patch series.

>> +                /*
>> +                 * Check conditions before entering the guest
>> +                 */
>> +                cond_resched();
>> +
>> +                update_vttbr(vcpu->kvm);
>> +
>> +                local_irq_disable();
>> +
>> +                /* Re-check atomic conditions */
>> +                ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
>> +                if (ret != 0) {
>> +                        local_irq_enable();
>> +                        preempt_enable();
>> +                        continue;
>
> See - you continue, only to break out of the loop due to a side effect
> on exit_reason.
>

yep, I was never in love with this.

>> +                }
>> +
>> +                /**************************************************************
>> +                 * Enter the guest
>> +                 */
>> +                trace_kvm_entry(vcpu->arch.regs.pc);
>> +                kvm_guest_enter();
>> +                vcpu->mode = IN_GUEST_MODE;
>> +
>> +                ret = __kvm_vcpu_run(vcpu);
>> +
>> +                vcpu->mode = OUTSIDE_GUEST_MODE;
>> +                vcpu->stat.exits++;
>
> The tracepoint above should be sufficient for statistics.
>

Marc added this one, so I assume there's a valid need for the separate
counter. Marc?

>> +                kvm_guest_exit();
>> +                trace_kvm_exit(vcpu->arch.regs.pc);
>> +                local_irq_enable();
>> +
>> +                /*
>> +                 * Back from guest
>> +                 *************************************************************/
>> +
>> +                ret = handle_exit(vcpu, run, ret);
>> +                if (ret < 0) {
>> +                        kvm_err("Error in handle_exit\n");
>> +                        break;
>> +                } else {
>> +                        exit_reason = ret;  /* 0 == KVM_EXIT_UNKNOWN */
>> +                }
>> +        }
>> +
>> +        if (vcpu->sigset_active)
>> +                sigprocmask(SIG_SETMASK, &sigsaved, NULL);
>> +
>> +        run->exit_reason = exit_reason;
>> +        return ret;
>>  }
>>
>

Thanks!
-Christoffer
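
P.S.: for reference, the ret-based run loop I am converging on looks
roughly like the sketch below. It is illustrative only, it assumes the
need_new_vmid_gen() rename discussed above, and the exact code will be in
the next series.

        /* ret < 0: error to userspace, ret == 0: to userspace, ret > 0: re-enter guest */
        ret = 1;
        while (ret > 0) {
                cond_resched();
                update_vttbr(vcpu->kvm);

                local_irq_disable();

                if (signal_pending(current)) {
                        ret = -EINTR;
                        run->exit_reason = KVM_EXIT_INTR;
                }

                if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
                        local_irq_enable();
                        continue;
                }

                /* Enter the guest */
                trace_kvm_entry(vcpu->arch.regs.pc);
                kvm_guest_enter();
                ret = __kvm_vcpu_run(vcpu);
                kvm_guest_exit();
                trace_kvm_exit(vcpu->arch.regs.pc);
                local_irq_enable();

                ret = handle_exit(vcpu, run, ret);
        }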