On 07/03/2012 12:01 PM, Christoffer Dall wrote:
> Provides complete world-switch implementation to switch to other guests
> running in non-secure modes. Includes Hyp exception handlers that
> capture necessary exception information and store the information on
> the VCPU and KVM structures.
>
> The following Hyp-ABI is also documented in the code:
>
> Hyp-ABI: Switching from host kernel to Hyp-mode:
>    Switching to Hyp mode is done through a simple HVC instruction. The
>    exception vector code will check that the HVC comes from VMID==0 and
>    if so will store the necessary state on the Hyp stack, which will
>    look like this (growing downwards, see the hyp_hvc handler):
>      ...
>      stack_page + 4: spsr (Host-SVC cpsr)
>      stack_page    : lr_usr
>      --------------: stack bottom
>
> Hyp-ABI: Switching from Hyp-mode to host kernel SVC mode:
>    When returning from Hyp mode to SVC mode, another HVC instruction is
>    executed from Hyp mode, which is taken in the hyp_svc handler. The
>    bottom of the Hyp stack is derived from the Hyp stack pointer (only
>    a single page-aligned stack is used per CPU) and the initial SVC
>    registers are used to restore the host state.
>
> Otherwise, the world-switch is pretty straightforward. All state that
> can be modified by the guest is first backed up on the Hyp stack and
> the VCPU values are loaded onto the hardware. State which is not
> loaded, but theoretically modifiable by the guest, is protected through
> the virtualization features to generate a trap and cause software
> emulation. Upon guest return, all state is restored from hardware onto
> the VCPU struct and the original state is restored from the Hyp stack
> onto the hardware.
>
> One controversy may be the back-door call to __irq_svc (the host
> kernel's own physical IRQ handler) which is called when a physical IRQ
> exception is taken in Hyp mode while running in the guest.
>
> SMP support, using the VMPIDR calculated on the basis of the host MPIDR
> and overriding the low bits with the KVM vcpu_id, was contributed by
> Marc Zyngier.

He should sign off on this patch then.

> Reuse of VMIDs has been implemented by Antonios Motakis and adapted
> from a separate patch into the appropriate patches introducing the
> functionality. Note that the VMIDs are stored per VM as required by
> the ARM architecture reference manual.

Ditto.
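One aside on the Hyp-ABI above, for other reviewers: the host side of
this ABI is nothing more than a bare HVC. A minimal sketch, assuming
the virtualization extensions are available and Hyp was set up by the
init code -- the wrapper name and the #0 immediate are made up here,
the real entry point is whatever traps to hyp_hvc:

	/*
	 * Sketch only: entering Hyp mode from host SVC per the Hyp-ABI
	 * described above. hyp_hvc saves/restores the state shown in
	 * the stack layout. Needs an assembler that knows the
	 * virtualization extensions.
	 */
	static inline void host_enter_hyp(void)
	{
		asm volatile(".arch_extension virt\n\t"
			     "hvc #0"	/* traps to the hyp_hvc vector */
			     : : : "memory");
	}
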
> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
> index 220f241..232117c 100644
> --- a/arch/arm/include/asm/kvm_arm.h
> +++ b/arch/arm/include/asm/kvm_arm.h
> @@ -105,6 +105,17 @@
>  #define TTBCR_T0SZ	3
>  #define HTCR_MASK	(TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0)
>
> +/* Hyp System Trap Register */
> +#define HSTR_T(x)	(1 << x)
> +#define HSTR_TTEE	(1 << 16)
> +#define HSTR_TJDBX	(1 << 17)
> +
> +/* Hyp Coprocessor Trap Register */
> +#define HCPTR_TCP(x)	(1 << x)
> +#define HCPTR_TCP_MASK	(0x3fff)
> +#define HCPTR_TASE	(1 << 15)
> +#define HCPTR_TTA	(1 << 20)
> +#define HCPTR_TCPAC	(1 << 31)
>
>  /* Virtualization Translation Control Register (VTCR) bits */
>  #define VTCR_SH0	(3 << 12)
> @@ -126,5 +137,31 @@
>  #define VTTBR_X		(5 - VTCR_GUEST_T0SZ)
>  #endif
>
> +/* Hyp Syndrome Register (HSR) bits */
> +#define HSR_EC_SHIFT	(26)
> +#define HSR_EC		(0x3fU << HSR_EC_SHIFT)
> +#define HSR_IL		(1U << 25)
> +#define HSR_ISS		(HSR_IL - 1)
> +#define HSR_ISV_SHIFT	(24)
> +#define HSR_ISV		(1U << HSR_ISV_SHIFT)
> +
> +#define HSR_EC_UNKNOWN	(0x00)
> +#define HSR_EC_WFI	(0x01)
> +#define HSR_EC_CP15_32	(0x03)
> +#define HSR_EC_CP15_64	(0x04)
> +#define HSR_EC_CP14_MR	(0x05)
> +#define HSR_EC_CP14_LS	(0x06)
> +#define HSR_EC_CP_0_13	(0x07)
> +#define HSR_EC_CP10_ID	(0x08)
> +#define HSR_EC_JAZELLE	(0x09)
> +#define HSR_EC_BXJ	(0x0A)
> +#define HSR_EC_CP14_64	(0x0C)
> +#define HSR_EC_SVC_HYP	(0x11)
> +#define HSR_EC_HVC	(0x12)
> +#define HSR_EC_SMC	(0x13)
> +#define HSR_EC_IABT	(0x20)
> +#define HSR_EC_IABT_HYP	(0x21)
> +#define HSR_EC_DABT	(0x24)
> +#define HSR_EC_DABT_HYP	(0x25)
>
>  #endif /* __ARM_KVM_ARM_H__ */
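Not a problem with the patch, but to make the encoding concrete for
other reviewers: the EC field is what the exit path will dispatch on.
A sketch (mine, not from the patch):

	/*
	 * Sketch only: extract the exception class from a saved HSR
	 * value using the defines above; the saved copy lives in
	 * vcpu->arch.hsr (see the asm-offsets below).
	 */
	static inline u32 hsr_exception_class(u32 hsr)
	{
		return (hsr & HSR_EC) >> HSR_EC_SHIFT; /* an HSR_EC_* value */
	}

e.g. a guest HVC shows up as HSR_EC_HVC and a stage-2 data abort as
HSR_EC_DABT.
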
> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
> index b57c75e..38d3a12 100644
> --- a/arch/arm/kernel/armksyms.c
> +++ b/arch/arm/kernel/armksyms.c
> @@ -48,6 +48,13 @@ extern void __aeabi_ulcmp(void);
>
>  extern void fpundefinstr(void);
>
> +#ifdef CONFIG_KVM_ARM_HOST
> +/* This is needed for KVM */
> +extern void __irq_svc(void);
> +
> +EXPORT_SYMBOL_GPL(__irq_svc);
> +#endif
> +
>  /* platform dependent support */
>  EXPORT_SYMBOL(__udelay);
>  EXPORT_SYMBOL(__const_udelay);
> diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
> index 1429d89..9c76b53 100644
> --- a/arch/arm/kernel/asm-offsets.c
> +++ b/arch/arm/kernel/asm-offsets.c
> @@ -13,6 +13,7 @@
>  #include <linux/sched.h>
>  #include <linux/mm.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/kvm_host.h>
>  #include <asm/cacheflush.h>
>  #include <asm/glue-df.h>
>  #include <asm/glue-pf.h>
> @@ -144,5 +145,47 @@ int main(void)
>    DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL);
>    DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
>    DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
> +#ifdef CONFIG_KVM_ARM_HOST
> +  DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
> +  DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MIDR]));
> +  DEFINE(VCPU_MPIDR, offsetof(struct kvm_vcpu, arch.cp15[c0_MPIDR]));
> +  DEFINE(VCPU_SCTLR, offsetof(struct kvm_vcpu, arch.cp15[c1_SCTLR]));
> +  DEFINE(VCPU_CPACR, offsetof(struct kvm_vcpu, arch.cp15[c1_CPACR]));
> +  DEFINE(VCPU_TTBR0, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR0]));
> +  DEFINE(VCPU_TTBR1, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBR1]));
> +  DEFINE(VCPU_TTBCR, offsetof(struct kvm_vcpu, arch.cp15[c2_TTBCR]));
> +  DEFINE(VCPU_DACR, offsetof(struct kvm_vcpu, arch.cp15[c3_DACR]));
> +  DEFINE(VCPU_DFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_DFSR]));
> +  DEFINE(VCPU_IFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_IFSR]));
> +  DEFINE(VCPU_ADFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_ADFSR]));
> +  DEFINE(VCPU_AIFSR, offsetof(struct kvm_vcpu, arch.cp15[c5_AIFSR]));
> +  DEFINE(VCPU_DFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_DFAR]));
> +  DEFINE(VCPU_IFAR, offsetof(struct kvm_vcpu, arch.cp15[c6_IFAR]));
> +  DEFINE(VCPU_PRRR, offsetof(struct kvm_vcpu, arch.cp15[c10_PRRR]));
> +  DEFINE(VCPU_NMRR, offsetof(struct kvm_vcpu, arch.cp15[c10_NMRR]));
> +  DEFINE(VCPU_VBAR, offsetof(struct kvm_vcpu, arch.cp15[c12_VBAR]));
> +  DEFINE(VCPU_CID, offsetof(struct kvm_vcpu, arch.cp15[c13_CID]));
> +  DEFINE(VCPU_TID_URW, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URW]));
> +  DEFINE(VCPU_TID_URO, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_URO]));
> +  DEFINE(VCPU_TID_PRIV, offsetof(struct kvm_vcpu, arch.cp15[c13_TID_PRIV]));
> +  DEFINE(VCPU_REGS, offsetof(struct kvm_vcpu, arch.regs));
> +  DEFINE(VCPU_USR_REGS, offsetof(struct kvm_vcpu, arch.regs.usr_regs));
> +  DEFINE(VCPU_SVC_REGS, offsetof(struct kvm_vcpu, arch.regs.svc_regs));
> +  DEFINE(VCPU_ABT_REGS, offsetof(struct kvm_vcpu, arch.regs.abt_regs));
> +  DEFINE(VCPU_UND_REGS, offsetof(struct kvm_vcpu, arch.regs.und_regs));
> +  DEFINE(VCPU_IRQ_REGS, offsetof(struct kvm_vcpu, arch.regs.irq_regs));
> +  DEFINE(VCPU_FIQ_REGS, offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
> +  DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.regs.pc));
> +  DEFINE(VCPU_CPSR, offsetof(struct kvm_vcpu, arch.regs.cpsr));
> +  DEFINE(VCPU_IRQ_LINES, offsetof(struct kvm_vcpu, arch.irq_lines));
> +  DEFINE(VCPU_HSR, offsetof(struct kvm_vcpu, arch.hsr));
> +  DEFINE(VCPU_HDFAR, offsetof(struct kvm_vcpu, arch.hdfar));
> +  DEFINE(VCPU_HIFAR, offsetof(struct kvm_vcpu, arch.hifar));
> +  DEFINE(VCPU_HPFAR, offsetof(struct kvm_vcpu, arch.hpfar));
> +  DEFINE(VCPU_PC_IPA, offsetof(struct kvm_vcpu, arch.pc_ipa));
> +  DEFINE(VCPU_PC_IPA2, offsetof(struct kvm_vcpu, arch.pc_ipa2));
> +  DEFINE(VCPU_HYP_PC, offsetof(struct kvm_vcpu, arch.hyp_pc));
> +  DEFINE(KVM_VTTBR, offsetof(struct kvm, arch.vttbr));
> +#endif
>    return 0;
>  }
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index 437f0c4..db029bb 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -209,6 +209,7 @@ __dabt_svc:
>  ENDPROC(__dabt_svc)
>
>  	.align 5
> +	.globl __irq_svc
>  __irq_svc:
>  	svc_entry
>  	irq_handler
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 8b024ee..4687690 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -37,12 +37,19 @@
>  #include <asm/mman.h>
>  #include <asm/idmap.h>
>  #include <asm/tlbflush.h>
> +#include <asm/cputype.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_mmu.h>
> +#include <asm/kvm_emulate.h>
>
>  static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
>
> +/* The VMID used in the VTTBR */
> +static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
> +static u8 kvm_next_vmid;
> +DEFINE_SPINLOCK(kvm_vmid_lock);

static, too.

> +
> +
> +/**
> + * check_new_vmid_gen - check that the VMID is still valid
> + * @kvm:	The VM's VMID to check
> + *
> + * return true if there is a new generation of VMIDs being used
> + *
> + * The hardware supports only 256 values with the value zero reserved for the
> + * host, so we check if an assigned value belongs to a previous generation,
> + * which requires us to assign a new value. If we're the first to use a
> + * VMID for the new generation, we must flush necessary caches and TLBs on all
> + * CPUs.
> + */
> +static bool check_new_vmid_gen(struct kvm *kvm)
> +{
> +	return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
> +}

Better have the name indicate what a true return value means, like
'need_new_vmid_gen()'.

> +
> +/**
> + * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
> + * @kvm:	The guest that we are about to run
> + *
> + * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
> + * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
> + * caches and TLBs.
> + */
> +static void update_vttbr(struct kvm *kvm)
> +{
> +	phys_addr_t pgd_phys;
> +
> +	if (!check_new_vmid_gen(kvm))
> +		return;
> +
> +	spin_lock(&kvm_vmid_lock);
> +
> +	/* First user of a new VMID generation? */
> +	if (unlikely(kvm_next_vmid == 0)) {
> +		atomic64_inc(&kvm_vmid_gen);
> +		kvm_next_vmid = 1;
> +
> +		/* This does nothing on UP */
> +		smp_call_function(reset_vm_context, NULL, 1);
> +
> +		/*
> +		 * On SMP we know no other CPUs can use this CPU's or
> +		 * each other's VMID since the kvm_vmid_lock blocks
> +		 * them from reentry to the guest.
> +		 */
> +
> +		reset_vm_context(NULL);

on_each_cpu(reset_vm_context, NULL, 1) will combine the two calls
above.

> +	}
> +
> +	kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
> +	kvm->arch.vmid = kvm_next_vmid;
> +	kvm_next_vmid++;
> +
> +	/* update vttbr to be used with the new vmid */
> +	pgd_phys = virt_to_phys(kvm->arch.pgd);
> +	kvm->arch.vttbr = pgd_phys & ((1LLU << 40) - 1)
> +			  & ~((2 << VTTBR_X) - 1);
> +	kvm->arch.vttbr |= (u64)(kvm->arch.vmid) << 48;
> +
> +	spin_unlock(&kvm_vmid_lock);
> +}
> +
> +/*
> + * Return 0 to return to guest, < 0 on error, exit_reason (> 0) on proper
> + * exit to QEMU.
> + */
> +static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
> +		       int exception_index)
> +{
> +	return -EINVAL;

x86 returns KVM_EXIT_INTERNAL_ERROR when it encounters an unhandlable
exit. -EINVAL indicates that the user has done something wrong, which
isn't the case here.

> +}
> +
> +/*
> + * Return 0 to proceed with guest entry
> + */
> +static int vcpu_pre_guest_enter(struct kvm_vcpu *vcpu, int *exit_reason)
> +{
> +	if (signal_pending(current)) {
> +		*exit_reason = KVM_EXIT_INTR;
> +		return -EINTR;
> +	}
> +
> +	if (check_new_vmid_gen(vcpu->kvm))
> +		return 1;
> +
> +	BUG_ON(__vcpu_mode(*vcpu_cpsr(vcpu)) == 0xf);
> +
>  	return 0;
>  }
>
> +/**
> + * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
> + * @vcpu:	The VCPU pointer
> + * @run:	The kvm_run structure pointer used for userspace state exchange
> + *
> + * This function is called through the VCPU_RUN ioctl called from user space.
> + * It will execute VM code in a loop until the time slice for the process is
> + * used or some emulation is needed from user space in which case the
> + * function will return with return value 0 and with the kvm_run structure
> + * filled in with the required data for the requested emulation.
> + */
>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  {
> -	return -EINVAL;
> +	int ret = 0;
> +	int exit_reason;
> +	sigset_t sigsaved;
> +
> +	if (vcpu->sigset_active)
> +		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
> +

We should move this to common code. But I don't mind if this is done
post merge.

> +	exit_reason = KVM_EXIT_UNKNOWN;
> +	while (exit_reason == KVM_EXIT_UNKNOWN) {

Looping over 'ret' is more in line with x86 and clearer IMO. x86 uses
the convention: < 0 -> return to userspace with error, 0 -> return to
userspace, 1 -> loop.
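Something along these lines (untested sketch, mine, reusing the
patch's names; tracing, guest-time accounting and the BUG_ON elided,
and it assumes handle_exit() is reworked to set run->exit_reason
itself and return < 0 / 0 / 1 as above):

	int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
	{
		int ret = 1;
		sigset_t sigsaved;

		if (vcpu->sigset_active)
			sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

		while (ret > 0) {
			cond_resched();
			update_vttbr(vcpu->kvm);

			local_irq_disable();

			if (signal_pending(current)) {
				/* ret < 0 ends the loop, no 'continue' dance */
				local_irq_enable();
				run->exit_reason = KVM_EXIT_INTR;
				ret = -EINTR;
				break;
			}

			if (check_new_vmid_gen(vcpu->kvm)) {
				/* retry with a VMID from the new generation */
				local_irq_enable();
				continue;
			}

			ret = __kvm_vcpu_run(vcpu);	/* enter the guest */
			local_irq_enable();

			/* < 0: error, 0: exit to userspace, 1: loop */
			ret = handle_exit(vcpu, run, ret);
		}

		if (vcpu->sigset_active)
			sigprocmask(SIG_SETMASK, &sigsaved, NULL);

		return ret;
	}
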
> +		/*
> +		 * Check conditions before entering the guest
> +		 */
> +		cond_resched();
> +
> +		update_vttbr(vcpu->kvm);
> +
> +		local_irq_disable();
> +
> +		/* Re-check atomic conditions */
> +		ret = vcpu_pre_guest_enter(vcpu, &exit_reason);
> +		if (ret != 0) {
> +			local_irq_enable();
> +			preempt_enable();
> +			continue;

See - you continue, only to break out of the loop due to a side effect
on exit_reason.

> +		}
> +
> +		/**************************************************************
> +		 * Enter the guest
> +		 */
> +		trace_kvm_entry(vcpu->arch.regs.pc);
> +		kvm_guest_enter();
> +		vcpu->mode = IN_GUEST_MODE;
> +
> +		ret = __kvm_vcpu_run(vcpu);
> +
> +		vcpu->mode = OUTSIDE_GUEST_MODE;
> +		vcpu->stat.exits++;

The tracepoint above should be sufficient for statistics.

> +		kvm_guest_exit();
> +		trace_kvm_exit(vcpu->arch.regs.pc);
> +		local_irq_enable();
> +
> +		/*
> +		 * Back from guest
> +		 *************************************************************/
> +
> +		ret = handle_exit(vcpu, run, ret);
> +		if (ret < 0) {
> +			kvm_err("Error in handle_exit\n");
> +			break;
> +		} else {
> +			exit_reason = ret;	/* 0 == KVM_EXIT_UNKNOWN */
> +		}
> +	}
> +
> +	if (vcpu->sigset_active)
> +		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
> +
> +	run->exit_reason = exit_reason;
> +	return ret;
>  }

-- 
error compiling committee.c: too many arguments to function