Re: [PATCH 16/59] KVM: arm64: nv: Save/Restore vEL2 sysregs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 6/21/19 10:38 AM, Marc Zyngier wrote:
> From: Andre Przywara <andre.przywara@xxxxxxx>
>
> Whenever we need to restore the guest's system registers to the CPU, we
> now need to take care of the EL2 system registers as well. Most of them
> are accessed via traps only, but some have an immediate effect and also
> a guest running in VHE mode would expect them to be accessible via their
> EL1 encoding, which we do not trap.
>
> Split the current __sysreg_{save,restore}_el1_state() functions into
> handling common sysregs, then differentiate between the guest running in
> vEL2 and vEL1.
>
> For vEL2 we write the virtual EL2 registers with an identical format directly
> into their EL1 counterpart, and translate the few registers that have a
> different format for the same effect on the execution when running a
> non-VHE guest guest hypervisor.
>
>   [ Commit message reworked and many bug fixes applied by Marc Zyngier
>     and Christoffer Dall. ]
>
> Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx>
> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
> Signed-off-by: Christoffer Dall <christoffer.dall@xxxxxxx>
> ---
>  arch/arm64/kvm/hyp/sysreg-sr.c | 160 +++++++++++++++++++++++++++++++--
>  1 file changed, 153 insertions(+), 7 deletions(-)
>
> diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
> index 62866a68e852..2abb9c3ff24f 100644
> --- a/arch/arm64/kvm/hyp/sysreg-sr.c
> +++ b/arch/arm64/kvm/hyp/sysreg-sr.c
> @@ -22,6 +22,7 @@
>  #include <asm/kvm_asm.h>
>  #include <asm/kvm_emulate.h>
>  #include <asm/kvm_hyp.h>
> +#include <asm/kvm_nested.h>
>  
>  /*
>   * Non-VHE: Both host and guest must save everything.
> @@ -51,11 +52,9 @@ static void __hyp_text __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
>  }
>  
> -static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
> +static void __hyp_text __sysreg_save_vel1_state(struct kvm_cpu_context *ctxt)
>  {
> -	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
>  	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg_el1(SYS_SCTLR);
> -	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
>  	ctxt->sys_regs[CPACR_EL1]	= read_sysreg_el1(SYS_CPACR);
>  	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg_el1(SYS_TTBR0);
>  	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg_el1(SYS_TTBR1);
> @@ -69,14 +68,58 @@ static void __hyp_text __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
>  	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg_el1(SYS_CONTEXTIDR);
>  	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(SYS_AMAIR);
>  	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(SYS_CNTKCTL);
> -	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
> -	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
>  
>  	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
>  	ctxt->gp_regs.elr_el1		= read_sysreg_el1(SYS_ELR);
>  	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(SYS_SPSR);
>  }
>  
> +static void __sysreg_save_vel2_state(struct kvm_cpu_context *ctxt)
> +{
> +	ctxt->sys_regs[ESR_EL2]		= read_sysreg_el1(SYS_ESR);
> +	ctxt->sys_regs[AFSR0_EL2]	= read_sysreg_el1(SYS_AFSR0);
> +	ctxt->sys_regs[AFSR1_EL2]	= read_sysreg_el1(SYS_AFSR1);
> +	ctxt->sys_regs[FAR_EL2]		= read_sysreg_el1(SYS_FAR);
> +	ctxt->sys_regs[MAIR_EL2]	= read_sysreg_el1(SYS_MAIR);
> +	ctxt->sys_regs[VBAR_EL2]	= read_sysreg_el1(SYS_VBAR);
> +	ctxt->sys_regs[CONTEXTIDR_EL2]	= read_sysreg_el1(SYS_CONTEXTIDR);
> +	ctxt->sys_regs[AMAIR_EL2]	= read_sysreg_el1(SYS_AMAIR);
> +
> +	/*
> +	 * In VHE mode those registers are compatible between EL1 and EL2,
> +	 * and the guest uses the _EL1 versions on the CPU naturally.
> +	 * So we save them into their _EL2 versions here.
> +	 * For nVHE mode we trap accesses to those registers, so our
> +	 * _EL2 copy in sys_regs[] is always up-to-date and we don't need
> +	 * to save anything here.
> +	 */
> +	if (__vcpu_el2_e2h_is_set(ctxt)) {
> +		ctxt->sys_regs[SCTLR_EL2]	= read_sysreg_el1(SYS_SCTLR);
> +		ctxt->sys_regs[CPTR_EL2]	= read_sysreg_el1(SYS_CPACR);
> +		ctxt->sys_regs[TTBR0_EL2]	= read_sysreg_el1(SYS_TTBR0);
> +		ctxt->sys_regs[TTBR1_EL2]	= read_sysreg_el1(SYS_TTBR1);
> +		ctxt->sys_regs[TCR_EL2]		= read_sysreg_el1(SYS_TCR);
> +		ctxt->sys_regs[CNTHCTL_EL2]	= read_sysreg_el1(SYS_CNTKCTL);
> +	}

This can break guests that run with VHE on, then disable it. I stumbled into
this while working on kvm-unit-tests, which uses TTBR0 for the translation
tables. Let's consider the following scenario:

1. Guest sets HCR_EL2.E2H
2. Guest programs translation tables in TTBR0_EL1, which should reflect in
TTBR0_EL2.
3. Guest enabled MMU and does stuff.
4. Guest disables MMU and clears HCR_EL2.E2H
5. Guest turns MMU on. It doesn't change TTBR0_EL2, because it will use the same
translation tables as when running with E2H set.
6. The vcpu gets scheduled out. E2H is not set, so the value that the guest
programmed in hardware TTBR0_EL1 won't be copied to virtual TTBR0_EL2.
7. The vcpu gets scheduled back in. KVM will write the reset value for virtual
TTBR0_EL2 (which is 0x0).
8. The guest hangs.

I think this is actually a symptom of a deeper issue. When E2H is cleared, the
values that the guest wrote to the EL1 registers aren't immediately reflected in
the virtual EL2 registers, as it happens on real hardware. Instead, some of the
hardware values from the EL1 registers are copied to the corresponding EL2
registers on the next vcpu_put, which happens at a later time.

I am thinking that something like this will fix the issues (it did fix disabling
VHE in kvm-unit-tests):

diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1d896113f1f8..f2b5a39762d0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -333,7 +333,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
                 * to reverse-translate virtual EL2 system registers for a
                 * non-VHE guest hypervisor.
                 */
-               __vcpu_sys_reg(vcpu, reg) = val;
+               if (reg != HCR_EL2)
+                       __vcpu_sys_reg(vcpu, reg) = val;
 
                switch (reg) {
                case ELR_EL2:
@@ -370,7 +371,17 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int
reg)
                return;
 
 memory_write:
-        __vcpu_sys_reg(vcpu, reg) = val;
+       if (reg == HCR_EL2 && vcpu_el2_e2h_is_set(vcpu) && !(val & HCR_E2H)) {
+               preempt_disable();
+               kvm_arch_vcpu_put(vcpu);
+
+               __vcpu_sys_reg(vcpu, reg) = val;
+
+               kvm_arch_vcpu_load(vcpu, smp_processor_id());
+               preempt_enable();
+       } else {
+                __vcpu_sys_reg(vcpu, reg) = val;
+       }
 }
 
 /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */

I don't think there's any need to convert EL1 registers to their non-vhe EL2
format because of how RES0/RES1 is defined in the architecture glossary (ARM DDI
0487E.a, page 7893 for RES0 and 7894 for RES1):

"If a bit is RES0 only in some contexts:
A read of the bit must return the value last successfully written to the bit, by
either a direct or an indirect write, regardless of the use of the register when
the bit was written
[..]
While the use of the register is such that the bit is described as RES0, the
value of the bit must have no effect on the operation of the PE, other than
determining the value read back from that bit, unless this Manual explicitly
defines additional properties for the bit"

We have the translate functions that should take care of converting the non-vhe
EL2 format to the hardware EL1 format.

As an aside, the diff looks weird because the vcpu_write_sys_reg is very
complex, there are a LOT of exit points from the function, and the register
value is written twice for some registers. I think it's worth considering making
the function simpler, maybe splitting it into two separate functions, one for
EL2 registers, one for regular registers.

Here's the kvm-unit-tests diff that I used to spot the bug. It's very far from
being correct, but the test is able to finish with the fix (it hangs otherwise).
You can apply it on top of 2130fd4154ad ("tscdeadline_latency: Check condition
first before loop"):

diff --git a/arm/cstart64.S b/arm/cstart64.S
index b0e8baa1a23a..01357b3b116b 100644
--- a/arm/cstart64.S
+++ b/arm/cstart64.S
@@ -51,6 +51,18 @@ start:
     b    1b
 
 1:
+    mrs    x4, CurrentEL
+    cmp    x4, CurrentEL_EL2
+    b.ne    1f
+    mrs    x4, mpidr_el1
+    msr    vmpidr_el2, x4
+    mrs    x4, midr_el1
+    msr    vpidr_el2, x4
+    ldr    x4, =(HCR_EL2_TGE | HCR_EL2_E2H)
+    msr    hcr_el2, x4
+    isb
+
+1:
     /* set up stack */
     mov    x4, #1
     msr    spsel, x4
@@ -101,6 +113,18 @@ get_mmu_off:
 
 .globl secondary_entry
 secondary_entry:
+    mrs    x0, CurrentEL
+    cmp    x0, CurrentEL_EL2
+    b.ne    1f
+    mrs    x0, mpidr_el1
+    msr    vmpidr_el2, x0
+    mrs    x0, midr_el1
+    msr    vpidr_el2, x0
+    ldr    x0, =(HCR_EL2_TGE | HCR_EL2_E2H)
+    msr    hcr_el2, x0
+    isb
+
+1:
     /* Enable FP/ASIMD */
     mov    x0, #(3 << 20)
     msr    cpacr_el1, x0
@@ -194,6 +218,33 @@ asm_mmu_enable:
 
     ret
 
+asm_mmu_enable_hyp:
+       ic      iallu
+       tlbi    alle2is
+       dsb     ish
+
+        /* TCR */
+       ldr     x1, =TCR_EL2_RES1 |                     \
+                    TCR_T0SZ(VA_BITS) |                \
+                    TCR_TG0_64K |                      \
+                    TCR_IRGN0_WBWA | TCR_ORGN0_WBWA |  \
+                    TCR_SH0_IS
+       mrs     x2, id_aa64mmfr0_el1
+       bfi     x1, x2, #TCR_EL2_PS_SHIFT, #3
+       msr     tcr_el2, x1
+
+       /* Same MAIR and TTBR0 as in VHE mode */
+
+       /* SCTLR */
+       ldr     x1, =SCTLR_EL2_RES1 |                   \
+                    SCTLR_EL2_C |                      \
+                    SCTLR_EL2_I |                      \
+                    SCTLR_EL2_M
+       msr     sctlr_el2, x1
+       isb
+
+       ret
+
 .globl asm_mmu_disable
 asm_mmu_disable:
     mrs    x0, sctlr_el1
@@ -202,6 +253,18 @@ asm_mmu_disable:
     isb
     ret
 
+.globl asm_disable_vhe
+asm_disable_vhe:
+    str     x30, [sp, #-16]!
+
+    bl      asm_mmu_disable
+    msr     hcr_el2, xzr
+    isb
+    bl      asm_mmu_enable_hyp
+
+    ldr     x30, [sp], #16
+    ret
+
 /*
  * Vectors
  * Adapted from arch/arm64/kernel/entry.S
diff --git a/arm/selftest.c b/arm/selftest.c
index 28a17f7a7531..68a18036221b 100644
--- a/arm/selftest.c
+++ b/arm/selftest.c
@@ -287,6 +287,12 @@ static void user_psci_system_off(struct pt_regs *regs,
unsigned int esr)
 {
     __user_psci_system_off();
 }
+
+extern void asm_disable_vhe(void);
+static void check_el2(void)
+{
+    asm_disable_vhe();
+}
 #endif
 
 static void check_vectors(void *arg __unused)
@@ -369,6 +375,10 @@ int main(int argc, char **argv)
         report("PSCI version", psci_check());
         on_cpus(cpu_report, NULL);
 
+    } else if (strcmp(argv[1], "el2") == 0) {
+
+        check_el2();
+
     } else {
         printf("Unknown subtest\n");
         abort();
diff --git a/lib/arm/psci.c b/lib/arm/psci.c
index c3d399064ae3..5ef1c0386ce1 100644
--- a/lib/arm/psci.c
+++ b/lib/arm/psci.c
@@ -16,7 +16,7 @@ int psci_invoke(unsigned long function_id, unsigned long arg0,
         unsigned long arg1, unsigned long arg2)
 {
     asm volatile(
-        "hvc #0"
+        "smc #0"
     : "+r" (function_id)
     : "r" (arg0), "r" (arg1), "r" (arg2));
     return function_id;
diff --git a/lib/arm64/asm/pgtable-hwdef.h b/lib/arm64/asm/pgtable-hwdef.h
index 045a3ce12645..6b80e34dda0c 100644
--- a/lib/arm64/asm/pgtable-hwdef.h
+++ b/lib/arm64/asm/pgtable-hwdef.h
@@ -95,18 +95,42 @@
 /*
  * TCR flags.
  */
-#define TCR_TxSZ(x)        (((UL(64) - (x)) << 16) | ((UL(64) - (x)) << 0))
-#define TCR_IRGN_NC        ((UL(0) << 8) | (UL(0) << 24))
-#define TCR_IRGN_WBWA        ((UL(1) << 8) | (UL(1) << 24))
-#define TCR_IRGN_WT        ((UL(2) << 8) | (UL(2) << 24))
-#define TCR_IRGN_WBnWA        ((UL(3) << 8) | (UL(3) << 24))
-#define TCR_IRGN_MASK        ((UL(3) << 8) | (UL(3) << 24))
-#define TCR_ORGN_NC        ((UL(0) << 10) | (UL(0) << 26))
-#define TCR_ORGN_WBWA        ((UL(1) << 10) | (UL(1) << 26))
-#define TCR_ORGN_WT        ((UL(2) << 10) | (UL(2) << 26))
-#define TCR_ORGN_WBnWA        ((UL(3) << 10) | (UL(3) << 26))
-#define TCR_ORGN_MASK        ((UL(3) << 10) | (UL(3) << 26))
-#define TCR_SHARED        ((UL(3) << 12) | (UL(3) << 28))
+#define TCR_T0SZ(x)        ((UL(64) - (x)) << 0)
+#define TCR_T1SZ(x)        ((UL(64) - (x)) << 16)
+#define TCR_TxSZ(x)        (TCR_T0SZ(x) | TCR_T1SZ(x))
+#define TCR_IRGN0_NC        (UL(0) << 8)
+#define TCR_IRGN1_NC        (UL(0) << 24)
+#define TCR_IRGN_NC        (TCR_IRGN0_NC | TCR_IRGN1_NC)
+#define TCR_IRGN0_WBWA        (UL(1) << 8)
+#define TCR_IRGN1_WBWA        (UL(1) << 24)
+#define TCR_IRGN_WBWA        (TCR_IRGN0_WBWA | TCR_IRGN1_WBWA)
+#define TCR_IRGN0_WT        (UL(2) << 8)
+#define TCR_IRGN1_WT        (UL(2) << 24)
+#define TCR_IRGN_WT        (TCR_IRGN0_WT | TCR_IRGN1_WT)
+#define TCR_IRGN0_WBnWA        (UL(3) << 8)
+#define TCR_IRGN1_WBnWA        (UL(3) << 24)
+#define TCR_IRGN_WBnWA        (TCR_IRGN0_WBnWA | TCR_IRGN1_WBnWA)
+#define TCR_IRGN0_MASK        (UL(3) << 8)
+#define TCR_IRGN1_MASK        (UL(3) << 24)
+#define TCR_IRGN_MASK        (TCR_IRGN0_MASK | TCR_IRGN1_MASK)
+#define TCR_ORGN0_NC        (UL(0) << 10)
+#define TCR_ORGN1_NC        (UL(0) << 26)
+#define TCR_ORGN_NC        (TCR_ORGN0_NC | TCR_ORGN1_NC)
+#define TCR_ORGN0_WBWA        (UL(1) << 10)
+#define TCR_ORGN1_WBWA        (UL(1) << 26)
+#define TCR_ORGN_WBWA        (TCR_ORGN0_WBWA | TCR_ORGN1_WBWA)
+#define TCR_ORGN0_WT        (UL(2) << 10)
+#define TCR_ORGN1_WT        (UL(2) << 26)
+#define TCR_ORGN_WT        (TCR_ORGN0_WT | TCR_ORGN1_WT)
+#define TCR_ORGN0_WBnWA        (UL(3) << 8)
+#define TCR_ORGN1_WBnWA        (UL(3) << 24)
+#define TCR_ORGN_WBnWA        (TCR_ORGN0_WBnWA | TCR_ORGN1_WBnWA)
+#define TCR_ORGN0_MASK        (UL(3) << 10)
+#define TCR_ORGN1_MASK        (UL(3) << 26)
+#define TCR_ORGN_MASK        (TCR_ORGN0_MASK | TCR_ORGN1_MASK)
+#define TCR_SH0_IS        (UL(3) << 12)
+#define TCR_SH1_IS        (UL(3) << 28)
+#define TCR_SHARED        (TCR_SH0_IS | TCR_SH1_IS)
 #define TCR_TG0_4K        (UL(0) << 14)
 #define TCR_TG0_64K        (UL(1) << 14)
 #define TCR_TG0_16K        (UL(2) << 14)
@@ -116,6 +140,9 @@
 #define TCR_ASID16        (UL(1) << 36)
 #define TCR_TBI0        (UL(1) << 37)
 
+#define TCR_EL2_RES1        ((UL(1) << 31) | (UL(1) << 23))
+#define TCR_EL2_PS_SHIFT    16
+
 /*
  * Memory types available.
  */
diff --git a/lib/arm64/asm/processor.h b/lib/arm64/asm/processor.h
index 1d9223f728a5..b2136acda743 100644
--- a/lib/arm64/asm/processor.h
+++ b/lib/arm64/asm/processor.h
@@ -16,6 +16,16 @@
 #define SCTLR_EL1_A    (1 << 1)
 #define SCTLR_EL1_M    (1 << 0)
 
+#define HCR_EL2_TGE    (1 << 27)
+#define HCR_EL2_E2H    (1 << 34)
+
+#define SCTLR_EL2_RES1    ((UL(3) << 28) | (UL(3) << 22) |    \
+            (UL(1) << 18) | (UL(1) << 16) |        \
+            (UL(1) << 11) | (UL(3) << 4))
+#define SCTLR_EL2_I    SCTLR_EL1_I
+#define SCTLR_EL2_C    SCTLR_EL1_C
+#define SCTLR_EL2_M    SCTLR_EL1_M
+
 #ifndef __ASSEMBLY__
 #include <asm/ptrace.h>
 #include <asm/esr.h>


To run it:

lkvm run -f selftest.flat -c 1 -m 128 -p el2 --nested --irqchip gicv3 --console
serial

Thanks,
Alex



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux