On 6/21/19 10:37 AM, Marc Zyngier wrote: > From: Andre Przywara <andre.przywara@xxxxxxx> > > KVM internally uses accessor functions when reading or writing the > guest's system registers. This takes care of accessing either the stored > copy or using the "live" EL1 system registers when the host uses VHE. > > With the introduction of virtual EL2 we add a bunch of EL2 system > registers, which now must also be taken care of: > - If the guest is running in vEL2, and we access an EL1 sysreg, we must > revert to the stored version of that, and not use the CPU's copy. > - If the guest is running in vEL1, and we access an EL2 sysreg, we must > also use the stored version, since the CPU carries the EL1 copy. > - Some EL2 system registers are supposed to affect the current execution > of the system, so we need to put them into their respective EL1 > counterparts. For this we need to define a mapping between the two. > This is done using the newly introduced struct el2_sysreg_map. > - Some EL2 system registers have a different format than their EL1 > counterpart, so we need to translate them before writing them to the > CPU. This is done using an (optional) translate function in the map. > - There are the three special registers SP_EL2, SPSR_EL2 and ELR_EL2, > which need some separate handling. > > All of these cases are now wrapped into the existing accessor functions, > so KVM users wouldn't need to care whether they access EL2 or EL1 > registers and also which state the guest is in. > > This handles what was formerly known as the "shadow state" dynamically, > without requiring a separate copy for each vCPU EL. > > Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx> > Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx> > --- > arch/arm64/include/asm/kvm_emulate.h | 6 + > arch/arm64/include/asm/kvm_host.h | 5 + > arch/arm64/kvm/sys_regs.c | 163 +++++++++++++++++++++++++++ > 3 files changed, 174 insertions(+) > > diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h > index c43aac5fed69..f37006b6eec4 100644 > --- a/arch/arm64/include/asm/kvm_emulate.h > +++ b/arch/arm64/include/asm/kvm_emulate.h > @@ -70,6 +70,12 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu); > int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2); > int kvm_inject_nested_irq(struct kvm_vcpu *vcpu); > > +u64 translate_tcr(u64 tcr); > +u64 translate_cptr(u64 tcr); > +u64 translate_sctlr(u64 tcr); > +u64 translate_ttbr0(u64 tcr); > +u64 translate_cnthctl(u64 tcr); > + > static inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) > { > return !(vcpu->arch.hcr_el2 & HCR_RW); > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h > index 2d4290d2513a..dae9c42a7219 100644 > --- a/arch/arm64/include/asm/kvm_host.h > +++ b/arch/arm64/include/asm/kvm_host.h > @@ -217,6 +217,11 @@ enum vcpu_sysreg { > NR_SYS_REGS /* Nothing after this line! */ > }; > > +static inline bool sysreg_is_el2(int reg) > +{ > + return reg >= FIRST_EL2_SYSREG && reg < NR_SYS_REGS; > +} > + > /* 32bit mapping */ > #define c0_MPIDR (MPIDR_EL1 * 2) /* MultiProcessor ID Register */ > #define c0_CSSELR (CSSELR_EL1 * 2)/* Cache Size Selection Register */ > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c > index 693dd063c9c2..d024114da162 100644 > --- a/arch/arm64/kvm/sys_regs.c > +++ b/arch/arm64/kvm/sys_regs.c > @@ -76,11 +76,142 @@ static bool write_to_read_only(struct kvm_vcpu *vcpu, > return false; > } > > +static u64 tcr_el2_ips_to_tcr_el1_ps(u64 tcr_el2) > +{ > + return ((tcr_el2 & TCR_EL2_PS_MASK) >> TCR_EL2_PS_SHIFT) > + << TCR_IPS_SHIFT; > +} > + > +u64 translate_tcr(u64 tcr) > +{ > + return TCR_EPD1_MASK | /* disable TTBR1_EL1 */ > + ((tcr & TCR_EL2_TBI) ? TCR_TBI0 : 0) | > + tcr_el2_ips_to_tcr_el1_ps(tcr) | > + (tcr & TCR_EL2_TG0_MASK) | > + (tcr & TCR_EL2_ORGN0_MASK) | > + (tcr & TCR_EL2_IRGN0_MASK) | > + (tcr & TCR_EL2_T0SZ_MASK); > +} > + > +u64 translate_cptr(u64 cptr_el2) > +{ > + u64 cpacr_el1 = 0; > + > + if (!(cptr_el2 & CPTR_EL2_TFP)) > + cpacr_el1 |= CPACR_EL1_FPEN; > + if (cptr_el2 & CPTR_EL2_TTA) > + cpacr_el1 |= CPACR_EL1_TTA; > + if (!(cptr_el2 & CPTR_EL2_TZ)) > + cpacr_el1 |= CPACR_EL1_ZEN; > + > + return cpacr_el1; > +} > + > +u64 translate_sctlr(u64 sctlr) > +{ > + /* Bit 20 is RES1 in SCTLR_EL1, but RES0 in SCTLR_EL2 */ > + return sctlr | BIT(20); > +} > + > +u64 translate_ttbr0(u64 ttbr0) > +{ > + /* Force ASID to 0 (ASID 0 or RES0) */ > + return ttbr0 & ~GENMASK_ULL(63, 48); > +} > + > +u64 translate_cnthctl(u64 cnthctl) > +{ > + return ((cnthctl & 0x3) << 10) | (cnthctl & 0xfc); > +} > + > +#define EL2_SYSREG(el2, el1, translate) \ > + [el2 - FIRST_EL2_SYSREG] = { el2, el1, translate } > +#define PURE_EL2_SYSREG(el2) \ > + [el2 - FIRST_EL2_SYSREG] = { el2,__INVALID_SYSREG__, NULL } > +/* > + * Associate vEL2 registers to their EL1 counterparts on the CPU. > + * The translate function can be NULL, when the register layout is identical. > + */ > +struct el2_sysreg_map { > + int sysreg; /* EL2 register index into the array above */ > + int mapping; /* associated EL1 register */ > + u64 (*translate)(u64 value); > +} nested_sysreg_map[NR_SYS_REGS - FIRST_EL2_SYSREG] = { > + PURE_EL2_SYSREG( VPIDR_EL2 ), > + PURE_EL2_SYSREG( VMPIDR_EL2 ), > + PURE_EL2_SYSREG( ACTLR_EL2 ), > + PURE_EL2_SYSREG( HCR_EL2 ), > + PURE_EL2_SYSREG( MDCR_EL2 ), > + PURE_EL2_SYSREG( HSTR_EL2 ), > + PURE_EL2_SYSREG( HACR_EL2 ), > + PURE_EL2_SYSREG( VTTBR_EL2 ), > + PURE_EL2_SYSREG( VTCR_EL2 ), > + PURE_EL2_SYSREG( RVBAR_EL2 ), > + PURE_EL2_SYSREG( RMR_EL2 ), > + PURE_EL2_SYSREG( TPIDR_EL2 ), > + PURE_EL2_SYSREG( CNTVOFF_EL2 ), > + PURE_EL2_SYSREG( CNTHCTL_EL2 ), I don't think having CNTHCTL_EL2 as a "pure" EL2 register is the right approach. More details below. > + PURE_EL2_SYSREG( HPFAR_EL2 ), > + EL2_SYSREG( SCTLR_EL2, SCTLR_EL1, translate_sctlr ), > + EL2_SYSREG( CPTR_EL2, CPACR_EL1, translate_cptr ), > + EL2_SYSREG( TTBR0_EL2, TTBR0_EL1, translate_ttbr0 ), > + EL2_SYSREG( TTBR1_EL2, TTBR1_EL1, NULL ), > + EL2_SYSREG( TCR_EL2, TCR_EL1, translate_tcr ), > + EL2_SYSREG( VBAR_EL2, VBAR_EL1, NULL ), > + EL2_SYSREG( AFSR0_EL2, AFSR0_EL1, NULL ), > + EL2_SYSREG( AFSR1_EL2, AFSR1_EL1, NULL ), > + EL2_SYSREG( ESR_EL2, ESR_EL1, NULL ), > + EL2_SYSREG( FAR_EL2, FAR_EL1, NULL ), > + EL2_SYSREG( MAIR_EL2, MAIR_EL1, NULL ), > + EL2_SYSREG( AMAIR_EL2, AMAIR_EL1, NULL ), > +}; > + > +static > +const struct el2_sysreg_map *find_el2_sysreg(const struct el2_sysreg_map *map, > + int reg) > +{ > + const struct el2_sysreg_map *entry; > + > + if (!sysreg_is_el2(reg)) > + return NULL; > + > + entry = &nested_sysreg_map[reg - FIRST_EL2_SYSREG]; > + if (entry->sysreg == __INVALID_SYSREG__) > + return NULL; > + > + return entry; > +} > + > u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) > { > + > if (!vcpu->arch.sysregs_loaded_on_cpu) > goto immediate_read; > > + if (unlikely(sysreg_is_el2(reg))) { > + const struct el2_sysreg_map *el2_reg; > + > + if (!is_hyp_ctxt(vcpu)) > + goto immediate_read; > + > + el2_reg = find_el2_sysreg(nested_sysreg_map, reg); > + if (el2_reg) { > + /* > + * If this register does not have an EL1 counterpart, > + * then read the stored EL2 version. > + */ > + if (el2_reg->mapping == __INVALID_SYSREG__) > + goto immediate_read; With CNTHCTL_EL2 as a "pure" EL2 register, reads (and writes, in vcpu_write_sys_reg) will go to memory. However, when vhe is enabled, CNTHCTL_EL2 has the same format as CNTKCTL_EL1 and reads/writes to CNTKCTL_EL1 should be reflected in the value of CNTHCTL_EL2 according to the pseudocode for accessing CNTKCTL_EL1 (ARM DDI 0487D.b, page D12-3496). This doesn't happen for vhe guest hypervisors because EL2 is declared as a "pure" EL2 register. I have tested that with this hack for reads (function chosen at random): diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 04e554cae3a2..3a6260745680 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -653,8 +653,22 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) */ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { + u64 cntkctl, cnthctl; int ret; + /* Check that CNTKCTL_EL1 writes are redirected to CNTHCTL_EL2 */ + if (has_vhe()) { + /* Check that CNTKCTL_EL1 reads are redirected to CNTHCTL_EL2 */ + cntkctl = read_sysreg(cntkctl_el1); + cnthctl = cntkctl ^ 1; + write_sysreg(cnthctl, cnthctl_el2); + cntkctl = read_sysreg(cntkctl_el1); + BUG_ON(cntkctl != cnthctl); + /* Restore original value */ + cnthctl ^= 1; + write_sysreg(cnthctl, cnthctl_el2); + } + if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; and this hack for writes: diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 04e554cae3a2..1cfe47b6fa99 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -653,8 +653,21 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) */ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { + u64 cntkctl, cnthctl; int ret; + /* Check that CNTKCTL_EL1 writes are redirected to CNTHCTL_EL2 */ + if (has_vhe()) { + cntkctl = read_sysreg(cntkctl_el1); + cntkctl ^= 1; + write_sysreg(cntkctl, cntkctl_el1); + cnthctl = read_sysreg(cnthctl_el2); + BUG_ON(cntkctl != cnthctl); + /* Restore original value */ + cntkctl ^= 1; + write_sysreg(cntkctl, cntkctl_el1); + } + if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; The BUG_ON is not triggered on baremetal, but is triggered when running as a L1 guest hypervisor. Another issue with CNTHCTL_EL2 being a "pure" EL2 register is that with non-vhe guests, writes to CNTHCTL_EL2 aren't translated and written to CNTKCTL_EL1. This patch seems to fix the issues with vhe and non-vhe guest hypervisors (tested with booting a L2 guest): diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1235a88ec575..bd21f0f45a86 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -153,7 +153,6 @@ struct el2_sysreg_map { PURE_EL2_SYSREG( RVBAR_EL2 ), PURE_EL2_SYSREG( RMR_EL2 ), PURE_EL2_SYSREG( TPIDR_EL2 ), - PURE_EL2_SYSREG( CNTHCTL_EL2 ), PURE_EL2_SYSREG( HPFAR_EL2 ), EL2_SYSREG( SCTLR_EL2, SCTLR_EL1, translate_sctlr ), EL2_SYSREG( CPTR_EL2, CPACR_EL1, translate_cptr ), @@ -167,6 +166,7 @@ struct el2_sysreg_map { EL2_SYSREG( FAR_EL2, FAR_EL1, NULL ), EL2_SYSREG( MAIR_EL2, MAIR_EL1, NULL ), EL2_SYSREG( AMAIR_EL2, AMAIR_EL1, NULL ), + EL2_SYSREG( CNTHCTL_EL2,CNTKCTL_EL1, translate_cnthctl), }; static > + > + /* Get the current version of the EL1 counterpart. */ > + reg = el2_reg->mapping; > + } > + } else { > + /* EL1 register can't be on the CPU if the guest is in vEL2. */ > + if (unlikely(is_hyp_ctxt(vcpu))) > + goto immediate_read; > + } > + > /* > * System registers listed in the switch are not saved on every > * exit from the guest but are only saved on vcpu_put. > @@ -114,6 +245,8 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) > case DACR32_EL2: return read_sysreg_s(SYS_DACR32_EL2); > case IFSR32_EL2: return read_sysreg_s(SYS_IFSR32_EL2); > case DBGVCR32_EL2: return read_sysreg_s(SYS_DBGVCR32_EL2); > + case SP_EL2: return read_sysreg(sp_el1); > + case ELR_EL2: return read_sysreg_el1(SYS_ELR); > } > > immediate_read: > @@ -125,6 +258,34 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) > if (!vcpu->arch.sysregs_loaded_on_cpu) > goto immediate_write; > > + if (unlikely(sysreg_is_el2(reg))) { > + const struct el2_sysreg_map *el2_reg; > + > + if (!is_hyp_ctxt(vcpu)) > + goto immediate_write; > + > + /* Store the EL2 version in the sysregs array. */ > + __vcpu_sys_reg(vcpu, reg) = val; > + > + el2_reg = find_el2_sysreg(nested_sysreg_map, reg); > + if (el2_reg) { > + /* Does this register have an EL1 counterpart? */ > + if (el2_reg->mapping == __INVALID_SYSREG__) > + return; > + > + if (!vcpu_el2_e2h_is_set(vcpu) && > + el2_reg->translate) > + val = el2_reg->translate(val); > + > + /* Redirect this to the EL1 version of the register. */ > + reg = el2_reg->mapping; > + } > + } else { > + /* EL1 register can't be on the CPU if the guest is in vEL2. */ > + if (unlikely(is_hyp_ctxt(vcpu))) > + goto immediate_write; > + } > + > /* > * System registers listed in the switch are not restored on every > * entry to the guest but are only restored on vcpu_load. > @@ -157,6 +318,8 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) > case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); return; > case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); return; > case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); return; > + case SP_EL2: write_sysreg(val, sp_el1); return; > + case ELR_EL2: write_sysreg_el1(val, SYS_ELR); return; > } > > immediate_write: