Hi Marc, On Thu, Feb 09, 2023 at 05:58:10PM +0000, Marc Zyngier wrote: > From: Jintack Lim <jintack.lim@xxxxxxxxxx> > > ARM v8.3 introduces a new bit in the HCR_EL2, which is the NV bit. When > this bit is set, accessing EL2 registers in EL1 traps to EL2. In > addition, executing the following instructions in EL1 will trap to EL2: > tlbi, at, eret, and msr/mrs instructions to access SP_EL1. Most of the > instructions that trap to EL2 with the NV bit were undef at EL1 prior to > ARM v8.3. The only instruction that was not undef is eret. > > This patch sets up a handler for EL2 registers and SP_EL1 register > accesses at EL1. The host hypervisor keeps those register values in > memory, and will emulate their behavior. > > This patch doesn't set the NV bit yet. It will be set in a later patch > once nested virtualization support is completed. > > Reviewed-by: Russell King (Oracle) <rmk+kernel@xxxxxxxxxxxxxxx> > Signed-off-by: Jintack Lim <jintack.lim@xxxxxxxxxx> > [maz: EL2_REG() macros] > Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> [..] > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c > index c6cbfe6b854b..1e6ae3b2e6dd 100644 > --- a/arch/arm64/kvm/sys_regs.c > +++ b/arch/arm64/kvm/sys_regs.c > @@ -24,6 +24,7 @@ > #include <asm/kvm_emulate.h> > #include <asm/kvm_hyp.h> > #include <asm/kvm_mmu.h> > +#include <asm/kvm_nested.h> > #include <asm/perf_event.h> > #include <asm/sysreg.h> > > @@ -102,6 +103,18 @@ static u32 get_ccsidr(u32 csselr) > return ccsidr; > } > > +static bool access_rw(struct kvm_vcpu *vcpu, > + struct sys_reg_params *p, > + const struct sys_reg_desc *r) > +{ > + if (p->is_write) > + vcpu_write_sys_reg(vcpu, p->regval, r->reg); > + else > + p->regval = vcpu_read_sys_reg(vcpu, r->reg); > + > + return true; > +} > + > /* > * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). > */ > @@ -260,6 +273,14 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu, > return read_zero(vcpu, p); > } > > +static bool trap_undef(struct kvm_vcpu *vcpu, > + struct sys_reg_params *p, > + const struct sys_reg_desc *r) > +{ > + kvm_inject_undefined(vcpu); > + return false; > +} > + > /* > * ARMv8.1 mandates at least a trivial LORegion implementation, where all the > * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0 > @@ -370,12 +391,9 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, > struct sys_reg_params *p, > const struct sys_reg_desc *r) > { > - if (p->is_write) { > - vcpu_write_sys_reg(vcpu, p->regval, r->reg); > + access_rw(vcpu, p, r); > + if (p->is_write) > vcpu_set_flag(vcpu, DEBUG_DIRTY); > - } else { > - p->regval = vcpu_read_sys_reg(vcpu, r->reg); > - } > > trace_trap_reg(__func__, r->reg, p->is_write, p->regval); > > @@ -1446,6 +1464,24 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, > .visibility = mte_visibility, \ > } > > +static unsigned int el2_visibility(const struct kvm_vcpu *vcpu, > + const struct sys_reg_desc *rd) > +{ > + if (vcpu_has_nv(vcpu)) > + return 0; > + > + return REG_HIDDEN; > +} > + > +#define EL2_REG(name, acc, rst, v) { \ > + SYS_DESC(SYS_##name), \ > + .access = acc, \ > + .reset = rst, \ > + .reg = name, \ > + .visibility = el2_visibility, \ > + .val = v, \ > +} > + > /* sys_reg_desc initialiser for known cpufeature ID registers */ > #define ID_SANITISED(name) { \ > SYS_DESC(SYS_##name), \ > @@ -1490,6 +1526,18 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, > .visibility = raz_visibility, \ > } > > +static bool access_sp_el1(struct kvm_vcpu *vcpu, > + struct sys_reg_params *p, > + const struct sys_reg_desc *r) > +{ > + if (p->is_write) > + __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; > + else > + p->regval = __vcpu_sys_reg(vcpu, SP_EL1); > + > + return true; > +} > + > /* > * Architected system registers. > * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 > @@ -1913,9 +1961,50 @@ static const struct sys_reg_desc sys_reg_descs[] = { > { PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper, > .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 }, > > + EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0), > + EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0), > + EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1), > + EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), > + EL2_REG(HCR_EL2, access_rw, reset_val, 0), > + EL2_REG(MDCR_EL2, access_rw, reset_val, 0), > + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ), > + EL2_REG(HSTR_EL2, access_rw, reset_val, 0), > + EL2_REG(HACR_EL2, access_rw, reset_val, 0), > + > + EL2_REG(TTBR0_EL2, access_rw, reset_val, 0), > + EL2_REG(TTBR1_EL2, access_rw, reset_val, 0), > + EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1), > + EL2_REG(VTTBR_EL2, access_rw, reset_val, 0), > + EL2_REG(VTCR_EL2, access_rw, reset_val, 0), > + > { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, > + EL2_REG(SPSR_EL2, access_rw, reset_val, 0), > + EL2_REG(ELR_EL2, access_rw, reset_val, 0), > + { SYS_DESC(SYS_SP_EL1), access_sp_el1}, > + > { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, > + EL2_REG(AFSR0_EL2, access_rw, reset_val, 0), > + EL2_REG(AFSR1_EL2, access_rw, reset_val, 0), > + EL2_REG(ESR_EL2, access_rw, reset_val, 0), > { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 }, > + > + EL2_REG(FAR_EL2, access_rw, reset_val, 0), > + EL2_REG(HPFAR_EL2, access_rw, reset_val, 0), > + > + EL2_REG(MAIR_EL2, access_rw, reset_val, 0), > + EL2_REG(AMAIR_EL2, access_rw, reset_val, 0), > + > + EL2_REG(VBAR_EL2, access_rw, reset_val, 0), > + EL2_REG(RVBAR_EL2, access_rw, reset_val, 0), > + { SYS_DESC(SYS_RMR_EL2), trap_undef }, > + > + EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), > + EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), > + > + EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), > + EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), > + > + EL2_REG(SP_EL2, NULL, reset_unknown, 0), > }; > > static bool trap_dbgdidr(struct kvm_vcpu *vcpu, > -- > 2.34.1 > > I'm having an issue with this commit where a VCPU is getting a CNTVOFF_EL2 set to 0, so it sees the same time as the host system, and the other VCPU has the correct offset. This is due to the new line added in this commit: EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), Here is an excerpt of the dmesg: [ 0.000000] Booting Linux on physical CPU 0x0000000000 [0x000f0510] [ 0.000000] Linux version 5.14.0-rc1-00011-gc2b59ec4a322-dirty SMP PREEMPT [ 0.000000] Machine model: linux,dummy-virt [ 0.000000] efi: UEFI not found. [ 0.000000] earlycon: ns16550a0 at MMIO 0x0000000001000000 (options '') [ 0.000000] printk: bootconsole [ns16550a0] enabled [ 0.000000] NUMA: No NUMA configuration found [ 0.000000] NUMA: Faking a node at [mem 0x0000000080000000-0x0000000093ffffff] [ 0.000000] NUMA: NODE_DATA [mem 0x93f60c00-0x93f62fff] [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000080000000-0x0000000093ffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal empty [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000080000000-0x0000000093ffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x0000000093ffffff] [ 0.000000] On node 0, zone DMA: 16384 pages in unavailable ranges [ 0.000000] cma: Reserved 32 MiB at 0x0000000091800000 [ 0.000000] psci: probing for conduit method from DT. [ 0.000000] psci: PSCIv1.1 detected in firmware. [ 0.000000] psci: Using standard PSCI v0.2 function IDs [ 0.000000] psci: Trusted OS migration not required [ 0.000000] psci: SMC Calling Convention v1.1 [ 0.000000] smccc: KVM: hypervisor services detected (0x00000000 0x00000000 0x00000000 0x00000003) [ 0.000000] percpu: Embedded 33 pages/cpu s96088 r8192 d30888 u135168 [ 0.000000] Detected PIPT I-cache on CPU0 [ 0.000000] CPU features: detected: Branch Target Identification [ 0.000000] CPU features: detected: GIC system register CPU interface [ 0.000000] CPU features: detected: Spectre-v4 [ 0.000000] CPU features: kernel page table isolation forced ON by KASLR [ 0.000000] CPU features: detected: Kernel page table isolation (KPTI) [..] [ 0.516017] fsl-mc MSI: msic domain created [ 0.544449] EFI services will not be available. [ 0.575385] smp: Bringing up secondary CPUs ... [ 0.658517] smp: Brought up 1 node, 2 CPUs [ 0.849724] SMP: Total of 2 processors activated. [ 0.893825] CPU features: detected: 32-bit EL0 Support [ 0.935321] CPU features: detected: 32-bit EL1 Support [ 0.989719] CPU features: detected: ARMv8.4 Translation Table Level [ 1.049576] CPU features: detected: Data cache clean to the PoU not required for I/D coherence [ 1.108269] CPU features: detected: Common not Private translations [ 1.151863] CPU features: detected: CRC32 instructions [ 1.186088] CPU features: detected: RCpc load-acquire (LDAPR) [ 1.225008] CPU features: detected: LSE atomic instructions [ 1.262173] CPU features: detected: Privileged Access Never [ 1.299687] CPU features: detected: RAS Extension Support [ 1.337388] CPU features: detected: Random Number Generator [ 1.374957] CPU features: detected: Speculation barrier (SB) [ 1.412860] CPU features: detected: Stage-2 Force Write-Back [ 1.458967] CPU features: detected: TLB range maintenance instructions [ 1.503504] CPU features: detected: Speculative Store Bypassing Safe (SSBS) [ 6.740431] CPU: All CPU(s) started at EL1 [ 6.779526] alternatives: patching kernel code [ 1685.522538] devtmpfs: initialized [ 1685.591049] KASLR enabled [ 1685.614942] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns [ 1685.686698] futex hash table entries: 512 (order: 3, 32768 bytes, linear) [ 1685.748071] pinctrl core: initialized pinctrl subsystem [ 1685.809348] DMI not present or invalid. [ 1685.854306] NET: Registered PF_NETLINK/PF_ROUTE protocol family [ 1685.924942] DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations [ 1685.981774] DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocations [ 1686.042387] DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations [ 1686.099864] audit: initializing netlink subsys (disabled) [ 1686.152647] audit: type=2000 audit(5.488:1): state=initialized audit_enabled=0 res=1 [ 7.525335] thermal_sys: Registered thermal governor 'step_wise' [ 7.565113] thermal_sys: Registered thermal governor 'power_allocator' [ 7.609143] cpuidle: using governor menu [ 7.681526] NET: Registered PF_QIPCRTR protocol family [ 7.720877] hw-breakpoint: found 6 breakpoint and 4 watchpoint registers. [ 7.770844] ASID allocator initialised with 32768 entries [ 7.816662] Serial: AMBA PL011 UART driver [ 1686.613910] HugeTLB registered 1.00 GiB page size, pre-allocated 0 pages [ 1686.666303] HugeTLB registered 32.0 MiB page size, pre-allocated 0 pages [ 1686.725358] HugeTLB registered 2.00 MiB page size, pre-allocated 0 pages [ 1686.772979] HugeTLB registered 64.0 KiB page size, pre-allocated 0 pages [..] The flow of execution looks like this: KVM_CREATE_VCPU 0 (VMM) -> kvm_timer_vcpu_init -> update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=kvm_phys_timer_read) KVM_ARM_VCPU_INIT (VMM) -> kvm_arch_vcpu_ioctl_vcpu_init -> kvm_vcpu_set_target -> kvm_reset_vcpu -> kvm_reset_sys_regs (VCPU0.CNTVOFF_EL2=0) KVM_CREATE_VCPU 1 (VMM) -> kvm_timer_vcpu_init -> update_vtimer_cntvoff (VCPU0.CNTVOFF_EL2=VCPU1.CNTVOFF_EL2=kvm_phys_timer_read) KVM_ARM_VCPU_INIT (VMM) -> kvm_arch_vcpu_ioctl_vcpu_init -> kvm_vcpu_set_target -> kvm_reset_vcpu -> kvm_reset_sys_regs (VCPU1.CNTVOFF_EL2=0) At this point VCPU0 has CNTVOFF_EL2 == kvm_phys_timer_read, and VCPU1 has CNTVOFF_EL2 == 0. The VCPUs having different CNTVOFF_EL2 valuess is just a symptom of the fact that CNTVOFF_EL2 is now reset in kvm_reset_sys_regs. This is with linux kvmarm/next [1], kvmtool [2], qemu [3] (I also saw similar behaviour on FVP). qemu-system-aarch64 -M virt \ -machine virtualization=true \ -machine virt,gic-version=3 \ -machine mte=off \ -cpu max,pauth=off -smp 2 -m 12144 \ -drive if=none,format=qcow2,file=disk.img,id=blk0 \ -device virtio-blk-pci,drive=blk0 \ -nographic \ -device virtio-net-pci,netdev=net0 \ -netdev user,id=net0,hostfwd=tcp::8024-:22 \ -kernel ../linux/arch/arm64/boot/Image \ -append "root=/dev/vda2" And then running kvmtool: lkvm run -k Image -p earlycon Hopefully that is all that is needed to reproduce it. The following patch gets it booting again, but I'm sure it's not the right fix. diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 53749d3a0996..1a1abda6af74 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1587,6 +1587,14 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +static void nv_reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + BUG_ON(!r->reg); + BUG_ON(r->reg >= NR_SYS_REGS); + if (vcpu_has_nv(vcpu)) + __vcpu_sys_reg(vcpu, r->reg) = r->val; +} + static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -2190,7 +2198,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0), EL2_REG(TPIDR_EL2, access_rw, reset_val, 0), - EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0), + EL2_REG(CNTVOFF_EL2, access_rw, nv_reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), EL12_REG(SCTLR, access_vm_reg, reset_val, 0x00C50078), Thanks, Joey [1] https://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git/commit/?h=next&id=96a4627dbbd48144a65af936b321701c70876026 [2] e17d182ad3f797f01947fc234d95c96c050c534b [3] 99d6b11b5b44d7dd64f4cb1973184e40a4a174f8