Hi Marc,

On Tue, Aug 13, 2024 at 11:05:36AM +0100, Marc Zyngier wrote:
> In order to plug the brokenness of our current AT implementation,
> we need a SW walker that is going to... err.. walk the S1 tables
> and tell us what it finds.
>
> Of course, it builds on top of our S2 walker, and share similar
> concepts. The beauty of it is that since it uses kvm_read_guest(),
> it is able to bring back pages that have been otherwise evicted.
>
> This is then plugged in the two AT S1 emulation functions as
> a "slow path" fallback. I'm not sure it is that slow, but hey.
>
> Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx>
> ---
>  arch/arm64/kvm/at.c | 607 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 605 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 9865d29b3149..6d5555e98557 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -4,9 +4,405 @@
>   * Author: Jintack Lim <jintack.lim@xxxxxxxxxx>
>   */
>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/esr.h>
>  #include <asm/kvm_hyp.h>
>  #include <asm/kvm_mmu.h>
>
> +enum trans_regime {
> +	TR_EL10,
> +	TR_EL20,
> +	TR_EL2,
> +};
> +
> +struct s1_walk_info {
> +	u64	baddr;
> +	enum trans_regime regime;
> +	unsigned int	max_oa_bits;
> +	unsigned int	pgshift;
> +	unsigned int	txsz;
> +	int		sl;
> +	bool		hpd;
> +	bool		be;
> +	bool		s2;
> +};
> +
> +struct s1_walk_result {
> +	union {
> +		struct {
> +			u64	desc;
> +			u64	pa;
> +			s8	level;
> +			u8	APTable;
> +			bool	UXNTable;
> +			bool	PXNTable;
> +		};
> +		struct {
> +			u8	fst;
> +			bool	ptw;
> +			bool	s2;
> +		};
> +	};
> +	bool	failed;
> +};
> +
> +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2)
> +{
> +	wr->fst = fst;
> +	wr->ptw = ptw;
> +	wr->s2 = s2;
> +	wr->failed = true;
> +}
> +
> +#define S1_MMU_DISABLED	(-127)
> +
> +static int get_ia_size(struct s1_walk_info *wi)
> +{
> +	return 64 - wi->txsz;
> +}
> +
> +/* Return true if the IPA is out of the OA range */
> +static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
> +{
> +	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
> +}
> +
> +/* Return the translation regime that applies to an AT instruction */
> +static enum trans_regime compute_translation_regime(struct kvm_vcpu *vcpu, u32 op)
> +{
> +	/*
> +	 * We only get here from guest EL2, so the translation
> +	 * regime AT applies to is solely defined by {E2H,TGE}.
> +	 */
> +	switch (op) {
> +	case OP_AT_S1E2R:
> +	case OP_AT_S1E2W:
> +		return vcpu_el2_e2h_is_set(vcpu) ? TR_EL20 : TR_EL2;
> +		break;
> +	default:
> +		return (vcpu_el2_e2h_is_set(vcpu) &&
> +			vcpu_el2_tge_is_set(vcpu)) ? TR_EL20 : TR_EL10;
> +	}
> +}
> +
> +static int setup_s1_walk(struct kvm_vcpu *vcpu, u32 op, struct s1_walk_info *wi,
> +			 struct s1_walk_result *wr, u64 va)
> +{
> +	u64 sctlr, tcr, tg, ps, ia_bits, ttbr;
> +	unsigned int stride, x;
> +	bool va55, tbi, lva, as_el0;
> +
> +	wi->regime = compute_translation_regime(vcpu, op);
> +	as_el0 = (op == OP_AT_S1E0R || op == OP_AT_S1E0W);
> +
> +	va55 = va & BIT(55);
> +
> +	if (wi->regime == TR_EL2 && va55)
> +		goto addrsz;
> +
> +	wi->s2 = (wi->regime == TR_EL10 &&
> +		  (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)));

This could be written on one line if there were a local variable for the
HCR_EL2 register (which is already read multiple times in the function).
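Something like this is what I had in mind - just a sketch, assuming a new
local (hcr, declared at the top of setup_s1_walk()) that the other HCR_EL2
reads below could then reuse:

	u64 hcr = __vcpu_sys_reg(vcpu, HCR_EL2);

	/* The walk is subject to S2 when VM or DC is set for the EL1&0 regime */
	wi->s2 = wi->regime == TR_EL10 && (hcr & (HCR_VM | HCR_DC));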
> +
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
> +		tcr = vcpu_read_sys_reg(vcpu, TCR_EL1);
> +		ttbr = (va55 ?
> +			vcpu_read_sys_reg(vcpu, TTBR1_EL1) :
> +			vcpu_read_sys_reg(vcpu, TTBR0_EL1));
> +		break;
> +	case TR_EL2:
> +	case TR_EL20:
> +		sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2);
> +		tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
> +		ttbr = (va55 ?
> +			vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
> +			vcpu_read_sys_reg(vcpu, TTBR0_EL2));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	tbi = (wi->regime == TR_EL2 ?
> +	       FIELD_GET(TCR_EL2_TBI, tcr) :
> +	       (va55 ?
> +		FIELD_GET(TCR_TBI1, tcr) :
> +		FIELD_GET(TCR_TBI0, tcr)));
> +
> +	if (!tbi && (u64)sign_extend64(va, 55) != va)
> +		goto addrsz;
> +
> +	va = (u64)sign_extend64(va, 55);
> +
> +	/* Let's put the MMU disabled case aside immediately */
> +	switch (wi->regime) {
> +	case TR_EL10:
> +		/*
> +		 * If dealing with the EL1&0 translation regime, 3 things
> +		 * can disable the S1 translation:
> +		 *
> +		 * - HCR_EL2.DC = 1
> +		 * - HCR_EL2.{E2H,TGE} = {0,1}
> +		 * - SCTLR_EL1.M = 0
> +		 *
> +		 * The TGE part is interesting. If we have decided that this
> +		 * is EL1&0, then it means that either {E2H,TGE} == {1,0} or
> +		 * {0,x}, and we only need to test for TGE == 1.
> +		 */
> +		if (__vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_DC | HCR_TGE))
> +			wr->level = S1_MMU_DISABLED;

There's no need to fallthrough and check SCTLR_ELx.M if the MMU disabled
check here is true.

> +		fallthrough;
> +	case TR_EL2:
> +	case TR_EL20:
> +		if (!(sctlr & SCTLR_ELx_M))
> +			wr->level = S1_MMU_DISABLED;
> +		break;
> +	}
> +
> +	if (wr->level == S1_MMU_DISABLED) {
> +		if (va >= BIT(kvm_get_pa_bits(vcpu->kvm)))
> +			goto addrsz;
> +
> +		wr->pa = va;
> +		return 0;
> +	}
> +
> +	wi->be = sctlr & SCTLR_ELx_EE;
> +
> +	wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP);
> +	wi->hpd &= (wi->regime == TR_EL2 ?
> +		    FIELD_GET(TCR_EL2_HPD, tcr) :
> +		    (va55 ?
> +		     FIELD_GET(TCR_HPD1, tcr) :
> +		     FIELD_GET(TCR_HPD0, tcr)));
> +
> +	/* Someone was silly enough to encode TG0/TG1 differently */
> +	if (va55) {
> +		wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG1_MASK, tcr);
> +
> +		switch (tg << TCR_TG1_SHIFT) {
> +		case TCR_TG1_4K:
> +			wi->pgshift = 12; break;
> +		case TCR_TG1_16K:
> +			wi->pgshift = 14; break;
> +		case TCR_TG1_64K:
> +		default:	/* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16; break;
> +		}

Just a thought: wi->pgshift is used in several places to identify the
guest page size, so it might be useful to have something like
PAGE_SHIFT_{4K,16K,64K}. That would also make its usage consistent,
because in some places wi->pgshift is compared directly to 12, 14 or 16,
while in other places the page size is computed from wi->pgshift and
compared to SZ_4K, SZ_16K or SZ_64K.

> +	} else {
> +		wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr);
> +		tg = FIELD_GET(TCR_TG0_MASK, tcr);
> +
> +		switch (tg << TCR_TG0_SHIFT) {
> +		case TCR_TG0_4K:
> +			wi->pgshift = 12; break;
> +		case TCR_TG0_16K:
> +			wi->pgshift = 14; break;
> +		case TCR_TG0_64K:
> +		default:	/* IMPDEF: treat any other value as 64k */
> +			wi->pgshift = 16; break;
> +		}
> +	}
> +
> +	/* R_PLCGL, R_YXNYW */
> +	if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR2_EL1, ST, 48_47)) {
> +		if (wi->txsz > 39)
> +			goto transfault_l0;
> +	} else {
> +		if (wi->txsz > 48 || (wi->pgshift == 16 && wi->txsz > 47))
> +			goto transfault_l0;
> +	}
> +
> +	/* R_GTJBY, R_SXWGM */
> +	switch (BIT(wi->pgshift)) {
> +	case SZ_4K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ?
> +			      TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_16K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT);
> +		lva &= tcr & (wi->regime == TR_EL2 ? TCR_EL2_DS : TCR_DS);
> +		break;
> +	case SZ_64K:
> +		lva = kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, VARange, 52);
> +		break;
> +	}
> +
> +	if ((lva && wi->txsz < 12) || wi->txsz < 16)
> +		goto transfault_l0;

Let's assume lva = true and wi->txsz greater than 12 but smaller than 16,
which is architecturally allowed according to R_GTJBY and
AArch64.S1MinTxSZ():

	(lva && wi->txsz < 12) = false
	wi->txsz < 16 = true

KVM treats it as a fault.
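Maybe make the minimum TxSZ explicit and only relax it to 12 when lva is
set, something along these lines (untested sketch):

	/* AArch64.S1MinTxSZ(): the 52-bit VA configurations allow TxSZ down to 12 */
	if (wi->txsz < (lva ? 12 : 16))
		goto transfault_l0;

That way a TxSZ of 13, 14 or 15 with lva = true is no longer treated as a
translation fault.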
> +
> +	ia_bits = get_ia_size(wi);
> +
> +	/* R_YYVYV, I_THCZK */
> +	if ((!va55 && va > GENMASK(ia_bits - 1, 0)) ||
> +	    (va55 && va < GENMASK(63, ia_bits)))
> +		goto transfault_l0;
> +
> +	/* I_ZFSYQ */
> +	if (wi->regime != TR_EL2 &&
> +	    (tcr & ((va55) ? TCR_EPD1_MASK : TCR_EPD0_MASK)))
> +		goto transfault_l0;

Extra parentheses around va55?

> +
> +	/* R_BNDVG and following statements */
> +	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR2_EL1, E0PD, IMP) &&
> +	    as_el0 && (tcr & ((va55) ? TCR_E0PD1 : TCR_E0PD0)))
> +		goto transfault_l0;

Same here with the extra parentheses around va55.

> +
> +	/* AArch64.S1StartLevel() */
> +	stride = wi->pgshift - 3;
> +	wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride);
> +
> +	ps = (wi->regime == TR_EL2 ?
> +	      FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr));
> +
> +	wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps));
> +
> +	/* Compute minimal alignment */
> +	x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift);
> +
> +	wi->baddr = ttbr & TTBRx_EL1_BADDR;
> +
> +	/* R_VPBBF */
> +	if (check_output_size(wi->baddr, wi))
> +		goto transfault_l0;

I think R_VPBBF says that an Address size fault is generated here, not a
translation fault.

> +
> +	wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
> +
> +	return 0;
> +
> +addrsz:	/* Address Size Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(0), false, false);
> +	return -EFAULT;
> +
> +transfault_l0:	/* Translation Fault level 0 */
> +	fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(0), false, false);
> +	return -EFAULT;
> +}

[..]

> +static bool par_check_s1_perm_fault(u64 par)
> +{
> +	u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);
> +
> +	return ((fst & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM &&
> +		!(par & SYS_PAR_EL1_S));

ESR_ELx_FSC_PERM = 0x0c is a permission fault, level 0, which the Arm ARM
says can only happen with FEAT_LPA2. I think the code should check that
the value for PAR_EL1.FST is in the interval
(ESR_ELx_FSC_PERM_L(0), ESR_ELx_FSC_PERM_L(3)].
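Something like this, perhaps (untested, and it assumes the
ESR_ELx_FSC_PERM_L() encodings are consecutive, so only permission faults
at levels 1 to 3 are accepted):

	static bool par_check_s1_perm_fault(u64 par)
	{
		u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par);

		/* Stage 1 permission fault, levels 1 to 3 only */
		return (fst > ESR_ELx_FSC_PERM_L(0) &&
			fst <= ESR_ELx_FSC_PERM_L(3) &&
			!(par & SYS_PAR_EL1_S));
	}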
With the remaining minor issues fixed:

Reviewed-by: Alexandru Elisei <alexandru.elisei@xxxxxxx>

Thanks,
Alex

> +}
> +
>  void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  {
>  	u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
>
> +	/*
> +	 * If PAR_EL1 reports that AT failed on a S1 permission fault, we
> +	 * know for sure that the PTW was able to walk the S1 tables and
> +	 * there's nothing else to do.
> +	 *
> +	 * If AT failed for any other reason, then we must walk the guest S1
> +	 * to emulate the instruction.
> +	 */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>
> @@ -407,6 +1006,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  		isb();
>  	}
>
> +	/* We failed the translation, let's replay it in slow motion */
> +	if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
> +		par = handle_at_slow(vcpu, op, vaddr);
> +
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
>  }
>
> @@ -463,7 +1066,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
>  	/* Check the access permission */
>  	if (!out.esr &&
>  	    ((!write && !out.readable) || (write && !out.writable)))
> -		out.esr = ESR_ELx_FSC_PERM | (out.level & 0x3);
> +		out.esr = ESR_ELx_FSC_PERM_L(out.level & 0x3);
>
>  	par = compute_par_s12(vcpu, par, &out);
>  	vcpu_write_sys_reg(vcpu, par, PAR_EL1);
> --
> 2.39.2
>