On Thu, 18 Jul 2024 16:16:19 +0100, Alexandru Elisei <alexandru.elisei@xxxxxxx> wrote: > > Hi, > > Managed to have a look at AT handling for stage 1, I've been comparing it with > AArch64.AT(). > > On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote: > > In order to plug the brokenness of our current AT implementation, > > we need a SW walker that is going to... err.. walk the S1 tables > > and tell us what it finds. > > > > Of course, it builds on top of our S2 walker, and share similar > > concepts. The beauty of it is that since it uses kvm_read_guest(), > > it is able to bring back pages that have been otherwise evicted. > > > > This is then plugged in the two AT S1 emulation functions as > > a "slow path" fallback. I'm not sure it is that slow, but hey. > > > > Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> > > --- > > arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++-- > > 1 file changed, 520 insertions(+), 18 deletions(-) > > > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c > > index 71e3390b43b4c..8452273cbff6d 100644 > > --- a/arch/arm64/kvm/at.c > > +++ b/arch/arm64/kvm/at.c > > @@ -4,9 +4,305 @@ > > * Author: Jintack Lim <jintack.lim@xxxxxxxxxx> > > */ > > > > +#include <linux/kvm_host.h> > > + > > +#include <asm/esr.h> > > #include <asm/kvm_hyp.h> > > #include <asm/kvm_mmu.h> > > > > +struct s1_walk_info { > > + u64 baddr; > > + unsigned int max_oa_bits; > > + unsigned int pgshift; > > + unsigned int txsz; > > + int sl; > > + bool hpd; > > + bool be; > > + bool nvhe; > > + bool s2; > > +}; > > + > > +struct s1_walk_result { > > + union { > > + struct { > > + u64 desc; > > + u64 pa; > > + s8 level; > > + u8 APTable; > > + bool UXNTable; > > + bool PXNTable; > > + }; > > + struct { > > + u8 fst; > > + bool ptw; > > + bool s2; > > + }; > > + }; > > + bool failed; > > +}; > > + > > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) > > +{ > > + wr->fst = fst; > > + wr->ptw = ptw; > > + wr->s2 = s2; > > + wr->failed = true; > > +} > > + > > +#define S1_MMU_DISABLED (-127) > > + > > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, > > + struct s1_walk_result *wr, const u64 va, const int el) > > +{ > > + u64 sctlr, tcr, tg, ps, ia_bits, ttbr; > > + unsigned int stride, x; > > + bool va55, tbi; > > + > > + wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu); > > + > > + va55 = va & BIT(55); > > + > > + if (wi->nvhe && va55) > > + goto addrsz; > > + > > + wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM); > > + > > + switch (el) { > > + case 1: > > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); > > + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); > > + ttbr = (va55 ? > > + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : > > + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); > > + break; > > + case 2: > > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); > > + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); > > + ttbr = (va55 ? > > + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : > > + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); > > + break; > > + default: > > + BUG(); > > + } > > + > > + /* Let's put the MMU disabled case aside immediately */ > > + if (!(sctlr & SCTLR_ELx_M) || > > + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { > > + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) > > + goto addrsz; > > + > > + wr->level = S1_MMU_DISABLED; > > + wr->desc = va; > > + return 0; > > + } > > + > > + wi->be = sctlr & SCTLR_ELx_EE; > > + > > + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); > > + wi->hpd &= (wi->nvhe ? 
> > + FIELD_GET(TCR_EL2_HPD, tcr) : > > + (va55 ? > > + FIELD_GET(TCR_HPD1, tcr) : > > + FIELD_GET(TCR_HPD0, tcr))); > > + > > + tbi = (wi->nvhe ? > > + FIELD_GET(TCR_EL2_TBI, tcr) : > > + (va55 ? > > + FIELD_GET(TCR_TBI1, tcr) : > > + FIELD_GET(TCR_TBI0, tcr))); > > + > > + if (!tbi && sign_extend64(va, 55) != (s64)va) > > + goto addrsz; > > + > > + /* Someone was silly enough to encode TG0/TG1 differently */ > > + if (va55) { > > + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); > > + tg = FIELD_GET(TCR_TG1_MASK, tcr); > > + > > + switch (tg << TCR_TG1_SHIFT) { > > + case TCR_TG1_4K: > > + wi->pgshift = 12; break; > > + case TCR_TG1_16K: > > + wi->pgshift = 14; break; > > + case TCR_TG1_64K: > > + default: /* IMPDEF: treat any other value as 64k */ > > + wi->pgshift = 16; break; > > + } > > + } else { > > + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); > > + tg = FIELD_GET(TCR_TG0_MASK, tcr); > > + > > + switch (tg << TCR_TG0_SHIFT) { > > + case TCR_TG0_4K: > > + wi->pgshift = 12; break; > > + case TCR_TG0_16K: > > + wi->pgshift = 14; break; > > + case TCR_TG0_64K: > > + default: /* IMPDEF: treat any other value as 64k */ > > + wi->pgshift = 16; break; > > + } > > + } > > + > > + ia_bits = 64 - wi->txsz; > > get_ia_size()? yeah, fair enough. I wasn't sold on using any helper while the walk_info struct is incomplete, but that doesn't change much. > > > + > > + /* AArch64.S1StartLevel() */ > > + stride = wi->pgshift - 3; > > + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); > > + > > + /* Check for SL mandating LPA2 (which we don't support yet) */ > > + switch (BIT(wi->pgshift)) { > > + case SZ_4K: > > + if (wi->sl == -1 && > > + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)) > > + goto addrsz; > > + break; > > + case SZ_16K: > > + if (wi->sl == 0 && > > + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) > > + goto addrsz; > > + break; > > + } > > + > > + ps = (wi->nvhe ? > > + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); > > + > > + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); > > + > > + /* Compute minimal alignment */ > > + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); > > + > > + wi->baddr = ttbr & TTBRx_EL1_BADDR; > > + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); > > + > > + return 0; > > + > > +addrsz: /* Address Size Fault level 0 */ > > + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false); > > + > > + return -EFAULT; > > +} > > The function seems to be missing checks for: > > - valid TxSZ > - VA is not larger than the maximum input size, as defined by TxSZ > - EPD{0,1} Yup, all fixed now, with E0PD{0,1} as an added bonus. The number of ways a translation can fail is amazing. > > > + > > +static int get_ia_size(struct s1_walk_info *wi) > > +{ > > + return 64 - wi->txsz; > > +} > > This looks a lot like get_ia_size() from nested.c. Indeed. Except that the *type* is different. And I really like the fact that they are separate for now. I may end-up merging some of the attributes at some point though. > > > + > > +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, > > + struct s1_walk_result *wr, u64 va) > > +{ > > + u64 va_top, va_bottom, baddr, desc; > > + int level, stride, ret; > > + > > + level = wi->sl; > > + stride = wi->pgshift - 3; > > + baddr = wi->baddr; > > AArch64.S1Walk() also checks that baddr is not larger than the OA size. > check_output_size() from nested.c looks almost like what you want > here. 
At this stage, it is too late, as wi->baddr has already been sanitised by the setup phase. I'll add a check over there. > > > + > > + va_top = get_ia_size(wi) - 1; > > + > > + while (1) { > > + u64 index, ipa; > > + > > + va_bottom = (3 - level) * stride + wi->pgshift; > > + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); > > + > > + ipa = baddr | index; > > + > > + if (wi->s2) { > > + struct kvm_s2_trans s2_trans = {}; > > + > > + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); > > + if (ret) { > > + fail_s1_walk(wr, > > + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, > > + true, true); > > + return ret; > > + } > > + > > + if (!kvm_s2_trans_readable(&s2_trans)) { > > + fail_s1_walk(wr, ESR_ELx_FSC_PERM | level, > > + true, true); > > + > > + return -EPERM; > > + } > > + > > + ipa = kvm_s2_trans_output(&s2_trans); > > + } > > + > > + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); > > + if (ret) { > > + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), > > + true, false); > > + return ret; > > + } > > + > > + if (wi->be) > > + desc = be64_to_cpu((__force __be64)desc); > > + else > > + desc = le64_to_cpu((__force __le64)desc); > > + > > + if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { > > + fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level, > > + true, false); > > + return -ENOENT; > > + } > > + > > + /* We found a leaf, handle that */ > > + if ((desc & 3) == 1 || level == 3) > > + break; > > + > > + if (!wi->hpd) { > > + wr->APTable |= FIELD_GET(PMD_TABLE_AP, desc); > > + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); > > + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); > > + } > > + > > + baddr = GENMASK_ULL(47, wi->pgshift); > > Where is baddr updated with the value read from the descriptor? Am I missing > something obvious here? Huh. Something has gone very wrong, and I have no idea how. This should read: baddr = desc & GENMASK_ULL(47, wi->pgshift); because otherwise nothing makes sense. I must have done a last minute cleanup and somehow broken it. Time to retest everything! > > > + > > + /* Check for out-of-range OA */ > > + if (wi->max_oa_bits < 48 && > > + (baddr & GENMASK_ULL(47, wi->max_oa_bits))) { > > + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level, > > + true, false); > > + return -EINVAL; > > + } > > This looks very much like check_output_size() from nested.c. Yup. I'll fold that into a helper -- still separate from the S2 version though. > > > + > > + /* Prepare for next round */ > > + va_top = va_bottom - 1; > > + level++; > > + } > > + > > + /* Block mapping, check the validity of the level */ > > + if (!(desc & BIT(1))) { > > + bool valid_block = false; > > + > > + switch (BIT(wi->pgshift)) { > > + case SZ_4K: > > + valid_block = level == 1 || level == 2; > > + break; > > + case SZ_16K: > > + case SZ_64K: > > + valid_block = level == 2; > > + break; > > + } > > + > > + if (!valid_block) { > > + fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level, > > + true, false); > > + return -EINVAL; > > + } > > + } > > Matches AArch64.BlockDescSupported(), with the caveat that the walker currently > doesn't support 52 bit PAs. > > > + > > + wr->failed = false; > > + wr->level = level; > > + wr->desc = desc; > > + wr->pa = desc & GENMASK(47, va_bottom); > > No output size check for final PA. Now fixed. 
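To make that concrete, the shape of the fix for the various OA checks is roughly this -- untested sketch only, with the helper name simply mirroring check_output_size() from nested.c as you suggested:

static bool check_output_size(u64 ipa, struct s1_walk_info *wi)
{
	/* Anything above the configured OA range is an Address Size fault */
	return wi->max_oa_bits < 48 && (ipa & GENMASK_ULL(47, wi->max_oa_bits));
}

and in the walker, with the missing 'desc &' restored:

	/* Pick the next-level table address out of the descriptor... */
	baddr = desc & GENMASK_ULL(47, wi->pgshift);

	/* ...and check it against the OA size, just like the table addresses */
	if (check_output_size(baddr, wi)) {
		fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level, true, false);
		return -EINVAL;
	}

with an equivalent call on the final OA before wr->pa is populated.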
> > > + if (va_bottom > 12) > > + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12); > > + > > + return 0; > > +} > > + > > struct mmu_config { > > u64 ttbr0; > > u64 ttbr1; > > @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, > > return par; > > } > > > > +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr) > > +{ > > + u64 par; > > + > > + if (wr->failed) { > > + par = SYS_PAR_EL1_RES1; > > + par |= SYS_PAR_EL1_F; > > + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); > > + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; > > + par |= wr->s2 ? SYS_PAR_EL1_S : 0; > > + } else if (wr->level == S1_MMU_DISABLED) { > > + /* MMU off or HCR_EL2.DC == 1 */ > > + par = wr->pa & GENMASK_ULL(47, 12); > > That's interesting, setup_s1_walk() sets wr->desc = va and leaves wr->pa > unchanged (it's 0 from initialization in handle_at_slow()). If by "interesting" you mean broken, then I agree! > > > + > > + if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { > > + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ > > + par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */ > > + } else { > > + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, > > + MEMATTR(WbRaWa, WbRaWa)); > > + par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */ > > + } > > This matches AArch64.S1DisabledOutput(). > > > + } else { > > + u64 mair, sctlr; > > + int el; > > + u8 sh; > > + > > + el = (vcpu_el2_e2h_is_set(vcpu) && > > + vcpu_el2_tge_is_set(vcpu)) ? 2 : 1; > > + > > + mair = ((el == 2) ? > > + vcpu_read_sys_reg(vcpu, MAIR_EL2) : > > + vcpu_read_sys_reg(vcpu, MAIR_EL1)); > > + > > + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; > > + mair &= 0xff; > > + > > + sctlr = ((el == 2) ? > > + vcpu_read_sys_reg(vcpu, SCTLR_EL2) : > > + vcpu_read_sys_reg(vcpu, SCTLR_EL1)); > > + > > + /* Force NC for memory if SCTLR_ELx.C is clear */ > > + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) > > + mair = MEMATTR(NC, NC); > > This matches the compute memory attributes part of AArch64.S1Translate(). > > > + > > + par = FIELD_PREP(SYS_PAR_EL1_ATTR, mair); > > + par |= wr->pa & GENMASK_ULL(47, 12); > > + > > + sh = compute_sh(mair, wr->desc); > > + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); > > + } > > + > > + return par; > > +} > > + > > +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > > +{ > > + bool perm_fail, ur, uw, ux, pr, pw, pan; > > + struct s1_walk_result wr = {}; > > + struct s1_walk_info wi = {}; > > + int ret, idx, el; > > + > > + /* > > + * We only get here from guest EL2, so the translation regime > > + * AT applies to is solely defined by {E2H,TGE}. > > + */ > > + el = (vcpu_el2_e2h_is_set(vcpu) && > > + vcpu_el2_tge_is_set(vcpu)) ? 2 : 1; > > + > > + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el); > > + if (ret) > > + goto compute_par; > > + > > + if (wr.level == S1_MMU_DISABLED) > > + goto compute_par; > > + > > + idx = srcu_read_lock(&vcpu->kvm->srcu); > > + > > + ret = walk_s1(vcpu, &wi, &wr, vaddr); > > + > > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > > + > > + if (ret) > > + goto compute_par; > > + > > + /* FIXME: revisit when adding indirect permission support */ > > + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) && > > + !wi.nvhe) { > > Just FYI, the 'if' statement fits on one line without going over the old 80 > character limit. All that code has now been reworked. 
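Coming back to the MMU-disabled case flagged above: since compute_par_s1() only looks at wr->pa on that path, the least invasive fix is probably to populate it alongside wr->desc (or, equivalently, to make compute_par_s1() use wr->desc there). Purely as a sketch:

		/* MMU off or HCR_EL2.DC == 1: flat mapping, VA == PA */
		wr->level = S1_MMU_DISABLED;
		wr->desc = va;
		wr->pa = va;
		return 0;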
> > > + u64 sctlr; > > + > > + if (el == 1) > > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); > > + else > > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); > > + > > + ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN); > > I don't understand this. UnprivExecute is true for the memory location if and > only if **SCTLR_ELx.EPAN** && !UXN? Well, it is the only case we actually care about. And from what I read below, you *have* understood it. > > > + } else { > > + ux = false; > > + } > > + > > + pw = !(wr.desc & PTE_RDONLY); > > + > > + if (wi.nvhe) { > > + ur = uw = false; > > + pr = true; > > + } else { > > + if (wr.desc & PTE_USER) { > > + ur = pr = true; > > + uw = pw; > > + } else { > > + ur = uw = false; > > + pr = true; > > + } > > + } > > + > > + /* Apply the Hierarchical Permission madness */ > > + if (wi.nvhe) { > > + wr.APTable &= BIT(1); > > + wr.PXNTable = wr.UXNTable; > > + } > > + > > + ur &= !(wr.APTable & BIT(0)); > > + uw &= !(wr.APTable != 0); > > + ux &= !wr.UXNTable; > > + > > + pw &= !(wr.APTable & BIT(1)); > > Would it make sense here to compute the resulting permissions like in > AArch64.S1DirectBasePermissions()? I.e., look at the AP bits first, have > a switch statement for all 4 values (also makes it very easy to cross-reference > with Table D8-60), then apply hierarchical permissions/pan/epan. I do admit > that I have a very selfish reason to propose this - it makes reviewing easier. > Fair enough. I usually try to distance myself from the pseudocode and implement what I understand, but I appreciate this is just hard to read. It definitely results in something larger, but it probably doesn't matter much. > > + > > + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; > > + > > + perm_fail = false; > > + > > + switch (op) { > > + case OP_AT_S1E1RP: > > + perm_fail |= pan && (ur || uw || ux); > > I had a very hard time understanding what the code is trying to do here. How > about rewriting it to something like the pseudocode below: > > // ux = !(desc and UXN) and !UXNTable > perm_fail |= pan && (ur || uw || ((sctlr & SCTLR_EL1_EPAN) && ux)); > > ... which maps more closely to AArch64.S1DirectBasePermissions(). Yup, I got there by virtue of adopting the same flow as the pseudocode. Thanks a lot for the thorough review, much appreciated. M. -- Without deviation from the norm, progress is not possible.