Hi Marc, I would like to use the S1 walker for KVM SPE, and I was planning to move it to a separate file, where it would be shared between nested KVM and SPE. I think this is also good for NV, since the walker would get more testing. Do you think moving it to a shared location is a good approach? Or do you have something else in mind? Also, do you know when you'll be able to send an updated version of this series? I'm asking because I want to decide between using this code (with fixes on top) and waiting for the next iteration. Please don't feel that you need to send the next iteration too soon. And please CC me on the series, so I don't miss it by mistake :) Thanks, Alex On Mon, Jul 08, 2024 at 05:57:58PM +0100, Marc Zyngier wrote: > In order to plug the brokenness of our current AT implementation, > we need a SW walker that is going to... err.. walk the S1 tables > and tell us what it finds. > > Of course, it builds on top of our S2 walker, and share similar > concepts. The beauty of it is that since it uses kvm_read_guest(), > it is able to bring back pages that have been otherwise evicted. > > This is then plugged in the two AT S1 emulation functions as > a "slow path" fallback. I'm not sure it is that slow, but hey. 
> > Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> > --- > arch/arm64/kvm/at.c | 538 ++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 520 insertions(+), 18 deletions(-) > > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c > index 71e3390b43b4c..8452273cbff6d 100644 > --- a/arch/arm64/kvm/at.c > +++ b/arch/arm64/kvm/at.c > @@ -4,9 +4,305 @@ > * Author: Jintack Lim <jintack.lim@xxxxxxxxxx> > */ > > +#include <linux/kvm_host.h> > + > +#include <asm/esr.h> > #include <asm/kvm_hyp.h> > #include <asm/kvm_mmu.h> > > +struct s1_walk_info { > + u64 baddr; > + unsigned int max_oa_bits; > + unsigned int pgshift; > + unsigned int txsz; > + int sl; > + bool hpd; > + bool be; > + bool nvhe; > + bool s2; > +}; > + > +struct s1_walk_result { > + union { > + struct { > + u64 desc; > + u64 pa; > + s8 level; > + u8 APTable; > + bool UXNTable; > + bool PXNTable; > + }; > + struct { > + u8 fst; > + bool ptw; > + bool s2; > + }; > + }; > + bool failed; > +}; > + > +static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool ptw, bool s2) > +{ > + wr->fst = fst; > + wr->ptw = ptw; > + wr->s2 = s2; > + wr->failed = true; > +} > + > +#define S1_MMU_DISABLED (-127) > + > +static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, > + struct s1_walk_result *wr, const u64 va, const int el) > +{ > + u64 sctlr, tcr, tg, ps, ia_bits, ttbr; > + unsigned int stride, x; > + bool va55, tbi; > + > + wi->nvhe = el == 2 && !vcpu_el2_e2h_is_set(vcpu); > + > + va55 = va & BIT(55); > + > + if (wi->nvhe && va55) > + goto addrsz; > + > + wi->s2 = el < 2 && (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_VM); > + > + switch (el) { > + case 1: > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); > + tcr = vcpu_read_sys_reg(vcpu, TCR_EL1); > + ttbr = (va55 ? > + vcpu_read_sys_reg(vcpu, TTBR1_EL1) : > + vcpu_read_sys_reg(vcpu, TTBR0_EL1)); > + break; > + case 2: > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); > + tcr = vcpu_read_sys_reg(vcpu, TCR_EL2); > + ttbr = (va55 ? 
> + vcpu_read_sys_reg(vcpu, TTBR1_EL2) : > + vcpu_read_sys_reg(vcpu, TTBR0_EL2)); > + break; > + default: > + BUG(); > + } > + > + /* Let's put the MMU disabled case aside immediately */ > + if (!(sctlr & SCTLR_ELx_M) || > + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { > + if (va >= BIT(kvm_get_pa_bits(vcpu->kvm))) > + goto addrsz; > + > + wr->level = S1_MMU_DISABLED; > + wr->desc = va; > + return 0; > + } > + > + wi->be = sctlr & SCTLR_ELx_EE; > + > + wi->hpd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HPDS, IMP); > + wi->hpd &= (wi->nvhe ? > + FIELD_GET(TCR_EL2_HPD, tcr) : > + (va55 ? > + FIELD_GET(TCR_HPD1, tcr) : > + FIELD_GET(TCR_HPD0, tcr))); > + > + tbi = (wi->nvhe ? > + FIELD_GET(TCR_EL2_TBI, tcr) : > + (va55 ? > + FIELD_GET(TCR_TBI1, tcr) : > + FIELD_GET(TCR_TBI0, tcr))); > + > + if (!tbi && sign_extend64(va, 55) != (s64)va) > + goto addrsz; > + > + /* Someone was silly enough to encode TG0/TG1 differently */ > + if (va55) { > + wi->txsz = FIELD_GET(TCR_T1SZ_MASK, tcr); > + tg = FIELD_GET(TCR_TG1_MASK, tcr); > + > + switch (tg << TCR_TG1_SHIFT) { > + case TCR_TG1_4K: > + wi->pgshift = 12; break; > + case TCR_TG1_16K: > + wi->pgshift = 14; break; > + case TCR_TG1_64K: > + default: /* IMPDEF: treat any other value as 64k */ > + wi->pgshift = 16; break; > + } > + } else { > + wi->txsz = FIELD_GET(TCR_T0SZ_MASK, tcr); > + tg = FIELD_GET(TCR_TG0_MASK, tcr); > + > + switch (tg << TCR_TG0_SHIFT) { > + case TCR_TG0_4K: > + wi->pgshift = 12; break; > + case TCR_TG0_16K: > + wi->pgshift = 14; break; > + case TCR_TG0_64K: > + default: /* IMPDEF: treat any other value as 64k */ > + wi->pgshift = 16; break; > + } > + } > + > + ia_bits = 64 - wi->txsz; > + > + /* AArch64.S1StartLevel() */ > + stride = wi->pgshift - 3; > + wi->sl = 3 - (((ia_bits - 1) - wi->pgshift) / stride); > + > + /* Check for SL mandating LPA2 (which we don't support yet) */ > + switch (BIT(wi->pgshift)) { > + case SZ_4K: > + if (wi->sl == -1 && > + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN4, 
52_BIT)) > + goto addrsz; > + break; > + case SZ_16K: > + if (wi->sl == 0 && > + !kvm_has_feat(vcpu->kvm, ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)) > + goto addrsz; > + break; > + } > + > + ps = (wi->nvhe ? > + FIELD_GET(TCR_EL2_PS_MASK, tcr) : FIELD_GET(TCR_IPS_MASK, tcr)); > + > + wi->max_oa_bits = min(get_kvm_ipa_limit(), ps_to_output_size(ps)); > + > + /* Compute minimal alignment */ > + x = 3 + ia_bits - ((3 - wi->sl) * stride + wi->pgshift); > + > + wi->baddr = ttbr & TTBRx_EL1_BADDR; > + wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x); > + > + return 0; > + > +addrsz: /* Address Size Fault level 0 */ > + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ, false, false); > + > + return -EFAULT; > +} > + > +static int get_ia_size(struct s1_walk_info *wi) > +{ > + return 64 - wi->txsz; > +} > + > +static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi, > + struct s1_walk_result *wr, u64 va) > +{ > + u64 va_top, va_bottom, baddr, desc; > + int level, stride, ret; > + > + level = wi->sl; > + stride = wi->pgshift - 3; > + baddr = wi->baddr; > + > + va_top = get_ia_size(wi) - 1; > + > + while (1) { > + u64 index, ipa; > + > + va_bottom = (3 - level) * stride + wi->pgshift; > + index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3); > + > + ipa = baddr | index; > + > + if (wi->s2) { > + struct kvm_s2_trans s2_trans = {}; > + > + ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans); > + if (ret) { > + fail_s1_walk(wr, > + (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level, > + true, true); > + return ret; > + } > + > + if (!kvm_s2_trans_readable(&s2_trans)) { > + fail_s1_walk(wr, ESR_ELx_FSC_PERM | level, > + true, true); > + > + return -EPERM; > + } > + > + ipa = kvm_s2_trans_output(&s2_trans); > + } > + > + ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc)); > + if (ret) { > + fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), > + true, false); > + return ret; > + } > + > + if (wi->be) > + desc = be64_to_cpu((__force __be64)desc); > + else > + desc = le64_to_cpu((__force 
__le64)desc); > + > + if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) { > + fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level, > + true, false); > + return -ENOENT; > + } > + > + /* We found a leaf, handle that */ > + if ((desc & 3) == 1 || level == 3) > + break; > + > + if (!wi->hpd) { > + wr->APTable |= FIELD_GET(PMD_TABLE_AP, desc); > + wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc); > + wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc); > + } > + > + baddr = GENMASK_ULL(47, wi->pgshift); > + > + /* Check for out-of-range OA */ > + if (wi->max_oa_bits < 48 && > + (baddr & GENMASK_ULL(47, wi->max_oa_bits))) { > + fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ | level, > + true, false); > + return -EINVAL; > + } > + > + /* Prepare for next round */ > + va_top = va_bottom - 1; > + level++; > + } > + > + /* Block mapping, check the validity of the level */ > + if (!(desc & BIT(1))) { > + bool valid_block = false; > + > + switch (BIT(wi->pgshift)) { > + case SZ_4K: > + valid_block = level == 1 || level == 2; > + break; > + case SZ_16K: > + case SZ_64K: > + valid_block = level == 2; > + break; > + } > + > + if (!valid_block) { > + fail_s1_walk(wr, ESR_ELx_FSC_FAULT | level, > + true, false); > + return -EINVAL; > + } > + } > + > + wr->failed = false; > + wr->level = level; > + wr->desc = desc; > + wr->pa = desc & GENMASK(47, va_bottom); > + if (va_bottom > 12) > + wr->pa |= va & GENMASK_ULL(va_bottom - 1, 12); > + > + return 0; > +} > + > struct mmu_config { > u64 ttbr0; > u64 ttbr1; > @@ -234,6 +530,177 @@ static u64 compute_par_s12(struct kvm_vcpu *vcpu, u64 s1_par, > return par; > } > > +static u64 compute_par_s1(struct kvm_vcpu *vcpu, struct s1_walk_result *wr) > +{ > + u64 par; > + > + if (wr->failed) { > + par = SYS_PAR_EL1_RES1; > + par |= SYS_PAR_EL1_F; > + par |= FIELD_PREP(SYS_PAR_EL1_FST, wr->fst); > + par |= wr->ptw ? SYS_PAR_EL1_PTW : 0; > + par |= wr->s2 ? 
SYS_PAR_EL1_S : 0; > + } else if (wr->level == S1_MMU_DISABLED) { > + /* MMU off or HCR_EL2.DC == 1 */ > + par = wr->pa & GENMASK_ULL(47, 12); > + > + if (!(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_DC)) { > + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, 0); /* nGnRnE */ > + par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b10); /* OS */ > + } else { > + par |= FIELD_PREP(SYS_PAR_EL1_ATTR, > + MEMATTR(WbRaWa, WbRaWa)); > + par |= FIELD_PREP(SYS_PAR_EL1_SH, 0b00); /* NS */ > + } > + } else { > + u64 mair, sctlr; > + int el; > + u8 sh; > + > + el = (vcpu_el2_e2h_is_set(vcpu) && > + vcpu_el2_tge_is_set(vcpu)) ? 2 : 1; > + > + mair = ((el == 2) ? > + vcpu_read_sys_reg(vcpu, MAIR_EL2) : > + vcpu_read_sys_reg(vcpu, MAIR_EL1)); > + > + mair >>= FIELD_GET(PTE_ATTRINDX_MASK, wr->desc) * 8; > + mair &= 0xff; > + > + sctlr = ((el == 2) ? > + vcpu_read_sys_reg(vcpu, SCTLR_EL2) : > + vcpu_read_sys_reg(vcpu, SCTLR_EL1)); > + > + /* Force NC for memory if SCTLR_ELx.C is clear */ > + if (!(sctlr & SCTLR_EL1_C) && !MEMATTR_IS_DEVICE(mair)) > + mair = MEMATTR(NC, NC); > + > + par = FIELD_PREP(SYS_PAR_EL1_ATTR, mair); > + par |= wr->pa & GENMASK_ULL(47, 12); > + > + sh = compute_sh(mair, wr->desc); > + par |= FIELD_PREP(SYS_PAR_EL1_SH, sh); > + } > + > + return par; > +} > + > +static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > +{ > + bool perm_fail, ur, uw, ux, pr, pw, pan; > + struct s1_walk_result wr = {}; > + struct s1_walk_info wi = {}; > + int ret, idx, el; > + > + /* > + * We only get here from guest EL2, so the translation regime > + * AT applies to is solely defined by {E2H,TGE}. > + */ > + el = (vcpu_el2_e2h_is_set(vcpu) && > + vcpu_el2_tge_is_set(vcpu)) ? 
2 : 1; > + > + ret = setup_s1_walk(vcpu, &wi, &wr, vaddr, el); > + if (ret) > + goto compute_par; > + > + if (wr.level == S1_MMU_DISABLED) > + goto compute_par; > + > + idx = srcu_read_lock(&vcpu->kvm->srcu); > + > + ret = walk_s1(vcpu, &wi, &wr, vaddr); > + > + srcu_read_unlock(&vcpu->kvm->srcu, idx); > + > + if (ret) > + goto compute_par; > + > + /* FIXME: revisit when adding indirect permission support */ > + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, PAN, PAN3) && > + !wi.nvhe) { > + u64 sctlr; > + > + if (el == 1) > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1); > + else > + sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL2); > + > + ux = (sctlr & SCTLR_EL1_EPAN) && !(wr.desc & PTE_UXN); > + } else { > + ux = false; > + } > + > + pw = !(wr.desc & PTE_RDONLY); > + > + if (wi.nvhe) { > + ur = uw = false; > + pr = true; > + } else { > + if (wr.desc & PTE_USER) { > + ur = pr = true; > + uw = pw; > + } else { > + ur = uw = false; > + pr = true; > + } > + } > + > + /* Apply the Hierarchical Permission madness */ > + if (wi.nvhe) { > + wr.APTable &= BIT(1); > + wr.PXNTable = wr.UXNTable; > + } > + > + ur &= !(wr.APTable & BIT(0)); > + uw &= !(wr.APTable != 0); > + ux &= !wr.UXNTable; > + > + pw &= !(wr.APTable & BIT(1)); > + > + pan = *vcpu_cpsr(vcpu) & PSR_PAN_BIT; > + > + perm_fail = false; > + > + switch (op) { > + case OP_AT_S1E1RP: > + perm_fail |= pan && (ur || uw || ux); > + fallthrough; > + case OP_AT_S1E1R: > + case OP_AT_S1E2R: > + perm_fail |= !pr; > + break; > + case OP_AT_S1E1WP: > + perm_fail |= pan && (ur || uw || ux); > + fallthrough; > + case OP_AT_S1E1W: > + case OP_AT_S1E2W: > + perm_fail |= !pw; > + break; > + case OP_AT_S1E0R: > + perm_fail |= !ur; > + break; > + case OP_AT_S1E0W: > + perm_fail |= !uw; > + break; > + default: > + BUG(); > + } > + > + if (perm_fail) { > + struct s1_walk_result tmp; > + > + tmp.failed = true; > + tmp.fst = ESR_ELx_FSC_PERM | wr.level; > + tmp.s2 = false; > + tmp.ptw = false; > + > + wr = tmp; > + } > + > +compute_par: > 
+ return compute_par_s1(vcpu, &wr); > +} > + > static bool check_at_pan(struct kvm_vcpu *vcpu, u64 vaddr, u64 *res) > { > u64 par_e0; > @@ -266,9 +733,11 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > struct mmu_config config; > struct kvm_s2_mmu *mmu; > unsigned long flags; > - bool fail; > + bool fail, retry_slow; > u64 par; > > + retry_slow = false; > + > write_lock(&vcpu->kvm->mmu_lock); > > /* > @@ -288,14 +757,15 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > goto skip_mmu_switch; > > /* > - * FIXME: Obtaining the S2 MMU for a L2 is horribly racy, and > - * we may not find it (recycled by another vcpu, for example). > - * See the other FIXME comment below about the need for a SW > - * PTW in this case. > + * Obtaining the S2 MMU for a L2 is horribly racy, and we may not > + * find it (recycled by another vcpu, for example). When this > + * happens, use the SW (slow) path. > */ > mmu = lookup_s2_mmu(vcpu); > - if (WARN_ON(!mmu)) > + if (!mmu) { > + retry_slow = true; > goto out; > + } > > write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR0_EL1), SYS_TTBR0); > write_sysreg_el1(vcpu_read_sys_reg(vcpu, TTBR1_EL1), SYS_TTBR1); > @@ -331,18 +801,17 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > } > > if (!fail) > - par = read_sysreg(par_el1); > + par = read_sysreg_par(); > else > par = SYS_PAR_EL1_F; > > + retry_slow = !fail; > + > vcpu_write_sys_reg(vcpu, par, PAR_EL1); > > /* > - * Failed? let's leave the building now. > - * > - * FIXME: how about a failed translation because the shadow S2 > - * wasn't populated? We may need to perform a SW PTW, > - * populating our shadow S2 and retry the instruction. > + * Failed? let's leave the building now, unless we retry on > + * the slow path. 
> */ > if (par & SYS_PAR_EL1_F) > goto nopan; > @@ -354,29 +823,58 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > switch (op) { > case OP_AT_S1E1RP: > case OP_AT_S1E1WP: > + retry_slow = false; > fail = check_at_pan(vcpu, vaddr, &par); > break; > default: > goto nopan; > } > > + if (fail) { > + vcpu_write_sys_reg(vcpu, SYS_PAR_EL1_F, PAR_EL1); > + goto nopan; > + } > + > /* > * If the EL0 translation has succeeded, we need to pretend > * the AT operation has failed, as the PAN setting forbids > * such a translation. > - * > - * FIXME: we hardcode a Level-3 permission fault. We really > - * should return the real fault level. > */ > - if (fail || !(par & SYS_PAR_EL1_F)) > - vcpu_write_sys_reg(vcpu, (0xf << 1) | SYS_PAR_EL1_F, PAR_EL1); > - > + if (par & SYS_PAR_EL1_F) { > + u8 fst = FIELD_GET(SYS_PAR_EL1_FST, par); > + > + /* > + * If we get something other than a permission fault, we > + * need to retry, as we're likely to have missed in the PTs. > + */ > + if ((fst & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_PERM) > + retry_slow = true; > + } else { > + /* > + * The EL0 access succeded, but we don't have the full > + * syndrom information to synthetize the failure. Go slow. > + */ > + retry_slow = true; > + } > nopan: > __mmu_config_restore(&config); > out: > local_irq_restore(flags); > > write_unlock(&vcpu->kvm->mmu_lock); > + > + /* > + * If retry_slow is true, then we either are missing shadow S2 > + * entries, have paged out guest S1, or something is inconsistent. > + * > + * Either way, we need to walk the PTs by hand so that we can either > + * fault things back, in or record accurate fault information along > + * the way. 
> + */ > + if (retry_slow) { > + par = handle_at_slow(vcpu, op, vaddr); > + vcpu_write_sys_reg(vcpu, par, PAR_EL1); > + } > } > > void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > @@ -433,6 +931,10 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr) > > write_unlock(&vcpu->kvm->mmu_lock); > > + /* We failed the translation, let's replay it in slow motion */ > + if (!fail && (par & SYS_PAR_EL1_F)) > + par = handle_at_slow(vcpu, op, vaddr); > + > vcpu_write_sys_reg(vcpu, par, PAR_EL1); > } > > -- > 2.39.2 > >