On Fri, 15 Oct 2021 17:14:15 +0100,
James Morse <james.morse@xxxxxxx> wrote:
> 
> There are two conflicting use-cases for PBHA at stage2. We could copy
> the stage1 PBHA bits to stage2, this would ensure the VMMs memory is
> exactly reproduced for the guest, including the PBHA bits. The problem
> here is how the VMM's memory is allocated with the PBHA bits set.
> 
> The other is allowing the guest to configure PBHA directly. This would
> allow guest device drivers to map memory with the appropriate PBHA bits.
> This would only be safe if the guest can be trusted to only generate
> PBHA values that only affect performance.
> 
> The arm-arm doesn't describe how the stage1 and stage2 bits are combined.
> Arm's implementations appear to all have the same behaviour, according
> to the TRM: stage2 wins.
> 
> For these CPUs, we can allow a guest to use a PBHA bit by disabling it
> in VTCR_EL2. We just need to know which bits...
> 
> The DT describes the values that only affect performance, but if value-5
> is safe for use, we can't prevent the guest from using value-1 and value-4.
> These 'decomposed' values would also need to be listed as only affecting
> performance.
> 
> Add a cpufeature for CPUs that have this 'stage2 wins' behaviour.
> Decompose each performance-only value (5 -> 5, 4, 1), and check each of
> these values is listed as only affecting performance. If so, the bits
> of the original value (5) can be used by the guest at stage1. (by clearing
> the bits from VTCR_EL2)
> 
> Signed-off-by: James Morse <james.morse@xxxxxxx>
> ---
> I've checked the TRMs for the listed CPUs.
> There are more, I've asked for the TRMs to always describe this.
> ---
>  arch/arm64/include/asm/cpufeature.h |   1 +
>  arch/arm64/include/asm/cputype.h    |   4 ++
>  arch/arm64/kernel/cpufeature.c      | 105 ++++++++++++++++++++++++++++
>  arch/arm64/kernel/image-vars.h      |   3 +
>  arch/arm64/kvm/hyp/pgtable.c        |   8 ++-
>  arch/arm64/tools/cpucaps            |   1 +
>  6 files changed, 120 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index ef6be92b1921..ec800ce15308 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -101,6 +101,7 @@ struct arm64_ftr_reg {
>  };
>  
>  extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0;
> +extern unsigned long arm64_pbha_stage2_safe_bits;
>  
>  /*
>   * CPU capabilities:
> diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
> index 6231e1f0abe7..4d7f18749d23 100644
> --- a/arch/arm64/include/asm/cputype.h
> +++ b/arch/arm64/include/asm/cputype.h
> @@ -73,6 +73,8 @@
>  #define ARM_CPU_PART_CORTEX_A76         0xD0B
>  #define ARM_CPU_PART_NEOVERSE_N1        0xD0C
>  #define ARM_CPU_PART_CORTEX_A77         0xD0D
> +#define ARM_CPU_PART_CORTEX_A78         0xD41
> +#define ARM_CPU_PART_CORTEX_X1          0xD44
>  
>  #define APM_CPU_PART_POTENZA            0x000
>  
> @@ -113,6 +115,8 @@
>  #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
>  #define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
>  #define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
> +#define MIDR_CORTEX_A78 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78)
> +#define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1)
>  #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
>  #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
>  #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
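
Before getting into the cpufeature.c changes below: to check I'm reading
the decomposition rule in the commit message correctly, here is a
throwaway userspace sketch (purely illustrative, obviously not the patch
code). Value 5 decomposes into {1, 4, 5}, and all three must be listed
as performance-only before bits 0 and 2 can be handed to the guest:

#include <stdio.h>

/* All non-zero values that can be built from the bits of @val. */
static unsigned int decompose(unsigned int val)
{
        unsigned int i, mask = 0;

        for (i = 1; i <= 15; i++)
                if ((i & val) == i)
                        mask |= 1U << i;

        return mask;
}

int main(void)
{
        unsigned int needed = decompose(5);     /* values 1, 4 and 5 */
        unsigned int perf_only = (1U << 1) | (1U << 4) | (1U << 5);

        printf("value 5 usable by the guest: %s\n",
               (needed & perf_only) == needed ? "yes" : "no");
        return 0;
}
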
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 88f0f805b938..ad2b3b179ab1 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -77,6 +77,7 @@
>  #include <asm/cpu.h>
>  #include <asm/cpufeature.h>
>  #include <asm/cpu_ops.h>
> +#include <asm/cputype.h>
>  #include <asm/fpsimd.h>
>  #include <asm/insn.h>
>  #include <asm/kvm_host.h>
> @@ -113,6 +114,7 @@ EXPORT_SYMBOL(arm64_use_ng_mappings);
>  
>  unsigned long __ro_after_init arm64_pbha_perf_only_values;
>  EXPORT_SYMBOL(arm64_pbha_perf_only_values);
> +unsigned long __ro_after_init arm64_pbha_stage2_safe_bits;
>  
>  /*
>   * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs
> @@ -1680,13 +1682,50 @@ static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
>  
>  #endif
>  
> +
>  #ifdef CONFIG_ARM64_PBHA
>  static u8 pbha_stage1_enable_bits;
> +static DEFINE_SPINLOCK(pbha_dt_lock);
> +
> +/* For the value 5, return a bitmap with bits 5, 4, and 1 set. */
> +static unsigned long decompose_pbha_values(u8 val)
> +{
> +        int i;
> +        unsigned long mask = 0;
> +
> +        for (i = 1; i <= 15; i++) {
> +                if ((i & val) == i)
> +                        set_bit(i, &mask);
> +        }
> +
> +        return mask;
> +}
> +
> +/*
> + * The bits of a value are safe if all values that can be built from those
> + * enabled bits are listed as only affecting performance.
> + * e.g. 5 would also need 1 and 4 to be listed.
> + *
> + * When there is a conflict with the bits already enabled, the new value is
> + * skipped.
> + * e.g. if 5 already caused bit-0 and bit-2 to be enabled, adding 3 to the list
> + * would need to test 7 as bit-2 is already enabled. If 7 is not listed, 3 is
> + * skipped and bit-1 is not enabled.
> + */
> +static void stage2_test_pbha_value(u8 val)
> +{
> +        unsigned long mask;
> +
> +        mask = decompose_pbha_values(val | arm64_pbha_stage2_safe_bits);
> +        if ((arm64_pbha_perf_only_values & mask) == mask)
> +                arm64_pbha_stage2_safe_bits |= val;
> +}
>  
>  static bool plat_can_use_pbha_stage1(const struct arm64_cpu_capabilities *cap,
>                                       int scope)
>  {
>          u8 val;
> +        static bool dt_check_done;
>          struct device_node *cpus;
>          const u8 *perf_only_vals;
>          int num_perf_only_vals, i;
> @@ -1702,6 +1741,10 @@ static bool plat_can_use_pbha_stage1(const struct arm64_cpu_capabilities *cap,
>          if (scope == SCOPE_LOCAL_CPU)
>                  return true;
>  
> +        spin_lock(&pbha_dt_lock);
> +        if (dt_check_done)
> +                goto out_unlock;
> +
>          cpus = of_find_node_by_path("/cpus");
>          if (!cpus)
>                  goto done;
> @@ -1721,9 +1764,24 @@ static bool plat_can_use_pbha_stage1(const struct arm64_cpu_capabilities *cap,
>                  set_bit(val, &arm64_pbha_perf_only_values);
>          }
>  
> +        /*
> +         * for stage2 the values are collapsed back to 4 bits that can only
> +         * enable values in the arm64_pbha_perf_only_values mask.
> +         */
> +        for (i = 0 ; i < num_perf_only_vals; i++) {
> +                val = perf_only_vals[i];
> +                if (val > 0xf)
> +                        continue;
> +
> +                stage2_test_pbha_value(val);
> +        }
> +
>  done:
>          of_node_put(cpus);
> +        dt_check_done = true;
>  
> +out_unlock:
> +        spin_unlock(&pbha_dt_lock);
>          return !!pbha_stage1_enable_bits;
>  }
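
Walking the loop above through a DT perf-only list of { 1, 4, 5, 3 },
just to check I follow the accumulation in stage2_test_pbha_value():

  val = 1: decompose(1 | 0) = {1}          -> all listed, safe bits = 0b0001
  val = 4: decompose(4 | 1) = {1, 4, 5}    -> all listed, safe bits = 0b0101
  val = 5: decompose(5 | 5) = {1, 4, 5}    -> all listed, safe bits = 0b0101
  val = 3: decompose(3 | 5) = {1, ..., 7}  -> 2, 6 and 7 not listed, 3 skipped

so bit-1 stays under stage2 control, which matches the comment above.
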
> @@ -1743,6 +1801,47 @@ static void cpu_enable_pbha(struct arm64_cpu_capabilities const *cap)
>          isb();
>          local_flush_tlb_all();
>  }
> +
> +/*
> + * PBHA's behaviour is implementation defined, as is the way it combines
> + * stage1 and stage2 attributes. If the kernel has KVM supported, and booted
> + * at EL2, only these CPUs can allow PBHA in a guest, as KVM knows how the PBHA
> + * bits are combined. This prevents the host being affected by some
> + * implementation defined behaviour from the guest.
> + *
> + * The TRM for these CPUs describe stage2 as overriding stage1.
> + */
> +static const struct midr_range pbha_stage2_wins[] = {
> +        MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
> +        MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
> +        MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
> +        MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
> +        {},
> +};
> +
> +static bool plat_can_use_pbha_stage2(const struct arm64_cpu_capabilities *cap,
> +                                     int scope)
> +{
> +        /* Booted at EL2? */
> +        if (!is_hyp_mode_available() && !is_kernel_in_hyp_mode())
> +                return false;
> +
> +        if (!is_midr_in_range_list(read_cpuid_id(), cap->midr_range_list))
> +                return false;
> +
> +        /*
> +         * Calls with scope == SCOPE_LOCAL_CPU need only testing whether this
> +         * cpu has the feature. A later 'system' scope call will check for a
> +         * firmware description.
> +         */
> +        if (scope == SCOPE_LOCAL_CPU)
> +                return true;
> +
> +        if (!__system_matches_cap(ARM64_HAS_PBHA_STAGE1))
> +                return false;
> +
> +        return !!arm64_pbha_stage2_safe_bits;
> +}
>  #endif /* CONFIG_ARM64_PBHA */
>  
>  #ifdef CONFIG_ARM64_AMU_EXTN
> @@ -2418,6 +2517,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>                  .min_field_value = 2,
>                  .cpu_enable = cpu_enable_pbha,
>          },
> +        {
> +                .capability = ARM64_HAS_PBHA_STAGE2,
> +                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +                .matches = plat_can_use_pbha_stage2,
> +                .midr_range_list = pbha_stage2_wins,
> +        },
>  #endif
>          {},
>  };
> diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
> index c96a9a0043bf..d6abdc53f4d9 100644
> --- a/arch/arm64/kernel/image-vars.h
> +++ b/arch/arm64/kernel/image-vars.h
> @@ -134,6 +134,9 @@ KVM_NVHE_ALIAS(__hyp_rodata_end);
>  /* pKVM static key */
>  KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
>  
> +/* PBHA bits for stage2 */
> +KVM_NVHE_ALIAS(arm64_pbha_stage2_safe_bits);

I'm not totally keen on this, see below.

> +
>  #endif /* CONFIG_KVM */
>  
>  #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
> diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
> index 7bd90ea1c61f..c0bfef55a1ff 100644
> --- a/arch/arm64/kvm/hyp/pgtable.c
> +++ b/arch/arm64/kvm/hyp/pgtable.c
> @@ -520,7 +520,7 @@ struct stage2_map_data {
>  u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
>  {
>          u64 vtcr = VTCR_EL2_FLAGS;
> -        u8 lvls;
> +        u8 lvls, pbha = 0xf;
>  
>          vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
>          vtcr |= VTCR_EL2_T0SZ(phys_shift);
> @@ -545,9 +545,13 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
>           * value will always be 0, which is defined as the safe default
>           * setting. On Cortex cores, enabling PBHA for stage2 effectively
>           * disables it for stage1.
> +         * When the HAS_PBHA_STAGE2 feature is supported, clear the 'safe'
> +         * bits to allow the guest's stage1 to use these bits.
>           */
> +        if (cpus_have_final_cap(ARM64_HAS_PBHA_STAGE2))
> +                pbha = pbha ^ arm64_pbha_stage2_safe_bits;
>          if (cpus_have_final_cap(ARM64_HAS_PBHA))
> -                vtcr = FIELD_PREP(VTCR_EL2_PBHA_MASK, 0xf);
> +                vtcr = FIELD_PREP(VTCR_EL2_PBHA_MASK, pbha);

ORing bug aside, this isn't great in the protected case, where we
ultimately don't want to trust a value that is under EL1 control for
page tables.

I'd suggest reusing the hack we have for things like kimage_voffset,
where we generate the value as an immediate that gets inlined.
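
(To be clear about the ORing bug: both the old and the new line assign
the FIELD_PREP() result, throwing away everything accumulated in vtcr
above; I'd expect a |= there.)

As for the immediate: completely untested, and kvm_get_pbha_stage2_safe_bits()
is a name I've just made up, but I'm thinking of something along the lines
of the kimage_voffset callback in va_layout.c:

        /* va_layout.c: patch the value in as a movz/movk sequence */
        void kvm_get_pbha_stage2_safe_bits(struct alt_instr *alt, __le32 *origptr,
                                           __le32 *updptr, int nr_inst)
        {
                generate_mov_q(arm64_pbha_stage2_safe_bits, origptr, updptr, nr_inst);
        }

        /* ... and a helper the page-table code can use instead of the NVHE alias */
        static __always_inline u64 pbha_stage2_safe_bits(void)
        {
                u64 val;

                asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
                                            "movk %0, #0, lsl #16\n"
                                            "movk %0, #0, lsl #32\n"
                                            "movk %0, #0, lsl #48\n",
                                            kvm_get_pbha_stage2_safe_bits)
                             : "=r" (val));

                return val;
        }

with kvm_get_vtcr() then doing something like:

        if (cpus_have_final_cap(ARM64_HAS_PBHA_STAGE2))
                pbha ^= pbha_stage2_safe_bits();
        if (cpus_have_final_cap(ARM64_HAS_PBHA))
                vtcr |= FIELD_PREP(VTCR_EL2_PBHA_MASK, pbha);

so that the value is baked into the text once the alternatives have been
applied, and isn't something EL1 can play with afterwards.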

Thanks,

        M.

-- 
Without deviation from the norm, progress is not possible.