On Mon, 22 Jan 2024 16:28:06 +0000,
Mark Brown <broonie@xxxxxxxxxx> wrote:
> 
> FEAT_FPMR defines a new EL0 accessible register FPMR used to configure the
> FP8 related features added to the architecture at the same time. Detect
> support for this register and context switch it for EL0 when present.
> 
> Due to the sharing of responsibility for saving floating point state
> between the host kernel and KVM, FP8 support is not yet implemented in KVM
> and a stub similar to that used for SVCR is provided for FPMR in order to
> avoid bisection issues. To make it easier to share host state with the
> hypervisor we store FPMR as a hardened usercopy field in uw (along with
> some padding).
> 
> Signed-off-by: Mark Brown <broonie@xxxxxxxxxx>
> ---
>  arch/arm64/include/asm/cpufeature.h |  5 +++++
>  arch/arm64/include/asm/fpsimd.h     |  2 ++
>  arch/arm64/include/asm/kvm_host.h   |  1 +
>  arch/arm64/include/asm/processor.h  |  4 ++++
>  arch/arm64/kernel/cpufeature.c      |  9 +++++++++
>  arch/arm64/kernel/fpsimd.c          | 13 +++++++++++++
>  arch/arm64/kvm/fpsimd.c             |  1 +
>  arch/arm64/tools/cpucaps            |  1 +
>  8 files changed, 36 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 21c824edf8ce..34fcdbc65d7d 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -768,6 +768,11 @@ static __always_inline bool system_supports_tpidr2(void)
>  	return system_supports_sme();
>  }
>  
> +static __always_inline bool system_supports_fpmr(void)
> +{
> +	return alternative_has_cap_unlikely(ARM64_HAS_FPMR);
> +}
> +
>  static __always_inline bool system_supports_cnp(void)
>  {
>  	return alternative_has_cap_unlikely(ARM64_HAS_CNP);
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index 50e5f25d3024..6cf72b0d2c04 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -89,6 +89,7 @@ struct cpu_fp_state {
>  	void *sve_state;
>  	void *sme_state;
>  	u64 *svcr;
> +	unsigned long *fpmr;
>  	unsigned int sve_vl;
>  	unsigned int sme_vl;
>  	enum fp_type *fp_type;
> @@ -154,6 +155,7 @@ extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused);
>  extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused);
>  extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused);
>  extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused);
> +extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused);
>  
>  extern u64 read_smcr_features(void);
>  
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 21c57b812569..7993694a54af 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -543,6 +543,7 @@ struct kvm_vcpu_arch {
>  	enum fp_type fp_type;
>  	unsigned int sve_max_vl;
>  	u64 svcr;
> +	unsigned long fpmr;

As this directly represents a register, I'd rather you use a type that
represents the size of that register unambiguously (u64).
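That is, something like the below (an untested sketch on my end; the
matching pointer in struct cpu_fp_state would want the same treatment so
that the two declarations stay in sync):

	/* kvm_host.h: FPMR is a 64-bit register, so say so */
	u64 svcr;
	u64 fpmr;

	/* fpsimd.h: and the pointer that tracks it */
	u64 *fpmr;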
> 
>  	/* Stage 2 paging state used by the hardware on next switch */
>  	struct kvm_s2_mmu *hw_mmu;
> diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
> index 5b0a04810b23..b453c66d3fae 100644
> --- a/arch/arm64/include/asm/processor.h
> +++ b/arch/arm64/include/asm/processor.h
> @@ -155,6 +155,8 @@ struct thread_struct {
>  	struct {
>  		unsigned long	tp_value;	/* TLS register */
>  		unsigned long	tp2_value;
> +		unsigned long	fpmr;
> +		unsigned long	pad;
>  		struct user_fpsimd_state fpsimd_state;
>  	} uw;
>  
> @@ -253,6 +255,8 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
>  	BUILD_BUG_ON(sizeof_field(struct thread_struct, uw) !=
>  		     sizeof_field(struct thread_struct, uw.tp_value) +
>  		     sizeof_field(struct thread_struct, uw.tp2_value) +
> +		     sizeof_field(struct thread_struct, uw.fpmr) +
> +		     sizeof_field(struct thread_struct, uw.pad) +
>  		     sizeof_field(struct thread_struct, uw.fpsimd_state));
>  
>  	*offset = offsetof(struct thread_struct, uw);
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index eae59ec0f4b0..0263565f617a 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -272,6 +272,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
>  };
>  
>  static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = {
> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0),
>  	ARM64_FTR_END,
>  };
>  
> @@ -2767,6 +2768,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>  		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
>  		.matches = has_lpa2,
>  	},
> +	{
> +		.desc = "FPMR",
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.capability = ARM64_HAS_FPMR,
> +		.matches = has_cpuid_feature,
> +		.cpu_enable = cpu_enable_fpmr,
> +		ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP)
> +	},
>  	{},
>  };
>  
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index a5dc6f764195..8e24b5e5e192 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -359,6 +359,9 @@ static void task_fpsimd_load(void)
>  	WARN_ON(preemptible());
>  	WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE));
>  
> +	if (system_supports_fpmr())
> +		write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR);
> +
>  	if (system_supports_sve() || system_supports_sme()) {
>  		switch (current->thread.fp_type) {
>  		case FP_STATE_FPSIMD:
> @@ -446,6 +449,9 @@ static void fpsimd_save_user_state(void)
>  	if (test_thread_flag(TIF_FOREIGN_FPSTATE))
>  		return;
>  
> +	if (system_supports_fpmr())
> +		*(last->fpmr) = read_sysreg_s(SYS_FPMR);
> +
>  	/*
>  	 * If a task is in a syscall the ABI allows us to only
>  	 * preserve the state shared with FPSIMD so don't bother
> @@ -688,6 +694,12 @@ static void sve_to_fpsimd(struct task_struct *task)
>  	}
>  }
>  
> +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p)
> +{
> +	write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK,
> +		       SYS_SCTLR_EL1);
> +}
> +
>  #ifdef CONFIG_ARM64_SVE
>  /*
>   * Call __sve_free() directly only if you know task can't be scheduled
> @@ -1680,6 +1692,7 @@ static void fpsimd_bind_task_to_cpu(void)
>  	last->sve_vl = task_get_sve_vl(current);
>  	last->sme_vl = task_get_sme_vl(current);
>  	last->svcr = &current->thread.svcr;
> +	last->fpmr = &current->thread.uw.fpmr;
>  	last->fp_type = &current->thread.fp_type;
>  	last->to_save = FP_STATE_CURRENT;
>  	current->thread.fpsimd_cpu = smp_processor_id();
> diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
> index 8c1d0d4853df..e3e611e30e91 100644
> --- a/arch/arm64/kvm/fpsimd.c
> +++ b/arch/arm64/kvm/fpsimd.c
> @@ -153,6 +153,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
>  		fp_state.sve_vl = vcpu->arch.sve_max_vl;
>  		fp_state.sme_state = NULL;
>  		fp_state.svcr = &vcpu->arch.svcr;
> +		fp_state.fpmr = &vcpu->arch.fpmr;
>  		fp_state.fp_type = &vcpu->arch.fp_type;

Given the number of fields you keep track of, it would make a lot more
sense if these FP-related fields were in their own little structure and
tracked by a single pointer (I don't think there is a case where we
track them independently).
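Something along these lines, maybe (a completely untested sketch; the
name and exact membership of the structure are only illustrative):

	/* One blob of FP control state, shared between host and KVM */
	struct cpu_fp_ctrl {
		enum fp_type	fp_type;
		u64		svcr;
		u64		fpmr;
	};

	struct cpu_fp_state {
		void *sve_state;
		void *sme_state;
		struct cpu_fp_ctrl *ctrl;	/* replaces svcr/fpmr/fp_type */
		unsigned int sve_vl;
		unsigned int sme_vl;
		[...]
	};

kvm_vcpu_arch would then embed a struct cpu_fp_ctrl directly (the host
side would need some care to keep fpmr inside the uw usercopy window),
and the binding code becomes a single assignment:

	fp_state.ctrl = &vcpu->arch.fp_ctrl;

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.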