Re: [PATCH] MIPS: MSA: misaligned support

James Hogan <james.hogan@xxxxxxxxxx> · Wed, 18 Mar 2015 11:27:25 +0000

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to

unaligned

> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it

owns

> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@xxxxxxxxxx>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)
> 
> diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
> index f1df4cb4a286..af2675060244 100644
> --- a/arch/mips/include/asm/processor.h
> +++ b/arch/mips/include/asm/processor.h
> @@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
>  #endif
>  
>  union fpureg {
> +	__u8    val8[FPU_REG_WIDTH / 8];
> +	__u16   val16[FPU_REG_WIDTH / 16];
>  	__u32	val32[FPU_REG_WIDTH / 32];
>  	__u64	val64[FPU_REG_WIDTH / 64];
>  };
> diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
> index 89c22433b1c6..7ab6987cb7d5 100644
> --- a/arch/mips/include/uapi/asm/inst.h
> +++ b/arch/mips/include/uapi/asm/inst.h
> @@ -58,6 +58,7 @@ enum spec_op {
>  	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
>  	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
>  };
> +#define msa_op  mdmx_op
>  
>  /*
>   * func field of spec2 opcode.
> @@ -217,6 +218,14 @@ enum bshfl_func {
>  };
>  
>  /*
> + * func field for MSA MI10 format
> + */
> +enum msa_mi10_func {
> +	msa_ld_op = 8,
> +	msa_st_op = 9,

Most other opcode enumerations in this file are specified in hexadecimal.

> +};
> +
> +/*
>   * (microMIPS) Major opcodes.
>   */
>  enum mm_major_op {
> @@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
>  	;)))))
>  };
>  
> +struct msa_mi10_format {        /* MSA */
> +	__BITFIELD_FIELD(unsigned int opcode : 6,
> +	__BITFIELD_FIELD(signed int s10 : 10,
> +	__BITFIELD_FIELD(unsigned int rs : 5,
> +	__BITFIELD_FIELD(unsigned int wd : 5,
> +	__BITFIELD_FIELD(unsigned int func : 4,
> +	__BITFIELD_FIELD(unsigned int df : 2,
> +	;))))))
> +};
> +
> +
>  /*
>   * microMIPS instruction formats (32-bit length)
>   *
> @@ -884,6 +904,7 @@ union mips_instruction {
>  	struct p_format p_format;
>  	struct f_format f_format;
>  	struct ma_format ma_format;
> +	struct msa_mi10_format msa_mi10_format;
>  	struct b_format b_format;
>  	struct ps_format ps_format;
>  	struct v_format v_format;
> diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
> index 6c160c67984c..5f48f45f81e7 100644
> --- a/arch/mips/kernel/r4k_fpu.S
> +++ b/arch/mips/kernel/r4k_fpu.S
> @@ -13,6 +13,7 @@
>   * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
>   */
>  #include <asm/asm.h>
> +#include <asm/asmmacro.h>
>  #include <asm/errno.h>
>  #include <asm/fpregdef.h>
>  #include <asm/mipsregs.h>
> @@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
>  	END(_restore_fp_context32)
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +
> +	.macro  msa_ld_d    wd, base
> +	ld_d    \wd, 0, \base
> +	jalr    $0, $31

Why not just:
	jr	ra

like every other function in that file? I hope jr would be encoded
correctly on r6 automatically?

> +	  nop

I think a single extra space of indentation for delay slots is the
convention, rather than 2. Same below.

> +	.align  4

doesn't this mean the first one & label might not be suitably aligned.
Would it be better to put this before the ld_d (no need for it after
$w31 case) and putting another .align 4 before the Lmsa_to and Lmsa_from
labels (so the label itself is aligned)?

> +	.endm
> +
> +	.macro  msa_st_d    wd, base
> +	st_d    \wd, 0, \base
> +	jalr    $0, $31
> +	  nop
> +	.align  4

same comments as above.

> +	.endm
> +
> +LEAF(msa_to_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_to
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0

Likewise here, "jr t0"? and same for msa_from_wd

> +	  nop
> +Lmsa_to:
> +	msa_ld_d    0, a1
> +	msa_ld_d    1, a1
> +	msa_ld_d    2, a1
> +	msa_ld_d    3, a1
> +	msa_ld_d    4, a1
> +	msa_ld_d    5, a1
> +	msa_ld_d    6, a1
> +	msa_ld_d    7, a1
> +	msa_ld_d    8, a1
> +	msa_ld_d    9, a1
> +	msa_ld_d    10, a1
> +	msa_ld_d    11, a1
> +	msa_ld_d    12, a1
> +	msa_ld_d    13, a1
> +	msa_ld_d    14, a1
> +	msa_ld_d    15, a1
> +	msa_ld_d    16, a1
> +	msa_ld_d    17, a1
> +	msa_ld_d    18, a1
> +	msa_ld_d    19, a1
> +	msa_ld_d    20, a1
> +	msa_ld_d    21, a1
> +	msa_ld_d    22, a1
> +	msa_ld_d    23, a1
> +	msa_ld_d    24, a1
> +	msa_ld_d    25, a1
> +	msa_ld_d    26, a1
> +	msa_ld_d    27, a1
> +	msa_ld_d    28, a1
> +	msa_ld_d    29, a1
> +	msa_ld_d    30, a1
> +	msa_ld_d    31, a1
> +	.set    pop
> +	END(msa_to_wd)
> +
> +LEAF(msa_from_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_from
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0
> +	  nop
> +Lmsa_from:
> +	msa_st_d    0, a1
> +	msa_st_d    1, a1
> +	msa_st_d    2, a1
> +	msa_st_d    3, a1
> +	msa_st_d    4, a1
> +	msa_st_d    5, a1
> +	msa_st_d    6, a1
> +	msa_st_d    7, a1
> +	msa_st_d    8, a1
> +	msa_st_d    9, a1
> +	msa_st_d    10, a1
> +	msa_st_d    11, a1
> +	msa_st_d    12, a1
> +	msa_st_d    13, a1
> +	msa_st_d    14, a1
> +	msa_st_d    15, a1
> +	msa_st_d    16, a1
> +	msa_st_d    17, a1
> +	msa_st_d    18, a1
> +	msa_st_d    19, a1
> +	msa_st_d    20, a1
> +	msa_st_d    21, a1
> +	msa_st_d    22, a1
> +	msa_st_d    23, a1
> +	msa_st_d    24, a1
> +	msa_st_d    25, a1
> +	msa_st_d    26, a1
> +	msa_st_d    27, a1
> +	msa_st_d    28, a1
> +	msa_st_d    29, a1
> +	msa_st_d    30, a1
> +	msa_st_d    31, a1
> +	.set    pop
> +	END(msa_from_wd)
> +
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	.set	reorder
>  
>  	.type	fault@function
> diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
> index e11906dff885..558f41fa93c5 100644
> --- a/arch/mips/kernel/unaligned.c
> +++ b/arch/mips/kernel/unaligned.c
> @@ -108,6 +108,11 @@ static u32 unaligned_action;
>  #endif
>  extern void show_registers(struct pt_regs *regs);
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +void msa_to_wd(unsigned int wd, union fpureg *from);
> +void msa_from_wd(unsigned int wd, union fpureg *to);
> +#endif
> +
>  #ifdef __BIG_ENDIAN
>  #define     LoadHW(addr, value, res)  \
>  		__asm__ __volatile__ (".set\tnoat\n"        \
> @@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
>  		: "r" (value), "r" (addr), "i" (-EFAULT));
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +#ifdef __BIG_ENDIAN
> +/*
> + * MSA data format conversion.
> + * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
> + * layout contiguosly.

contiguously

> + *
> + * Conversion is done between two Double words and other formats (W/H/B)
> + * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
> + * MSA registers in this format in current->thread.fpu.fpr
> + */
> +static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
> +{
> +	switch (fmt) {
> +	case 0: /* byte */
> +		to->val8[0] = from->val8[7];
> +		to->val8[1] = from->val8[6];
> +		to->val8[2] = from->val8[5];
> +		to->val8[3] = from->val8[4];
> +		to->val8[4] = from->val8[3];
> +		to->val8[5] = from->val8[2];
> +		to->val8[6] = from->val8[1];
> +		to->val8[7] = from->val8[0];
> +		to->val8[8] = from->val8[15];
> +		to->val8[9] = from->val8[14];
> +		to->val8[10] = from->val8[13];
> +		to->val8[11] = from->val8[12];
> +		to->val8[12] = from->val8[11];
> +		to->val8[13] = from->val8[10];
> +		to->val8[14] = from->val8[9];
> +		to->val8[15] = from->val8[8];
> +		break;
> +
> +	case 1: /* halfword */
> +		to->val16[0] = from->val16[3];
> +		to->val16[1] = from->val16[2];
> +		to->val16[2] = from->val16[1];
> +		to->val16[3] = from->val16[0];
> +		to->val16[4] = from->val16[7];
> +		to->val16[5] = from->val16[6];
> +		to->val16[6] = from->val16[5];
> +		to->val16[7] = from->val16[4];
> +		break;
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];

FWIW since the FP/MSA patches that Paul submitted, there are also
working endian agnostic accessors created with BUILD_FPR_ACCESS, which
use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
which should work for 8bit and 16bit sizes too.

I wonder if the compiler would unroll/optimise this sort of thing:
	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
		to_val8[i] = from->val[FPR_IDX(8, i)];

No worries if not.

> +		break;
> +
> +	case 3: /* doubleword, no conversion */
> +		break;

don't you still need to copy the value though?

> +	}
> +}
> +#endif
> +#endif
> +
>  static void emulate_load_store_insn(struct pt_regs *regs,
>  	void __user *addr, unsigned int __user *pc)
>  {
> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  #ifdef	CONFIG_EVA
>  	mm_segment_t seg;
>  #endif
> +#ifdef CONFIG_CPU_HAS_MSA
> +	union fpureg msadatabase[2], *msadata;
> +	unsigned int func, df, rs, wd;
> +#endif
>  	origpc = (unsigned long)pc;
>  	orig31 = regs->regs[31];
>  
> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  			break;
>  		return;
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +	case msa_op:
> +		if (cpu_has_mdmx)
> +			goto sigill;
> +
> +		func = insn.msa_mi10_format.func;
> +		switch (func) {
> +		default:
> +			goto sigbus;
> +
> +		case msa_ld_op:
> +		case msa_st_op:
> +			;
> +		}
> +
> +		if (!thread_msa_context_live())
> +			goto sigbus;

Will this ever happen? (I can't see AdE handler enabling interrupts).

If the MSA context genuinely isn't live (i.e. it can be considered
UNPREDICTABLE), then surely a load operation should still succeed?

> +
> +		df = insn.msa_mi10_format.df;
> +		rs = insn.msa_mi10_format.rs;
> +		wd = insn.msa_mi10_format.wd;
> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));

"* (1 << df)"?
why not just "<< df"?

> +		/* align a working space in stack... */
> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);

Maybe you could just use __aligned(16) on a single local union fpureg.

> +		if (func == msa_ld_op) {
> +			if (!access_ok(VERIFY_READ, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);
> +			res = __copy_from_user_inatomic(msadata, addr, 16);
> +			if (res)
> +				goto fault;
> +			preempt_disable();
> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
> +#else
> +				msa_to_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +#else
> +				current->thread.fpu.fpr[wd] = *msadata;
> +#endif

I'm not a fan of the ifdefs, but i can see its awkward to abstract
msa_convert without causing extra copies (although I don't think its a
critical code path).

> +			}
> +		} else {
> +			if (!access_ok(VERIFY_WRITE, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);

forgot to preempt_disable()?

> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				msa_from_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				*msadata = current->thread.fpu.fpr[wd];

hmm, you could cheat and change this to the following?:
				msadata = &current->thread.fpu.fpr[wd];

> +#endif
> +			}
> +			res = __copy_to_user_inatomic(addr, msadata, 16);
> +			if (res)
> +				goto fault;
> +		}
> +
> +		break;
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	/*
>  	 * COP2 is available to implementor for application specific use.
>  	 * It's up to applications to register a notifier chain and do
> 
> 

Cheers
James

Attachment:
signature.asc

Description: OpenPGP digital signature