Re: [PATCH 2/9] MIPS: Optimise core library functions for microMIPS.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 9 Apr 2012, Steven J. Hill wrote:

> diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
> index 606c8a9..a0df003 100644
> --- a/arch/mips/lib/memset.S
> +++ b/arch/mips/lib/memset.S
> @@ -26,23 +35,36 @@
>  	.previous
>  
>  	.macro	f_fill64 dst, offset, val, fixup
> -	EX(LONG_S, \val, (\offset +  0 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  1 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  2 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  3 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  4 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  5 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  6 * LONGSIZE)(\dst), \fixup)
> -	EX(LONG_S, \val, (\offset +  7 * LONGSIZE)(\dst), \fixup)
> +#ifdef CONFIG_CPU_MICROMIPS
> +	EX(swp, t8, (\offset + 0 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 1 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 2 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 3 * STORSIZE)(\dst), \fixup)
> +#if LONGSIZE == 4
> +	EX(swp, t8, (\offset + 4 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 5 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 6 * STORSIZE)(\dst), \fixup)
> +	EX(swp, t8, (\offset + 7 * STORSIZE)(\dst), \fixup)
> +#endif

 This looks bogus to me, either define LONG_SP to SWP or SDP (in 
<asm/asm.h>) based on _MIPS_SZLONG or #error on LONGSIZE != 4.  Also it 
would make sense to make this macro respect val in the microMIPS mode too, 
then an appropriate #define to a1/t8 elsewhere in this file will do.  
Otherwise it's pure obfuscation.

> diff --git a/arch/mips/lib/strlen_user.S b/arch/mips/lib/strlen_user.S
> index fdbb970..60fa23b 100644
> --- a/arch/mips/lib/strlen_user.S
> +++ b/arch/mips/lib/strlen_user.S
> @@ -28,9 +29,13 @@ LEAF(__strlen_user_asm)
>  
>  FEXPORT(__strlen_user_nocheck_asm)
>  	move		v0, a0
> -1:	EX(lb, t0, (v0), .Lfault)
> +#ifdef CONFIG_CPU_MICROMIPS
> +1:	EX(lbu16, v1, (v0), .Lfault)
> +#else
> +1:	EX(lb, v1, (v0), .Lfault)
> +#endif
>  	PTR_ADDIU	v0, 1
> -	bnez		t0, 1b
> +	bnez		v1, 1b
>  	PTR_SUBU	v0, a0
>  	jr		ra
>  	END(__strlen_user_asm)

 Perhaps just LBU in all cases?  You'll avoid the ugly #ifdef.

> diff --git a/arch/mips/lib/strncpy_user.S b/arch/mips/lib/strncpy_user.S
> index 7201b2f..bcbb9a0 100644
> --- a/arch/mips/lib/strncpy_user.S
> +++ b/arch/mips/lib/strncpy_user.S
> @@ -30,30 +31,32 @@
>  LEAF(__strncpy_from_user_asm)
>  	LONG_L		v0, TI_ADDR_LIMIT($28)	# pointer ok?
>  	and		v0, a1
> +#ifdef CONFIG_CPU_MICROMIPS
> +	bnezc		v0, .Lfault
> +#else
>  	bnez		v0, .Lfault
> +#endif

 You don't need this, .Lfault is surely within 126 bytes from here so GAS 
will relax the branch to its 16-bit encoding which will give an equivalent 
result.  Relaxing to compact branches is not supported by GAS at the 
moment.

>  FEXPORT(__strncpy_from_user_nocheck_asm)
> -	move		v0, zero
> -	move		v1, a1
>  	.set		noreorder
> -1:	EX(lbu, t0, (v1), .Lfault)
> +	move		t0, zero
> +	move		v1, a1
> +1:	EX(lbu, v0, (v1), .Lfault)
>  	PTR_ADDIU	v1, 1
>  	R10KCBARRIER(0(ra))
> -	beqz		t0, 2f
> -	 sb		t0, (a0)
> -	PTR_ADDIU	v0, 1
> -	.set		reorder
> -	PTR_ADDIU	a0, 1
> -	bne		v0, a2, 1b
> -2:	PTR_ADDU	t0, a1, v0
> -	xor		t0, a1
> -	bltz		t0, .Lfault
> +	beqz		v0, 2f
> +	 sb		v0, (a0)
> +	PTR_ADDIU	t0, 1
> +	bne		t0, a2, 1b
> +	 PTR_ADDIU	a0, 1
> +2:	PTR_ADDU	v0, a1, t0
> +	xor		v0, a1
> +	bltz		v0, .Lfault
> +	 nop
>  	jr		ra			# return n
> +	move		v0, t0
>  	END(__strncpy_from_user_asm)

 You don't need all of this, GAS will schedule these delay slots just fine 
in the microMIPS mode.

> -.Lfault:	li		v0, -EFAULT
> +.Lfault:
>  	jr		ra
> -
> -	.section	__ex_table,"a"
> -	PTR		1b, .Lfault
> -	.previous
> +	 li		v0, -EFAULT

 It looks like an independent bug fix -- submit separately?

> diff --git a/arch/mips/lib/strnlen_user.S b/arch/mips/lib/strnlen_user.S
> index 6445716..9090ced 100644
> --- a/arch/mips/lib/strnlen_user.S
> +++ b/arch/mips/lib/strnlen_user.S
> @@ -26,21 +27,34 @@
>   *       the maximum is a tad hairier ...
>   */
>  LEAF(__strnlen_user_asm)
> +	.set	noreorder
>  	LONG_L		v0, TI_ADDR_LIMIT($28)	# pointer ok?
>  	and		v0, a0
> +#ifdef CONFIG_CPU_MICROMIPS
> +	bnezc		v0, .Lfault
> +#else
>  	bnez		v0, .Lfault
> +#endif

 Again you don't need this, see above.

>  FEXPORT(__strnlen_user_nocheck_asm)
> -	move		v0, a0
>  	PTR_ADDU	a1, a0			# stop pointer
> +	move		v0, a0
>  1:	beq		v0, a1, 1f		# limit reached?
> +	 nop
>  	EX(lb, t0, (v0), .Lfault)
> -	PTR_ADDU	v0, 1
> +#ifdef CONFIG_CPU_MICROMIPS
> +	addius5		v0, 1
> +	bnezc		t0, 1b
> +1:	jr		ra
> +	PTR_SUBU	v0, a0
> +#else
>  	bnez		t0, 1b
> -1:	PTR_SUBU	v0, a0
> -	jr		ra
> +	PTR_ADDU	v0, 1
> +1:      jr              ra
> +	PTR_SUBU        v0, a0
> +#endif
>  	END(__strnlen_user_asm)
>  
>  .Lfault:
> -	move		v0, zero
>  	jr		ra
> +	move		v0, zero

 You don't need all of this if you discard .set noreorder and let GAS 
schedule the instructions and pick their optimal encodings itself.

> diff --git a/arch/mips/mm/page.c b/arch/mips/mm/page.c
> index cc0b626..be71d38 100644
> --- a/arch/mips/mm/page.c
> +++ b/arch/mips/mm/page.c
> @@ -368,6 +360,12 @@ void __cpuinit build_clear_page(void)
>  	for (i = 0; i < (buf - clear_page_array); i++)
>  		pr_debug("\t.word 0x%08x\n", clear_page_array[i]);
>  	pr_debug("\t.set pop\n");
> +#ifdef CONFIG_CPU_MICROMIPS
> +	memcpy(((u8 *)clear_page) - 1, clear_page_array,
> +		ARRAY_SIZE(clear_page_array) * 4);
> +#else
> +	memcpy(clear_page, clear_page_array, ARRAY_SIZE(clear_page_array) * 4);
> +#endif
>  }
>  
>  static void __cpuinit build_copy_load(u32 **buf, int reg, int off)
> @@ -607,6 +605,12 @@ void __cpuinit build_copy_page(void)
>  	for (i = 0; i < (buf - copy_page_array); i++)
>  		pr_debug("\t.word 0x%08x\n", copy_page_array[i]);
>  	pr_debug("\t.set pop\n");
> +#ifdef CONFIG_CPU_MICROMIPS
> +	memcpy(((u8 *)copy_page) - 1, copy_page_array,
> +		ARRAY_SIZE(copy_page_array) * 4);
> +#else
> +	memcpy(copy_page, copy_page_array, ARRAY_SIZE(copy_page_array) * 4);
> +#endif
>  }
>  
>  #ifdef CONFIG_SIBYTE_DMA_PAGEOPS

 I suggest that instead of the hacks above you arrange for an alias to 
copy_page with the ISA bit clear.  You won't need these changes then (and 
will get slightly better machine code).

  Maciej



[Index of Archives]     [Linux MIPS Home]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Linux]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux