Hi, Sebastian,

Attached are the separated patches, from step 1 to step 7.

Thank you very much for your help.

Best Regards,
Huang Ying

On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
> cut Alexander Kjeldaas <astor@xxxxxxx> from CC coz his mails bounce.
>
> * Huang, Ying | 2008-04-16 16:19:09 [+0800]:
>
> >Can you help me test these patches to find out the reason for the
> >degradation on AMD CPUs?
> Sure.
>
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> >
> >> > struct crypto_aes_ctx {
> >> > 	u32 key_length;
> >> >+	u32 _pad1;
> >>
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded in 64-bit chunks in this patch, I want to
> >align the key to a 64-bit address.
>
> Then this won't work all the time. To make it bulletproof:
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()
>
> You might want to take a look at padlock-aes.c. The same thing is done
> there, but instead of crypto_tfm_ctx_aligned() a private function is
> used (to let the compiler optimize most of the code away). Depending on
> Herbert's mood you might get away with this as well (which would
> probably be the case, since you might prefer to do it in asm) :)
>
> >> > 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
> >> > 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
> >> > };
> >> >
> >
> >Best Regards,
> >Huang Ying
>
> Sebastian
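For reference, a minimal sketch of the glue-code approach Sebastian
describes above, assuming the crypto API of this period. Only
.cra_alignmask and crypto_tfm_ctx_aligned() come from his mail; the
function and variable names here are illustrative, not copied from the
real aes-x86_64 glue code:

#include <linux/linkage.h>
#include <crypto/algapi.h>
#include <crypto/aes.h>

asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *dst,
			    const u8 *src);

static void aes_encrypt_aligned(struct crypto_tfm *tfm, u8 *dst,
				const u8 *src)
{
	/*
	 * crypto_tfm_ctx_aligned() rounds the raw context pointer up
	 * to (cra_alignmask + 1) bytes, so the key schedule is 8-byte
	 * aligned no matter how the tfm was allocated.
	 */
	struct crypto_aes_ctx *ctx = crypto_tfm_ctx_aligned(tfm);

	aes_enc_blk(ctx, dst, src);
}

static struct crypto_alg aes_alg = {
	.cra_name	= "aes",
	.cra_alignmask	= 7,	/* ask for 8-byte alignment of the ctx */
	/* ctxsize may need slack for the alignment fixup; see how
	 * padlock-aes.c sizes its context */
	.cra_ctxsize	= sizeof(struct crypto_aes_ctx) + 7,
	/* remaining fields as in the existing glue code */
};

With this in place the _pad1 trick below still helps (it keeps key_enc
itself on an 8-byte boundary inside the struct), but the alignment of
the struct as a whole no longer depends on how it was allocated.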
---
 arch/x86/crypto/aes-x86_64-asm_64.S |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -49,13 +49,17 @@
 #define R9		%r9
 #define R10		%r10
 #define R11		%r11
+#define R12		%r12
+#define R15		%r15
+#define R16		%rsp
 
 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	movq	r1,r2;			\
-	movq	r3,r4;			\
+FUNC:	subq	$8, R16;		\
+	movq	r3, r4;			\
+	movq	r1, (R16);		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
@@ -74,7 +78,8 @@ FUNC:	movq	r1,r2;			\
 	leaq	32(r9),r9;
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
-	movq	r1,r2;			\
+	movq	(R16),r2;		\
+	addq	$8,R16;			\
 	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
 	movl	r6 ## E,4(r9);		\
---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -90,13 +90,13 @@ FUNC:	subq	$8, R16;		\
 #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
 	movzbl	r2 ## H,r5 ## E;	\
 	movzbl	r2 ## L,r6 ## E;	\
+	movq	r4,R8;			\
+	shrl	$16,r4 ## E;		\
 	movl	TAB+1024(,r5,4),r5 ## E;\
-	movw	r4 ## X,r2 ## X;	\
 	movl	TAB(,r6,4),r6 ## E;	\
-	roll	$16,r2 ## E;		\
-	shrl	$16,r4 ## E;		\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
@@ -123,7 +123,7 @@ FUNC:	subq	$8, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
-	shrl	$16,r2 ## E;		\
+	movq	R8,r2;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
---
 arch/x86/crypto/aes-x86_64-asm_64.S |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,13 +57,13 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$8, R16;		\
-	movq	r3, r4;			\
-	movq	r1, (R16);		\
+FUNC:	subq	$16, R16;		\
+	movl	(r7),r5 ## E;		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
-	movl	(r7),r5 ## E;		\
+	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
+	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
@@ -79,11 +79,11 @@ FUNC:	subq	$16, R16;		\
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
 	movq	(R16),r2;		\
-	addq	$8,R16;			\
-	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
+	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
 	movl	r7 ## E,8(r9);		\
+	addq	$16,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;
---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -99,14 +99,14 @@ FUNC:	subq	$16, R16;		\
 	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
+	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
-	movzbl	r1 ## L,r7 ## E;	\
 	movzbl	r1 ## H,r4 ## E;	\
-	movl	TAB+1024(,r4,4),r4 ## E;\
-	movw	r3 ## X,r1 ## X;	\
-	roll	$16,r1 ## E;		\
+	movzbl	r1 ## L,r7 ## E;	\
 	shrl	$16,r3 ## E;		\
+	shrl	$16,r1 ## E;		\
+	movl	TAB+1024(,r4,4),r4 ## E;\
 	xorl	TAB(,r7,4),r5 ## E;	\
 	movzbl	r3 ## H,r7 ## E;	\
 	movzbl	r3 ## L,r3 ## E;	\
@@ -114,7 +114,7 @@ FUNC:	subq	$16, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
-	shrl	$16,r1 ## E;		\
+	movq	R9,r1;			\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 +++++---
 include/crypto/aes.h                |    1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,14 +57,15 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$16, R16;		\
+FUNC:	subq	$24, R16;		\
 	movl	(r7),r5 ## E;		\
-	leaq	BASE+KEY+48+4(r8),r9;	\
+	leaq	BASE+KEY+48+8(r8),r9;	\
 	movq	r10,r11;		\
 	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
 	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
+	movq	R12, 16(R16);		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
@@ -82,8 +83,9 @@ FUNC:	subq	$16, R16;		\
 	movl	r5 ## E,(r9);		\
 	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
+	movq	16(R16),R12;		\
 	movl	r7 ## E,8(r9);		\
-	addq	$16,R16;		\
+	addq	$24,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;

--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
 
 struct crypto_aes_ctx {
 	u32 key_length;
+	u32 _pad1;
 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
 };
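A note on the include/crypto/aes.h hunk above: _pad1 moves key_enc from
offset 4 to offset 8, which is why the prologue's leaq offset grows from
48+4 to 48+8, and it is what lets the following steps fetch round keys
with 64-bit loads. A small build-time check of that layout assumption
could look like this (the helper name is made up; BUILD_BUG_ON and
offsetof are the kernel's own):

#include <linux/kernel.h>
#include <linux/stddef.h>
#include <crypto/aes.h>

static inline void crypto_aes_ctx_layout_check(void)
{
	/*
	 * The asm reads the key schedule with 64-bit loads, so both
	 * arrays must start on an 8-byte boundary within the struct.
	 * The struct itself still has to be 8-byte aligned in memory;
	 * that is the cra_alignmask point from the mail above.
	 */
	BUILD_BUG_ON(offsetof(struct crypto_aes_ctx, key_enc) % 8);
	BUILD_BUG_ON(offsetof(struct crypto_aes_ctx, key_dec) % 8);
}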
---
 arch/x86/crypto/aes-x86_64-asm_64.S |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -98,9 +98,8 @@ FUNC:	subq	$24, R16;		\
 	movl	TAB(,r6,4),r6 ## E;	\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	movq	OFFSET(r8),R12;		\
 	shrl	$16,r2 ## E;		\
-	xorl	OFFSET(r8),ra ## E;	\
-	xorl	OFFSET+4(r8),rb ## E;	\
 	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
@@ -116,7 +115,9 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
+	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
@@ -126,6 +127,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
 	movq	R8,r2;			\
+	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -117,6 +117,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r1 ## L,r3 ## E;	\
 	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	movq	OFFSET+8(r8),R9;	\
 	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
@@ -126,16 +127,17 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
+	xorq	R9,rc;			\
 	movq	R8,r2;			\
+	shrq	$32,R9;			\
 	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
+	xorq	R9,rd;			\
 	movzbl	r2 ## L,r2 ## E;	\
-	xorl	OFFSET+8(r8),rc ## E;	\
-	xorl	OFFSET+12(r8),rd ## E;	\
-	xorl	TAB+1024(,r1,4),r3 ## E;\
-	xorl	TAB(,r2,4),r4 ## E;
+	xorl	TAB(,r2,4),r4 ## E;	\
+	xorl	TAB+1024(,r1,4),r3 ## E;
 
 #define move_regs(r1,r2,r3,r4) \
 	movl	r3 ## E,r1 ## E;	\
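Taken together, steps 6 and 7 turn the four 32-bit round-key XORs into
two 64-bit loads whose halves are applied separately. A rough C model of
that data flow, with illustrative names, assuming an 8-byte-aligned key
schedule on little-endian x86-64:

#include <stdint.h>

static void xor_round_key(uint32_t st[4], const uint32_t *rk)
{
	/* one movq covers rk[0]/rk[1] (movq OFFSET(r8),R12),
	   a second covers rk[2]/rk[3] (movq OFFSET+8(r8),R9) */
	uint64_t k01 = *(const uint64_t *)&rk[0];
	uint64_t k23 = *(const uint64_t *)&rk[2];

	st[0] ^= (uint32_t)k01;		/* xorq R12,ra */
	st[1] ^= (uint32_t)(k01 >> 32);	/* shrq $32,R12; xorq R12,rb */
	st[2] ^= (uint32_t)k23;		/* xorq R9,rc */
	st[3] ^= (uint32_t)(k23 >> 32);	/* shrq $32,R9; xorq R9,rd */
}

The two movq loads are the reason the key schedule must sit on an 8-byte
boundary, which is what the _pad1 field in step 5 arranges.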