Improve AES/XTS performance of 6-way unrolling for PowerPC up to 17% with tcrypt. This is done by using one instruction, vpermxor, to replace xor and vsldoi. This patch has been tested with the kernel crypto module tcrypt.ko and has passed the selftest. The patch is also tested with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled. Signed-off-by: Danny Tsen <dtsen@xxxxxxxxxxxxx> --- drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 49 deletions(-) diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl index 50a0a18f35da..f729589d792e 100644 --- a/drivers/crypto/vmx/aesp8-ppc.pl +++ b/drivers/crypto/vmx/aesp8-ppc.pl @@ -132,11 +132,12 @@ rcon: .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev .long 0,0,0,0 ?asis +.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe Lconsts: mflr r0 bcl 20,31,\$+4 mflr $ptr #vvvvv "distance between . and rcon - addi $ptr,$ptr,-0x48 + addi $ptr,$ptr,-0x58 mtlr r0 blr .long 0 @@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x: li $x70,0x70 mtspr 256,r0 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR Lconsts. + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -2537,69 +2549,77 @@ Load_xts_enc_key: ?vperm v31,v31,$twk5,$keyperm lvx v25,$x10,$key_ # pre-load round[2] + # Switch to use the following codes with 0x010101..87 to generate tweak. + # eighty7 = 0x010101..87 + # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits + # vand tmp, tmp, eighty7 # last byte with carry + # vaddubm tweak, tweak, tweak # left shift 1 bit (x2) + # xxlor vsx, 0, 0 + # vpermxor tweak, tweak, tmp, vsx + vperm $in0,$inout,$inptail,$inpperm subi $inp,$inp,31 # undo "caller" vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. $taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 vxor v31,v31,$rndkey0 mtctr $rounds @@ -2625,6 +2645,8 @@ Loop_xts_enc6x: lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_enc6x + xxlor 32+$eighty7, 1, 1 # 0x010101..87 + subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vcipher $out0,$out0,v24 @@ -2634,7 +2656,6 @@ Loop_xts_enc6x: vaddubm $tweak,$tweak,$tweak vcipher $out2,$out2,v24 vcipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v24 vcipher $out5,$out5,v24 @@ -2642,7 +2663,8 @@ Loop_xts_enc6x: vand $tmp,$tmp,$eighty7 vcipher $out0,$out0,v25 vcipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 vcipher $out2,$out2,v25 vcipher $out3,$out3,v25 vxor $in1,$twk1,v31 @@ -2653,13 +2675,13 @@ Loop_xts_enc6x: and r0,r0,$len vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v26 vcipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v26 vcipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 vcipher $out4,$out4,v26 vcipher $out5,$out5,v26 @@ -2673,7 +2695,6 @@ Loop_xts_enc6x: vaddubm $tweak,$tweak,$tweak vcipher $out0,$out0,v27 vcipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 vcipher $out2,$out2,v27 vcipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 @@ -2681,7 +2702,8 @@ Loop_xts_enc6x: vcipher $out5,$out5,v27 addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 vcipher $out0,$out0,v28 vcipher $out1,$out1,v28 vxor $in3,$twk3,v31 @@ -2690,7 +2712,6 @@ Loop_xts_enc6x: vcipher $out2,$out2,v28 vcipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out4,$out4,v28 vcipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] @@ -2698,7 +2719,8 @@ Loop_xts_enc6x: vcipher $out0,$out0,v29 vcipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 vcipher $out2,$out2,v29 vcipher $out3,$out3,v29 vxor $in4,$twk4,v31 @@ -2708,14 +2730,14 @@ Loop_xts_enc6x: vcipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipher $out0,$out0,v30 vcipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vcipher $out2,$out2,v30 vcipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 vcipher $out4,$out4,v30 vcipher $out5,$out5,v30 vxor $in5,$twk5,v31 @@ -2725,7 +2747,6 @@ Loop_xts_enc6x: vcipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vcipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vcipherlast $out2,$out2,$in2 @@ -2738,7 +2759,10 @@ Loop_xts_enc6x: vcipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp + xxlor 10, 32+$in0, 32+$in0 + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 + xxlor 32+$in0, 10, 10 vcipherlast $tmp,$out5,$in5 # last block might be needed # in stealing mode le?vperm $in3,$in3,$in3,$leperm @@ -2771,6 +2795,8 @@ Loop_xts_enc6x: mtctr $rounds beq Loop_xts_enc6x # did $len-=96 borrow? + xxlor 32+$eighty7, 2, 2 # 0x010101..87 + addic. $len,$len,0x60 beq Lxts_enc6x_zero cmpwi $len,0x20 @@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x: li $x70,0x70 mtspr 256,r0 + xxlor 2, 32+$eighty7, 32+$eighty7 + vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87 + xxlor 1, 32+$eighty7, 32+$eighty7 + + # Load XOR Lconsts. + mr $x70, r6 + bl Lconsts + lxvw4x 0, $x40, r6 # load XOR contents + mr r6, $x70 + li $x70,0x70 + subi $rounds,$rounds,3 # -4 in total lvx $rndkey0,$x00,$key1 # load key schedule @@ -3194,64 +3231,64 @@ Load_xts_dec_key: vxor $twk0,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vand $tmp,$tmp,$eighty7 vxor $out0,$in0,$twk0 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 lvx_u $in1,$x10,$inp vxor $twk1,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in1,$in1,$in1,$leperm vand $tmp,$tmp,$eighty7 vxor $out1,$in1,$twk1 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 lvx_u $in2,$x20,$inp andi. $taillen,$len,15 vxor $twk2,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in2,$in2,$in2,$leperm vand $tmp,$tmp,$eighty7 vxor $out2,$in2,$twk2 - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 lvx_u $in3,$x30,$inp sub $len,$len,$taillen vxor $twk3,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in3,$in3,$in3,$leperm vand $tmp,$tmp,$eighty7 vxor $out3,$in3,$twk3 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 lvx_u $in4,$x40,$inp subi $len,$len,0x60 vxor $twk4,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in4,$in4,$in4,$leperm vand $tmp,$tmp,$eighty7 vxor $out4,$in4,$twk4 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 lvx_u $in5,$x50,$inp addi $inp,$inp,0x60 vxor $twk5,$tweak,$rndkey0 vsrab $tmp,$tweak,$seven # next tweak value vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 le?vperm $in5,$in5,$in5,$leperm vand $tmp,$tmp,$eighty7 vxor $out5,$in5,$twk5 - vxor $tweak,$tweak,$tmp + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 vxor v31,v31,$rndkey0 mtctr $rounds @@ -3277,6 +3314,8 @@ Loop_xts_dec6x: lvx v25,$x10,$key_ # round[4] bdnz Loop_xts_dec6x + xxlor 32+$eighty7, 1, 1 # 0x010101..87 + subic $len,$len,96 # $len-=96 vxor $in0,$twk0,v31 # xor with last round key vncipher $out0,$out0,v24 @@ -3286,7 +3325,6 @@ Loop_xts_dec6x: vaddubm $tweak,$tweak,$tweak vncipher $out2,$out2,v24 vncipher $out3,$out3,v24 - vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v24 vncipher $out5,$out5,v24 @@ -3294,7 +3332,8 @@ Loop_xts_dec6x: vand $tmp,$tmp,$eighty7 vncipher $out0,$out0,v25 vncipher $out1,$out1,v25 - vxor $tweak,$tweak,$tmp + xxlor 32+$in1, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in1 vncipher $out2,$out2,v25 vncipher $out3,$out3,v25 vxor $in1,$twk1,v31 @@ -3305,13 +3344,13 @@ Loop_xts_dec6x: and r0,r0,$len vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v26 vncipher $out1,$out1,v26 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v26 vncipher $out3,$out3,v26 - vxor $tweak,$tweak,$tmp + xxlor 32+$in2, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in2 vncipher $out4,$out4,v26 vncipher $out5,$out5,v26 @@ -3325,7 +3364,6 @@ Loop_xts_dec6x: vaddubm $tweak,$tweak,$tweak vncipher $out0,$out0,v27 vncipher $out1,$out1,v27 - vsldoi $tmp,$tmp,$tmp,15 vncipher $out2,$out2,v27 vncipher $out3,$out3,v27 vand $tmp,$tmp,$eighty7 @@ -3333,7 +3371,8 @@ Loop_xts_dec6x: vncipher $out5,$out5,v27 addi $key_,$sp,$FRAME+15 # rewind $key_ - vxor $tweak,$tweak,$tmp + xxlor 32+$in3, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in3 vncipher $out0,$out0,v28 vncipher $out1,$out1,v28 vxor $in3,$twk3,v31 @@ -3342,7 +3381,6 @@ Loop_xts_dec6x: vncipher $out2,$out2,v28 vncipher $out3,$out3,v28 vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out4,$out4,v28 vncipher $out5,$out5,v28 lvx v24,$x00,$key_ # re-pre-load round[1] @@ -3350,7 +3388,8 @@ Loop_xts_dec6x: vncipher $out0,$out0,v29 vncipher $out1,$out1,v29 - vxor $tweak,$tweak,$tmp + xxlor 32+$in4, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in4 vncipher $out2,$out2,v29 vncipher $out3,$out3,v29 vxor $in4,$twk4,v31 @@ -3360,14 +3399,14 @@ Loop_xts_dec6x: vncipher $out5,$out5,v29 lvx v25,$x10,$key_ # re-pre-load round[2] vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipher $out0,$out0,v30 vncipher $out1,$out1,v30 vand $tmp,$tmp,$eighty7 vncipher $out2,$out2,v30 vncipher $out3,$out3,v30 - vxor $tweak,$tweak,$tmp + xxlor 32+$in5, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in5 vncipher $out4,$out4,v30 vncipher $out5,$out5,v30 vxor $in5,$twk5,v31 @@ -3377,7 +3416,6 @@ Loop_xts_dec6x: vncipherlast $out0,$out0,$in0 lvx_u $in0,$x00,$inp # load next input block vaddubm $tweak,$tweak,$tweak - vsldoi $tmp,$tmp,$tmp,15 vncipherlast $out1,$out1,$in1 lvx_u $in1,$x10,$inp vncipherlast $out2,$out2,$in2 @@ -3390,7 +3428,10 @@ Loop_xts_dec6x: vncipherlast $out4,$out4,$in4 le?vperm $in2,$in2,$in2,$leperm lvx_u $in4,$x40,$inp - vxor $tweak,$tweak,$tmp + xxlor 10, 32+$in0, 32+$in0 + xxlor 32+$in0, 0, 0 + vpermxor $tweak, $tweak, $tmp, $in0 + xxlor 32+$in0, 10, 10 vncipherlast $out5,$out5,$in5 le?vperm $in3,$in3,$in3,$leperm lvx_u $in5,$x50,$inp @@ -3421,6 +3462,8 @@ Loop_xts_dec6x: mtctr $rounds beq Loop_xts_dec6x # did $len-=96 borrow? + xxlor 32+$eighty7, 2, 2 # 0x010101..87 + addic. $len,$len,0x60 beq Lxts_dec6x_zero cmpwi $len,0x20 -- 2.31.1