On 2024/6/28 18:12, Herbert Xu wrote:
> On Fri, Jun 28, 2024 at 04:41:17PM +0800, WangYuli wrote:
>> When entering the "len & sizeof(u32)" branch, len must be less than 8.
>> So after one operation, len must be less than 4.
>> At this time, "len -= sizeof(u32)" is not necessary for 64-bit CPUs.
>>
>> A similar issue has already been fixed for LoongArch.
>>
>> Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.10-rc5&id=fea1c949f6ca5059e12de00d0483645debc5b206
>> Signed-off-by: Guan Wentao <guanwentao@xxxxxxxxxxxxx>
>> Signed-off-by: WangYuli <wangyuli@xxxxxxxxxxxxx>
>> ---
>> arch/mips/crypto/crc32-mips.c | 4 ++++
>> 1 file changed, 4 insertions(+)
>>
>> diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
>> index ec6d58008f8e..505d2d897849 100644
>> --- a/arch/mips/crypto/crc32-mips.c
>> +++ b/arch/mips/crypto/crc32-mips.c
>> @@ -94,7 +94,9 @@ static u32 crc32_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
>>
>> CRC32(crc, value, w);
>> p += sizeof(u32);
>> +#ifndef CONFIG_64BIT
>> len -= sizeof(u32);
>> +#endif
>
> First of all, did you verify that this actually makes a difference?
> Please post the actual assembly output with and without this patch.
Sure.
The left column shows the assembly after applying this patch, while the
right column shows the original (both generated by Clang 17.0.6).
0000000000000018 <chksum_update>: 0000000000000018 <chksum_update>:
; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); ;
ctx->crc = crc32_mips_le_hw(ctx->crc, data, length);
18: 08 00 82 8c lw $2, 0x8($4) 18: 08 00 82 8c lw
$2, 0x8($4)
; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) {
1c: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 1c:
08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8>
20: 06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24> 20:
06 00 20 f8 bnezc $1, 0x3c <chksum_update+0x24>
; return le64_to_cpu(__get_unaligned_t(__le64, p)); ; return
le64_to_cpu(__get_unaligned_t(__le64, p));
24: 00 00 a3 dc ld $3, 0x0($5) 24: 00 00 a3 dc ld
$3, 0x0($5)
; CRC32(crc, value, d); ; CRC32(crc, value, d);
28: cf 00 62 7c <unknown> crc32d v0,v1,v0 28: cf 00 62
7c <unknown> crc32d v0,v1,v0
; len -= sizeof(u64); ; len -= sizeof(u64);
2c: f8 ff c6 24 addiu $6, $6, -0x8
<chksumc_digest+0xfffffffffffffce0> 2c: f8 ff c6 24 addiu $6,
$6, -0x8 <chksumc_digest+0xfffffffffffffd48>
; while (len >= sizeof(u64)) { ; while (len >= sizeof(u64)) {
30: 08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8> 30:
08 00 c1 2c sltiu $1, $6, 0x8 <chksum_init+0x8>
34: fb ff 20 10 beqz $1, 0x24 <chksum_update+0xc> 34: fb
ff 20 10 beqz $1, 0x24 <chksum_update+0xc>
38: 08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8> 38:
08 00 a5 64 daddiu $5, $5, 0x8 <chksum_init+0x8>
; if (len & sizeof(u32)) { ; if (len & sizeof(u32)) {
3c: 04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4> 3c:
04 00 c1 2c sltiu $1, $6, 0x4 <chksum_init+0x4>
40: 0a 00 20 10 beqz $1, 0x6c <chksum_update+0x54> 40: 04
00 20 f8 bnezc $1, 0x54 <chksum_update+0x3c>
44: 03 f8 c3 7c dext $3, $6, 0x0, 0x20
<chksum_update+0x8> ; return le32_to_cpu(__get_unaligned_t(__le32, p));
; if (len & sizeof(u16)) { 44: 00 00 a3 8c lw $3, 0x0($5)
48: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2>
; CRC32(crc, value, w);
4c: 0c 00 20 f8 bnezc $1, 0x80 <chksum_update+0x68>
; if (len & sizeof(u8)) { 48: 8f 00 62 7c <unknown> crc32w
v0,v1,v0
50: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1>
54: 02 00 20 d8 beqzc $1, 0x60 <chksum_update+0x48>
; len -= sizeof(u32);
; CRC32(crc, value, b); 4c: fc ff c6 24 addiu $6, $6,
-0x4 <chksumc_digest+0xfffffffffffffd4c>
58: 00 00 a3 90 lbu $3, 0x0($5) ; p += sizeof(u32);
50: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4>
5c: 0f 00 62 7c <unknown> crc32b v0,v1,v0 ; if (len &
sizeof(u16)) {
54: 03 f8 c3 7c dext $3, $6, 0x0, 0x20 <chksum_update+0x8>
; ctx->crc = crc32_mips_le_hw(ctx->crc, data, length); 58: 02 00 61
30 andi $1, $3, 0x2 <chksum_init+0x2>
60: 08 00 82 ac sw $2, 0x8($4) 5c: 03 00 20 d8
beqzc $1, 0x6c <chksum_update+0x54>
; return 0; ; CRC32(crc, value, h);
64: 09 00 e0 03 jr $ra 60: 00 00 a6 94 lhu $6, 0x0($5)
68: 00 00 02 64 daddiu $2, $zero, 0x0 <chksum_init>
; return le32_to_cpu(__get_unaligned_t(__le32, p)); 64: 4f 00 c2
7c <unknown> crc32h v0,a2,v0
6c: 00 00 a6 8c lw $6, 0x0($5)
; CRC32(crc, value, w); ; p += sizeof(u16);
70: 8f 00 c2 7c <unknown> crc32w v0,a2,v0 68: 02 00 a5
64 daddiu $5, $5, 0x2 <chksum_init+0x2>
; if (len & sizeof(u16)) { ; if (len & sizeof(u8)) {
74: 02 00 61 30 andi $1, $3, 0x2 <chksum_init+0x2> 6c: 01
00 61 30 andi $1, $3, 0x1 <chksum_init+0x1>
78: f5 ff 20 10 beqz $1, 0x50 <chksum_update+0x38> 70: 02
00 20 d8 beqzc $1, 0x7c <chksum_update+0x64>
7c: 04 00 a5 64 daddiu $5, $5, 0x4 <chksum_init+0x4>
; CRC32(crc, value, h); ; CRC32(crc, value, b);
80: 00 00 a6 94 lhu $6, 0x0($5) 74: 00 00 a3 90
lbu $3, 0x0($5)
84: 4f 00 c2 7c <unknown> crc32h v0,a2,v0 78: 0f 00 62
7c <unknown> crc32b v0,v1,v0
; if (len & sizeof(u8)) { ; ctx->crc =
crc32_mips_le_hw(ctx->crc, data, length);
88: 01 00 61 30 andi $1, $3, 0x1 <chksum_init+0x1> 7c: 08
00 82 ac sw $2, 0x8($4)
8c: f4 ff 20 10 beqz $1, 0x60 <chksum_update+0x48> ;
return 0;
90: 02 00 a5 64 daddiu $5, $5, 0x2 <chksum_init+0x2> 80:
09 00 e0 03 jr $ra
94: 00 00 00 08 j 0x0 <chksum_init> 84: 00 00 02 64
daddiu $2, $zero, 0x0 <chksum_init>
In our testing, this assignment affects Clang's code expansion and
instruction reordering. The redundant assignment confuses Clang and
prevents it from generating optimized assembly code.
I extracted the 'crc32_mips_le_hw()' function into a user-mode demo to
analyze the assembly code generated for it on MIPS64.
Link: https://godbolt.org/z/r4dGbhTGf
As you can see, regardless of the Clang or GCC version, this redundant
operation affects the generated assembly code.
>
> If it does make a difference, you should avoid doing ifdefs as they
> are more likely to cause build failures. Instead do something like
>
> if (!IS_ENABLED(CONFIG_64BIT))
> len -= sizeof(u32);
Okay, I'll send a patch v2 that fixes this and updates the commit message
based on the above.
>
>
> Cheers,
--
WangYuli <wangyuli@xxxxxxxxxxxxx>