Re: [PATCH v3 05/20] hash: convert `oidcmp()` and `oideq()` to compare whole hash

Karthik Nayak <karthik.188@xxxxxxxxx> · Mon, 17 Jun 2024 02:18:16 -0700

Patrick Steinhardt <ps@xxxxxx> writes:

> With the preceding commit, the hash array of object IDs is now fully
> zero-padded even when the hash algorithm's output is smaller than the
> array length. With that, we can now adapt both `oidcmp()` and `oideq()`
> to unconditionally memcmp(3P) the whole array instead of depending on
> the hash size.
>
> While it may feel inefficient to compare unused bytes for e.g. SHA-1, in
> practice the compiler should now be able to produce code that is better
> optimized both because we have no branch anymore, but also because the
> size to compare is now known at compile time. Goldbolt spits out the
> following assembly on an x86_64 platform with GCC 14.1 for the old and
> new implementations of `oidcmp()`:
>
>     oidcmp_old:
>             movsx   rax, DWORD PTR [rdi+32]
>             test    eax, eax
>             jne     .L2
>             mov     rax, QWORD PTR the_repository[rip]
>             cmp     QWORD PTR [rax+16], 32
>             je      .L6
>     .L4:
>             mov     edx, 20
>             jmp     memcmp
>     .L2:
>             lea     rdx, [rax+rax*2]
>             lea     rax, [rax+rdx*4]
>             lea     rax, hash_algos[0+rax*8]
>             cmp     QWORD PTR [rax+16], 32
>             jne     .L4
>     .L6:
>             mov     edx, 32
>             jmp     memcmp
>
>     oidcmp_new:
>             mov     edx, 32
>             jmp     memcmp
>
> The new implementation gets ridi of all the branches and effectively

s/ridi/rid

> only ends setting up `edx` for `memcmp()` and then calling it.
>
> And for `oideq()`:
>
>     oideq_old:
>             movsx   rcx, DWORD PTR [rdi+32]
>             mov     rax, rdi
>             mov     rdx, rsi
>             test    ecx, ecx
>             jne     .L2
>             mov     rcx, QWORD PTR the_repository[rip]
>             cmp     QWORD PTR [rcx+16], 32
>             mov     rcx, QWORD PTR [rax]
>             je      .L12
>     .L4:
>             mov     rsi, QWORD PTR [rax+8]
>             xor     rcx, QWORD PTR [rdx]
>             xor     rsi, QWORD PTR [rdx+8]
>             or      rcx, rsi
>             je      .L13
>     .L8:
>             mov     eax, 1
>             test    eax, eax
>             sete    al
>             movzx   eax, al
>             ret
>     .L2:
>             lea     rsi, [rcx+rcx*2]
>             lea     rcx, [rcx+rsi*4]
>             lea     rcx, hash_algos[0+rcx*8]
>             cmp     QWORD PTR [rcx+16], 32
>             mov     rcx, QWORD PTR [rax]
>             jne     .L4
>     .L12:
>             mov     rsi, QWORD PTR [rax+8]
>             xor     rcx, QWORD PTR [rdx]
>             xor     rsi, QWORD PTR [rdx+8]
>             or      rcx, rsi
>             jne     .L8
>             mov     rcx, QWORD PTR [rax+16]
>             mov     rax, QWORD PTR [rax+24]
>             xor     rcx, QWORD PTR [rdx+16]
>             xor     rax, QWORD PTR [rdx+24]
>             or      rcx, rax
>             jne     .L8
>             xor     eax, eax
>     .L14:
>             test    eax, eax
>             sete    al
>             movzx   eax, al
>             ret
>     .L13:
>             mov     edi, DWORD PTR [rdx+16]
>             cmp     DWORD PTR [rax+16], edi
>             jne     .L8
>             xor     eax, eax
>             jmp     .L14
>
>     oideq_new:
>             mov     rax, QWORD PTR [rdi]
>             mov     rdx, QWORD PTR [rdi+8]
>             xor     rax, QWORD PTR [rsi]
>             xor     rdx, QWORD PTR [rsi+8]
>             or      rax, rdx
>             je      .L5
>     .L2:
>             mov     eax, 1
>             xor     eax, 1
>             ret
>     .L5:
>             mov     rax, QWORD PTR [rdi+16]
>             mov     rdx, QWORD PTR [rdi+24]
>             xor     rax, QWORD PTR [rsi+16]
>             xor     rdx, QWORD PTR [rsi+24]
>             or      rax, rdx
>             jne     .L2
>             xor     eax, eax
>             xor     eax, 1
>             ret
>
> Interestingly, the compiler decides to split the comparisons into two so
> that it first compares the lower half of the object ID for equality and
> then the upper half. If the first check shows a difference, then we
> wouldn't even end up comparing the second half.
>
> In both cases, the new generated code is significantly shorter and has
> way less branches. While I didn't benchmark the change, I'd be surprised
> if the new code was slower.
>

This was nice to read, thanks for adding the ASM here.

> Signed-off-by: Patrick Steinhardt <ps@xxxxxx>
> ---
>  hash-ll.h | 10 ++++++++++
>  hash.h    | 20 --------------------
>  2 files changed, 10 insertions(+), 20 deletions(-)
>
> diff --git a/hash-ll.h b/hash-ll.h
> index b72f84f4ae..b04fe12aef 100644
> --- a/hash-ll.h
> +++ b/hash-ll.h
> @@ -278,6 +278,16 @@ static inline void hashclr(unsigned char *hash, const struct git_hash_algo *algo
>  	memset(hash, 0, algop->rawsz);
>  }
>
> +static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
> +{
> +	return memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
> +}
> +
> +static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
> +{
> +	return !memcmp(oid1->hash, oid2->hash, GIT_MAX_RAWSZ);
> +}
> +
>  static inline void oidcpy(struct object_id *dst, const struct object_id *src)
>  {
>  	memcpy(dst->hash, src->hash, GIT_MAX_RAWSZ);
> diff --git a/hash.h b/hash.h
> index e43e3d8b5a..ddc2e5ca47 100644
> --- a/hash.h
> +++ b/hash.h
> @@ -6,26 +6,6 @@
>
>  #define the_hash_algo the_repository->hash_algo
>
> -static inline int oidcmp(const struct object_id *oid1, const struct object_id *oid2)
> -{
> -	const struct git_hash_algo *algop;
> -	if (!oid1->algo)
> -		algop = the_hash_algo;
> -	else
> -		algop = &hash_algos[oid1->algo];
> -	return hashcmp(oid1->hash, oid2->hash, algop);
> -}
> -
> -static inline int oideq(const struct object_id *oid1, const struct object_id *oid2)
> -{
> -	const struct git_hash_algo *algop;
> -	if (!oid1->algo)
> -		algop = the_hash_algo;
> -	else
> -		algop = &hash_algos[oid1->algo];
> -	return hasheq(oid1->hash, oid2->hash, algop);
> -}
> -
>  static inline int is_null_oid(const struct object_id *oid)
>  {
>  	return oideq(oid, null_oid());
> --
> 2.45.2.457.g8d94cfb545.dirty
Attachment:
signature.asc

Description: PGP signature