On Tue, May 10, 2022 at 05:42:17PM +0200, Uros Bizjak wrote:

For the Changelog I would focus on the 64bit improvement and leave 32bit
as a side-note.

> ---
>  arch/x86/include/asm/cmpxchg_32.h          | 43 ++++++++++++++++++++++
>  arch/x86/include/asm/cmpxchg_64.h          |  6 +++
>  include/linux/atomic/atomic-instrumented.h | 40 +++++++++++++++++++-
>  scripts/atomic/gen-atomic-instrumented.sh  |  2 +-
>  4 files changed, 89 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
> index 0a7fe0321613..e874ff7f7529 100644
> --- a/arch/x86/include/asm/cmpxchg_32.h
> +++ b/arch/x86/include/asm/cmpxchg_32.h
> @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
>  #define arch_cmpxchg64_local(ptr, o, n)                                  \
>      ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
>                                             (unsigned long long)(n)))
> +#define arch_try_cmpxchg64(ptr, po, n)                                   \
> +    ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
> +                                         (unsigned long long)(n)))
>  #endif
>
>  static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
> @@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
>      return prev;
>  }
>
> +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
> +{
> +    bool success;
> +    u64 prev;
> +    asm volatile(LOCK_PREFIX "cmpxchg8b %2"
> +                 CC_SET(z)
> +                 : CC_OUT(z) (success),
> +                   "=A" (prev),
> +                   "+m" (*ptr)
> +                 : "b" ((u32)new),
> +                   "c" ((u32)(new >> 32)),
> +                   "1" (*pold)
> +                 : "memory");
> +
> +    if (unlikely(!success))
> +        *pold = prev;

I would prefer this be more like the existing try_cmpxchg code, perhaps:

	u64 old = *pold;

	asm volatile (LOCK_PREFIX "cmpxchg8b %[ptr]"
		      CC_SET(z)
		      : CC_OUT(z) (success),
			[ptr] "+m" (*ptr),
			"+A" (old)
		      : "b" ((u32)new),
			"c" ((u32)(new >> 32))
		      : "memory");

	if (unlikely(!success))
		*pold = old;

The existing 32bit cmpxchg code is a 'bit' crusty.

> +    return success;
> +}
> +
>  #ifndef CONFIG_X86_CMPXCHG64
>  /*
>   * Building a kernel capable running on 80386 and 80486. It may be necessary
> @@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
>                     : "memory");                          \
>      __ret; })
>
> +#define arch_try_cmpxchg64(ptr, po, n)                   \
> +({                                                       \
> +    bool success;                                        \
> +    __typeof__(*(ptr)) __prev;                           \
> +    __typeof__(ptr) _old = (__typeof__(ptr))(po);        \
> +    __typeof__(*(ptr)) __old = *_old;                    \
> +    __typeof__(*(ptr)) __new = (n);                      \
> +    alternative_io(LOCK_PREFIX_HERE                      \
> +                   "call cmpxchg8b_emu",                 \
> +                   "lock; cmpxchg8b (%%esi)" ,           \
> +                   X86_FEATURE_CX8,                      \
> +                   "=A" (__prev),                        \
> +                   "S" ((ptr)), "0" (__old),             \
> +                   "b" ((unsigned int)__new),            \
> +                   "c" ((unsigned int)(__new>>32))       \
> +                   : "memory");                          \
> +    success = (__prev == __old);                         \
> +    if (unlikely(!success))                              \
> +        *_old = __prev;                                  \
> +    likely(success);                                     \
> +})

Wouldn't this be better written like the normal fallback wrapper?

static __always_inline bool arch_try_cmpxchg64(u64 *v, u64 *old, u64 new)
{
	u64 r, o = *old;
	r = arch_cmpxchg64(v, o, new);
	if (unlikely(r != o))
		*old = r;
	return likely(r == o);
}

Less magical, same exact code.
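
As a hedged aside (not part of the patch or the review above): the payoff
for callers is the usual try_cmpxchg-style retry loop, where the primitive
updates 'old' on failure and the flag result drives the loop directly, so
there is no separate reload or re-compare. try_cmpxchg64() is the generic
instrumented wrapper this patch generates; READ_ONCE() is the usual kernel
helper; the counter_inc64() function is made up here purely for
illustration.

	static inline void counter_inc64(u64 *counter)
	{
		u64 old = READ_ONCE(*counter);

		/* On failure, 'old' is refreshed with the current value. */
		while (!try_cmpxchg64(counter, &old, old + 1))
			;
	}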