On 12/11/18 11:44 PM, Akira Yokosawa wrote:
> From 61a010b7c34c1b6c05b3883637ccdf2438409408 Mon Sep 17 00:00:00 2001
> From: Akira Yokosawa <akiyks@xxxxxxxxx>
> Date: Tue, 11 Dec 2018 21:14:31 +0900
> Subject: [PATCH 4/4] EXP CodeSamples: Add weak variant of cmpxchg() as cmpxchg_weak()
> 
> For comparison, add weak (spurious-failure permitting) ones in the
> names of cmpxchg_weak() and atomic_cmpxchg_weak().
> count_lim_atomic_weak.c is a copy of count_lim_atomic.c but uses
> atomic_cmpxchg_weak().
> 
> Signed-off-by: Akira Yokosawa <akiyks@xxxxxxxxx>
> ---
>  CodeSamples/api-pthreads/api-gcc.h        |  14 ++
>  CodeSamples/count/Makefile                |   4 +
>  CodeSamples/count/count_lim_atomic_weak.c | 226 ++++++++++++++++++++++++++++++
>  3 files changed, 244 insertions(+)
>  create mode 100644 CodeSamples/count/count_lim_atomic_weak.c
> 
> diff --git a/CodeSamples/api-pthreads/api-gcc.h b/CodeSamples/api-pthreads/api-gcc.h
> index b66f4b9..b27facd 100644
> --- a/CodeSamples/api-pthreads/api-gcc.h
> +++ b/CodeSamples/api-pthreads/api-gcc.h
> @@ -178,6 +178,20 @@ static __inline__ int atomic_cmpxchg(atomic_t *v, int old, int new)
>  	return cmpxchg(&v->counter, old, new);
>  }
>  
> +#define cmpxchg_weak(ptr, o, n) \
> +({ \
> +	typeof(*ptr) _____actual = (o); \
> +	typeof(*ptr) _____o = _____actual; \
> +	\
> +	__atomic_compare_exchange_n((ptr), (void *)&_____actual, (n), 1, \
> +	                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED) ? _____o : _____o+1 ; \
> +})
> +

Hi Akira,

Quite an interesting topic. I did some experiments; please see below.

We changed the implementation of cmpxchg() heavily, so to compare apples to
apples, let's use the latest two versions---count_lim_atomic based on your
latest patch [3/4] and count_lim_atomic_weak based on patch [4/4].

I ran these two programs on an 8-core PPC server, with the duration of each
run set to 2.4 seconds.

The first observation is that the result reported in your last mail (patch
[0/4]) is correct: for each update operation, count_lim_atomic is about
10 nanoseconds faster than count_lim_atomic_weak, no matter how many
concurrent updaters are running. For example, with 64 updaters I got the
following results:

./count_lim_atomic 64 uperf
ns/update: 290

./count_lim_atomic_weak 64 uperf
ns/update: 301

Following is my analysis. The first reason I found is that cmpxchg_weak()
uses a temporary variable _____o to avoid evaluating (o) twice and thus
duplicating its side effects. If we switch cmpxchg_weak() back to the older
version (shown below; I know it has flaws, but let's forget about them
temporarily :-)), count_lim_atomic_weak runs as fast as count_lim_atomic.

#define cmpxchg_weak(ptr, o, n) \
({ \
	typeof(*ptr) _____actual = (o); \
	\
	__atomic_compare_exchange_n((ptr), (void *)&_____actual, (n), 1, \
	                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED) ? (o) : (n) ; \
})

The second possible reason could be more interesting; I discussed it in my
reply to your mail [Patch 3/4].
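Coming back to the first reason for a moment, below is a minimal standalone
sketch (not part of your patch; the name cmpxchg_weak_bool() and the
increment example are made up for illustration) of the alternative I have in
mind: if the weak CAS reports success/failure as a boolean instead of
returning the old value, the caller keeps the expected value in its own
variable and no _____o-style temporary is needed at all.

#include <stdio.h>

/*
 * Weak CAS that returns true on success and false on (possibly spurious)
 * failure.  On failure the builtin writes the value it actually observed
 * back into *oldp, so the caller can retry without rereading *ptr.
 */
#define cmpxchg_weak_bool(ptr, oldp, new) \
	__atomic_compare_exchange_n((ptr), (oldp), (new), 1, \
				    __ATOMIC_SEQ_CST, __ATOMIC_RELAXED)

static int counter;

static void add_one(void)
{
	int old = __atomic_load_n(&counter, __ATOMIC_RELAXED);

	/* Retry until the weak CAS succeeds; old is refreshed on failure. */
	while (!cmpxchg_weak_bool(&counter, &old, old + 1))
		continue;
}

int main(void)
{
	add_one();
	add_one();
	printf("counter = %d\n", counter);	/* prints "counter = 2" */
	return 0;
}

Of course this changes the cmpxchg() calling convention used in api-gcc.h
and count_lim_atomic_weak.c, so please take it only as a sketch of where
the extra instructions seem to come from.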
Regards,
--Junchang

> +static __inline__ int atomic_cmpxchg_weak(atomic_t *v, int old, int new)
> +{
> +	return cmpxchg_weak(&v->counter, old, new);
> +}
> +
>  #define xchg(ptr, v)	__atomic_exchange_n((ptr), (v), __ATOMIC_SEQ_CST)
>  #define atomic_xchg(ptr, v) \
>  	__atomic_exchange_n(&(ptr)->counter, (v), __ATOMIC_SEQ_CST)
> diff --git a/CodeSamples/count/Makefile b/CodeSamples/count/Makefile
> index 481eb3f..52eb466 100644
> --- a/CodeSamples/count/Makefile
> +++ b/CodeSamples/count/Makefile
> @@ -22,6 +22,7 @@ PROGS = count_atomic \
>  	count_lim \
>  	count_lim_app \
>  	count_lim_atomic \
> +	count_lim_atomic_weak \
>  	count_lim_sig \
>  	count_limd \
>  	count_nonatomic \
> @@ -65,6 +66,9 @@ count_lim_app: count_lim_app.c ../api.h limtorture.h
>  count_lim_atomic: count_lim_atomic.c ../api.h limtorture.h
>  	$(CC) $(GCC_ARGS) $(CFLAGS) -o count_lim_atomic count_lim_atomic.c -lpthread
>  
> +count_lim_atomic_weak: count_lim_atomic_weak.c ../api.h limtorture.h
> +	$(CC) $(GCC_ARGS) $(CFLAGS) -o count_lim_atomic_weak count_lim_atomic_weak.c -lpthread
> +
>  count_lim_sig: count_lim_sig.c ../api.h limtorture.h
>  	$(CC) $(GCC_ARGS) $(CFLAGS) -o count_lim_sig count_lim_sig.c -lpthread
>  
> diff --git a/CodeSamples/count/count_lim_atomic_weak.c b/CodeSamples/count/count_lim_atomic_weak.c
> new file mode 100644
> index 0000000..42953f2
> --- /dev/null
> +++ b/CodeSamples/count/count_lim_atomic_weak.c
> @@ -0,0 +1,226 @@
> +/*
> + * count_lim_atomic_weak.c: simple limit counter that uses weak variant
> + *	of atomic operations to steal from other threads.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, you can access it online at
> + * http://www.gnu.org/licenses/gpl-2.0.html.
> + *
> + * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
> + */
> +
> +#include "../api.h"
> +
> +static void balance_count(void);
> +
> +atomic_t __thread counterandmax = ATOMIC_INIT(0);	//\lnlbl{var:candmax}
> +unsigned long globalcountmax = 1 << 25;			//\lnlbl{var:def:b}
> +unsigned long globalcount = 0;
> +unsigned long globalreserve = 0;
> +atomic_t *counterp[NR_THREADS] = { NULL };
> +DEFINE_SPINLOCK(gblcnt_mutex);				//\lnlbl{var:def:e}
> +#define CM_BITS (sizeof(atomic_t) * 4)			//\lnlbl{var:CM_BITS}
> +#define MAX_COUNTERMAX ((1 << CM_BITS) - 1)		//\lnlbl{var:MAX_CMAX}
> +
> +static __inline__ void					//\lnlbl{split_int:b}
> +split_counterandmax_int(int cami, int *c, int *cm)
> +{
> +	*c = (cami >> CM_BITS) & MAX_COUNTERMAX;	//\lnlbl{split_int:msh}
> +	*cm = cami & MAX_COUNTERMAX;			//\lnlbl{split_int:lsh}
> +}							//\lnlbl{split_int:e}
> +
> +static __inline__ void					//\lnlbl{split:b}
> +split_counterandmax(atomic_t *cam, int *old, int *c, int *cm)//\lnlbl{split:func}
> +{
> +	unsigned int cami = atomic_read(cam);		//\lnlbl{split:int}
> +
> +	*old = cami;					//\lnlbl{split:old}
> +	split_counterandmax_int(cami, c, cm);		//\lnlbl{split:split_int}
> +}							//\lnlbl{split:e}
> +
> +static __inline__ int merge_counterandmax(int c, int cm)//\lnlbl{merge:b}
> +{
> +	unsigned int cami;
> +
> +	cami = (c << CM_BITS) | cm;			//\lnlbl{merge:merge}
> +	return ((int)cami);
> +}							//\lnlbl{merge:e}
> +
> +static void globalize_count(void)			//\lnlbl{globalize:b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +
> +	split_counterandmax(&counterandmax, &old, &c, &cm);//\lnlbl{globalize:split}
> +	globalcount += c;
> +	globalreserve -= cm;
> +	old = merge_counterandmax(0, 0);
> +	atomic_set(&counterandmax, old);
> +}							//\lnlbl{globalize:e}
> +
> +static void flush_local_count(void)			//\lnlbl{flush:b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +	int t;
> +	int zero;
> +
> +	if (globalreserve == 0)				//\lnlbl{flush:checkrsv}
> +		return;					//\lnlbl{flush:return:n}
> +	zero = merge_counterandmax(0, 0);		//\lnlbl{flush:initzero}
> +	for_each_thread(t)				//\lnlbl{flush:loop:b}
> +		if (counterp[t] != NULL) {		//\lnlbl{flush:checkp}
> +			old = atomic_xchg(counterp[t], zero);//\lnlbl{flush:atmxchg}
> +			split_counterandmax_int(old, &c, &cm);//\lnlbl{flush:split}
> +			globalcount += c;		//\lnlbl{flush:glbcnt}
> +			globalreserve -= cm;		//\lnlbl{flush:glbrsv}
> +		}					//\lnlbl{flush:loop:e}
> +}							//\lnlbl{flush:e}
> +
> +int add_count(unsigned long delta)			//\lnlbl{add:b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +	int new;
> +
> +	do {						//\lnlbl{add:fast:b}
> +		split_counterandmax(&counterandmax, &old, &c, &cm);//\lnlbl{add:split}
> +		if (delta > MAX_COUNTERMAX || c + delta > cm)//\lnlbl{add:check}
> +			goto slowpath;			//\lnlbl{add:goto}
> +		new = merge_counterandmax(c + delta, cm);//\lnlbl{add:merge}
> +	} while (atomic_cmpxchg_weak(&counterandmax,	//\lnlbl{add:atmcmpex}
> +	                             old, new) != old);	//\lnlbl{add:loop:e}
> +	return 1;					//\lnlbl{add:return:fs}
> +slowpath:						//\lnlbl{add:slow:b}
> +	spin_lock(&gblcnt_mutex);			//\lnlbl{add:acquire}
> +	globalize_count();				//\lnlbl{add:globalize}
> +	if (globalcountmax - globalcount -		//\lnlbl{add:checkglb:b}
> +	    globalreserve < delta) {			//\lnlbl{add:checkglb:e}
> +		flush_local_count();			//\lnlbl{add:flush}
> +		if (globalcountmax - globalcount -	//\lnlbl{add:checkglb:nb}
> +		    globalreserve < delta) {		//\lnlbl{add:checkglb:ne}
> +			spin_unlock(&gblcnt_mutex);	//\lnlbl{add:release:f}
> +			return 0;			//\lnlbl{add:return:sf}
> +		}
> +	}
> +	globalcount += delta;				//\lnlbl{add:addglb}
> +	balance_count();				//\lnlbl{add:balance}
> +	spin_unlock(&gblcnt_mutex);			//\lnlbl{add:release:s}
> +	return 1;					//\lnlbl{add:return:ss}
> +}							//\lnlbl{add:e}
> +
> +int sub_count(unsigned long delta)			//\lnlbl{sub:b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +	int new;
> +
> +	do {						//\lnlbl{sub:fast:b}
> +		split_counterandmax(&counterandmax, &old, &c, &cm);
> +		if (delta > c)
> +			goto slowpath;
> +		new = merge_counterandmax(c - delta, cm);
> +	} while (atomic_cmpxchg_weak(&counterandmax,
> +	                             old, new) != old);
> +	return 1;					//\lnlbl{sub:fast:e}
> +slowpath:						//\lnlbl{sub:slow:b}
> +	spin_lock(&gblcnt_mutex);
> +	globalize_count();
> +	if (globalcount < delta) {
> +		flush_local_count();
> +		if (globalcount < delta) {
> +			spin_unlock(&gblcnt_mutex);
> +			return 0;
> +		}
> +	}
> +	globalcount -= delta;
> +	balance_count();
> +	spin_unlock(&gblcnt_mutex);
> +	return 1;					//\lnlbl{sub:slow:e}
> +}							//\lnlbl{sub:e}
> +
> +unsigned long read_count(void)				//\lnlbl{b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +	int t;
> +	unsigned long sum;
> +
> +	spin_lock(&gblcnt_mutex);			//\lnlbl{acquire}
> +	sum = globalcount;				//\lnlbl{initsum}
> +	for_each_thread(t)				//\lnlbl{loop:b}
> +		if (counterp[t] != NULL) {
> +			split_counterandmax(counterp[t], &old, &c, &cm);//\lnlbl{split}
> +			sum += c;
> +		}					//\lnlbl{loop:e}
> +	spin_unlock(&gblcnt_mutex);			//\lnlbl{release}
> +	return sum;					//\lnlbl{return}
> +}							//\lnlbl{e}
> +
> +void count_init(void)
> +{
> +}
> +
> +static void balance_count(void)				//\lnlbl{balance:b}
> +{
> +	int c;
> +	int cm;
> +	int old;
> +	unsigned long limit;
> +
> +	limit = globalcountmax - globalcount -
> +	        globalreserve;
> +	limit /= num_online_threads();
> +	if (limit > MAX_COUNTERMAX)
> +		cm = MAX_COUNTERMAX;
> +	else
> +		cm = limit;
> +	globalreserve += cm;
> +	c = cm / 2;
> +	if (c > globalcount)
> +		c = globalcount;
> +	globalcount -= c;
> +	old = merge_counterandmax(c, cm);
> +	atomic_set(&counterandmax, old);		//\lnlbl{balance:atmcset}
> +}							//\lnlbl{balance:e}
> +
> +void count_register_thread(void)			//\lnlbl{register:b}
> +{
> +	int idx = smp_thread_id();
> +
> +	spin_lock(&gblcnt_mutex);
> +	counterp[idx] = &counterandmax;
> +	spin_unlock(&gblcnt_mutex);
> +}
> +
> +void count_unregister_thread(int nthreadsexpected)	//\lnlbl{unregister:b}
> +{
> +	int idx = smp_thread_id();
> +
> +	spin_lock(&gblcnt_mutex);
> +	globalize_count();
> +	counterp[idx] = NULL;
> +	spin_unlock(&gblcnt_mutex);
> +}
> +
> +void count_cleanup(void)
> +{
> +}
> +
> +#define NEED_REGISTER_THREAD
> +#include "limtorture.h"
> 