On Sat, 2022-09-10 at 04:36 +0800, Jiebin Sun wrote:
> The batch size in percpu_counter_add_batch should be very large
> in heavy writing and rare reading case. Add the "_local" version,
> and mostly it will do local adding, reduce the global updating
> and mitigate lock contention in writing.
>
> Signed-off-by: Jiebin Sun <jiebin.sun@xxxxxxxxx>
> ---
>  include/linux/percpu_counter.h | 38 ++++++++++++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>
> diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
> index 01861eebed79..6dd7eaba8527 100644
> --- a/include/linux/percpu_counter.h
> +++ b/include/linux/percpu_counter.h
> @@ -15,6 +15,9 @@
>  #include <linux/types.h>
>  #include <linux/gfp.h>
>  
> +/* percpu_counter batch for local add or sub */
> +#define PERCPU_COUNTER_LOCAL_BATCH	INT_MAX
> +
>  #ifdef CONFIG_SMP
>  
>  struct percpu_counter {
> @@ -56,6 +59,27 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
>  	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
>  }
>  
> +/*
> + * Use this function in heavy writing but rare reading case. The large
> + * batch size will reduce the global updating.

Suggest revising the comment, so it is clear we need to use
percpu_counter_sum() to access the counter:

With percpu_counter_add_local() and percpu_counter_sub_local(), counts
are accumulated in local per cpu counter and not in fbc->count until
local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
write efficient.

But percpu_counter_sum(), instead of percpu_counter_read(), needs to be
used to add up the counts from each CPU to account for all the local
counts. So percpu_counter_add_local() and percpu_counter_sub_local()
should be used when a counter is updated frequently and read rarely.

> + */
> +static inline void
> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> +	percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH);
> +}
> +
> +/*
> + * Similar with percpu_counter_add_local, use it in heavy writing but
> + * rare reading case. The large batch size will reduce the global
> + * updating.
> + */
> +static inline void
> +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
> +{
> +	percpu_counter_add_batch(fbc, -amount, PERCPU_COUNTER_LOCAL_BATCH);
> +}
> +
>  static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
>  {
>  	s64 ret = __percpu_counter_sum(fbc);
> @@ -138,6 +162,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
>  	preempt_enable();
>  }
>  
> +/* no smp percpu_counter_add_local is the same with percpu_counter_add */
> +static inline void
> +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
> +{
> +	percpu_counter_add(fbc, amount);
> +}
> +
> +/* no smp percpu_counter_sub_local is the same with percpu_counter_sub */
> +static inline void
> +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
> +{
> +	percpu_counter_sub(fbc, amount);
> +}
> +
>  static inline void
>  percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
>  {
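
To make the intended usage pattern concrete, here is a minimal sketch
(the foo_events counter and its helpers are hypothetical, and it
assumes this patch is applied): updates go through
percpu_counter_add_local() and accumulate in the per-cpu counts, so
readers need percpu_counter_sum() rather than percpu_counter_read().

#include <linux/percpu_counter.h>

/* hypothetical counter that is updated frequently and read rarely */
static struct percpu_counter foo_events;

static int foo_init(void)
{
	return percpu_counter_init(&foo_events, 0, GFP_KERNEL);
}

static void foo_record_event(void)
{
	/* stays in the local per-cpu count, not in fbc->count */
	percpu_counter_add_local(&foo_events, 1);
}

static s64 foo_total_events(void)
{
	/*
	 * percpu_counter_read() could lag by up to
	 * PERCPU_COUNTER_LOCAL_BATCH per CPU here, so add up the
	 * per-cpu counts instead.
	 */
	return percpu_counter_sum(&foo_events);
}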