Hi Lai, Just a few comments about your v2 proposal. Hopefully you'll catch these before you send out v3 :) - I would prefer reader_refcnt to be unsigned int instead of unsigned long - I would like some comment to indicate that lgrwlocks don't have reader-writer fairness and are thus somewhat discouraged (people could use plain lglock if they don't need reader preference, though even that use (as brlock) is discouraged already :) - I don't think FALLBACK_BASE is necessary (you already mentioned you'd drop it) - I prefer using the fallback_rwlock's dep_map for lockdep tracking. I feel this is more natural since we want the lgrwlock to behave as the rwlock, not as the lglock. - I prefer to avoid return statements in the middle of functions when it's easy to do so. Attached is my current version (based on an earlier version of your code). You don't have to take it as is but I feel it makes for a more concrete suggestion :) Thanks, ----------------------------8<------------------------------------------- lglock: add read-preference lgrwlock Current lglock may be used as a fair rwlock; however sometimes a read-preference rwlock is preferred. One such use case recently came up for get_cpu_online_atomic(). This change adds a new lgrwlock with the following properties: - high performance read side, using only cpu-local structures when there is no write side to contend with; - correctness guarantees similar to rwlock_t: recursive readers are allowed and the lock's read side is not ordered vs other locks; - low performance write side (comparable to lglocks' global side). 
The implementation relies on the following principles: - reader_refcnt is a local lock count; it indicates how many recursive read locks are taken using the local lglock; - lglock is used by readers for local locking; it must be acquired before reader_refcnt becomes nonzero and released after reader_refcnt goes back to zero; - fallback_rwlock is used by readers for global locking; it is acquired when reader_refcnt is zero and the trylock fails on lglock. - writers take both the lglock write side and the fallback_rwlock, thus making sure to exclude both local and global readers. Thanks to Srivatsa S. Bhat for proposing a lock with these requirements and Lai Jiangshan for proposing this algorithm as an lglock extension. Signed-off-by: Michel Lespinasse <walken@xxxxxxxxxx> --- include/linux/lglock.h | 46 +++++++++++++++++++++++++++++++++++++++ kernel/lglock.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 0d24e932db0b..8b59084935d5 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -67,4 +67,50 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu); void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); +/* + * lglock may be used as a read write spinlock if desired (though this is + * not encouraged as the write side scales badly on high CPU count machines). + * It has reader/writer fairness when used that way. + * + * However, sometimes it is desired to have an unfair rwlock instead, with + * reentrant readers that don't need to be ordered vs other locks, comparable + * to rwlock_t. lgrwlock implements such semantics. 
+ */ +struct lgrwlock { + unsigned int __percpu *reader_refcnt; + struct lglock lglock; + rwlock_t fallback_rwlock; +}; + +#define __DEFINE_LGRWLOCK_PERCPU_DATA(name) \ + static DEFINE_PER_CPU(unsigned int, name ## _refcnt); \ + static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \ + = __ARCH_SPIN_LOCK_UNLOCKED; + +#define __LGRWLOCK_INIT(name) { \ + .reader_refcnt = &name ## _refcnt, \ + .lglock = { .lock = &name ## _lock }, \ + .fallback_rwlock = __RW_LOCK_UNLOCKED(name.fallback_rwlock) \ +} + +#define DEFINE_LGRWLOCK(name) \ + __DEFINE_LGRWLOCK_PERCPU_DATA(name) \ + struct lgrwlock name = __LGRWLOCK_INIT(name) + +#define DEFINE_STATIC_LGRWLOCK(name) \ + __DEFINE_LGRWLOCK_PERCPU_DATA(name) \ + static struct lgrwlock name = __LGRWLOCK_INIT(name) + +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name) +{ + lg_lock_init(&lgrw->lglock, name); +} + +void lg_read_lock(struct lgrwlock *lgrw); +void lg_read_unlock(struct lgrwlock *lgrw); +void lg_write_lock(struct lgrwlock *lgrw); +void lg_write_unlock(struct lgrwlock *lgrw); +void __lg_read_write_lock(struct lgrwlock *lgrw); +void __lg_read_write_unlock(struct lgrwlock *lgrw); + #endif diff --git a/kernel/lglock.c b/kernel/lglock.c index 86ae2aebf004..e78a7c95dbfd 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -87,3 +87,61 @@ void lg_global_unlock(struct lglock *lg) preempt_enable(); } EXPORT_SYMBOL(lg_global_unlock); + +void lg_read_lock(struct lgrwlock *lgrw) +{ + preempt_disable(); + + if (__this_cpu_read(*lgrw->reader_refcnt) || + arch_spin_trylock(this_cpu_ptr(lgrw->lglock.lock))) { + __this_cpu_inc(*lgrw->reader_refcnt); + rwlock_acquire_read(&lgrw->fallback_rwlock.dep_map, + 0, 0, _RET_IP_); + } else { + read_lock(&lgrw->fallback_rwlock); + } +} +EXPORT_SYMBOL(lg_read_lock); + +void lg_read_unlock(struct lgrwlock *lgrw) +{ + if (likely(__this_cpu_read(*lgrw->reader_refcnt))) { + rwlock_release(&lgrw->fallback_rwlock.dep_map, + 1, _RET_IP_); + if 
(!__this_cpu_dec_return(*lgrw->reader_refcnt)) + arch_spin_unlock(this_cpu_ptr(lgrw->lglock.lock)); + } else { + read_unlock(&lgrw->fallback_rwlock); + } + + preempt_enable(); +} +EXPORT_SYMBOL(lg_read_unlock); + +void lg_write_lock(struct lgrwlock *lgrw) +{ + lg_global_lock(&lgrw->lglock); + write_lock(&lgrw->fallback_rwlock); +} +EXPORT_SYMBOL(lg_write_lock); + +void lg_write_unlock(struct lgrwlock *lgrw) +{ + write_unlock(&lgrw->fallback_rwlock); + lg_global_unlock(&lgrw->lglock); +} +EXPORT_SYMBOL(lg_write_unlock); + +void __lg_read_write_lock(struct lgrwlock *lgrw) +{ + lg_write_lock(lgrw); + __this_cpu_write(*lgrw->reader_refcnt, 1); +} +EXPORT_SYMBOL(__lg_read_write_lock); + +void __lg_read_write_unlock(struct lgrwlock *lgrw) +{ + __this_cpu_write(*lgrw->reader_refcnt, 0); + lg_write_unlock(lgrw); +} +EXPORT_SYMBOL(__lg_read_write_unlock); -- Michel "Walken" Lespinasse A program is never fully debugged until the last user dies. -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html