On Thu, Nov 11, 2021 at 10:07:27AM +0100, Sebastian Andrzej Siewior wrote:
> On 2021-11-10 10:54:32 [-0800], Minchan Kim wrote:
> > zsmalloc has been using a bit in the zpage handle as a spin_lock to
> > keep the zpage object alive during several operations. However, it
> > causes problems for PREEMPT_RT as well as being too complicated.
> > 
> > This patch replaces the bit spin_lock with the pool->migrate_lock
> > rwlock. It makes the code simpler as well as making zsmalloc work
> > under PREEMPT_RT.
> > 
> > The drawback is that pool->migrate_lock has a coarser granularity
> > than the per-zpage lock, so contention would be higher than before
> > when both IO-related operations (i.e., zsmalloc, zsfree,
> > zs_[map|unmap]) and compaction (page/zpage migration) run in
> > parallel (*the migrate_lock is an rwlock and the IO-related
> > functions all take the read-side lock, so there is no contention
> > among them). However, the write side is fast enough (the dominant
> > overhead is just the page copy) that it shouldn't matter much. If
> > the lock granularity becomes more of a problem later, we could
> > introduce table locks keyed by the handle as a hash value.
> > 
> > Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
> …
> > index b8b098be92fa..5d4c4d254679 100644
> > --- a/mm/zsmalloc.c
> > +++ b/mm/zsmalloc.c
> > @@ -1789,6 +1767,11 @@ static void migrate_write_lock(struct zspage *zspage)
> >  	write_lock(&zspage->lock);
> >  }
> >  
> > +static void migrate_write_lock_nested(struct zspage *zspage)
> > +{
> > +	write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING);
> 
> I don't have this in my tree.

I forgot it. I append it at the tail of the thread. I will also include
it in the next revision.

> > +}
> > +
> >  static void migrate_write_unlock(struct zspage *zspage)
> >  {
> >  	write_unlock(&zspage->lock);
> …
> > @@ -2077,8 +2043,13 @@ static unsigned long __zs_compact(struct zs_pool *pool,
> >  	struct zspage *dst_zspage = NULL;
> >  	unsigned long pages_freed = 0;
> >  
> > +	/* protect the race between zpage migration and zs_free */
> > +	write_lock(&pool->migrate_lock);
> > +	/* protect zpage allocation/free */
> >  	spin_lock(&class->lock);
> >  	while ((src_zspage = isolate_zspage(class, true))) {
> > +		/* protect someone accessing the zspage(i.e., zs_map_object) */
> > +		migrate_write_lock(src_zspage);
> > 
> >  		if (!zs_can_compact(class))
> >  			break;
> > @@ -2087,6 +2058,8 @@ static unsigned long __zs_compact(struct zs_pool *pool,
> >  		cc.s_page = get_first_page(src_zspage);
> > 
> >  		while ((dst_zspage = isolate_zspage(class, false))) {
> > +			migrate_write_lock_nested(dst_zspage);
> > +
> >  			cc.d_page = get_first_page(dst_zspage);
> >  			/*
> >  			 * If there is no more space in dst_page, resched
> 
> Looking at these two chunks, the page here comes from a list, you
> remove that page from that list and this ensures that you can't lock
> the very same pages in reverse order as in:
> 
> migrate_write_lock(dst_zspage);
> …
> migrate_write_lock(src_zspage);
> 
> right?

Sure.
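To spell out the resulting order, here is a minimal sketch of how the
compaction path ends up taking the locks (illustrative only, based on the
hunks quoted above; the retry loops, NULL checks, zs_can_compact()
handling and the per-iteration unlocks are omitted, and
compact_lock_order_sketch() is just a made-up name, not real zsmalloc
code):

static void compact_lock_order_sketch(struct zs_pool *pool,
				      struct size_class *class)
{
	struct zspage *src, *dst;

	/* block zs_free and zpage migration for the whole pass */
	write_lock(&pool->migrate_lock);
	/* protect zpage allocation/free within the class */
	spin_lock(&class->lock);

	/*
	 * isolate_zspage() takes the zspage off the class lists, so no
	 * other compaction thread can pick the same zspage and lock it
	 * in the opposite (dst before src) order.
	 */
	src = isolate_zspage(class, true);
	migrate_write_lock(src);

	dst = isolate_zspage(class, false);
	/*
	 * Second write lock of the same lock class: the _nested variant
	 * tells lockdep that this single level of nesting is intentional.
	 */
	migrate_write_lock_nested(dst);

	/* ... migrate objects from src to dst ... */

	migrate_write_unlock(dst);
	migrate_write_unlock(src);
	spin_unlock(&class->lock);
	write_unlock(&pool->migrate_lock);
}

Because both zspages are already off the class lists before they are
locked, the reverse ordering can never happen; the only remaining issue
is lockdep seeing two write locks of the same class, which is what
write_lock_nested() below is for.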
>From e0bfc5185bbd15c651a7a367b6d053b8c88b1e01 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@xxxxxxxxxx>
Date: Tue, 19 Oct 2021 15:34:09 -0700
Subject: [PATCH] locking/rwlocks: introduce write_lock_nested

Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx>
---
 include/linux/rwlock.h          | 6 ++++++
 include/linux/rwlock_api_smp.h  | 9 +++++++++
 include/linux/spinlock_api_up.h | 1 +
 kernel/locking/spinlock.c       | 6 ++++++
 4 files changed, 22 insertions(+)

diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index 7ce9a51ae5c0..93086de7bf9e 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -70,6 +70,12 @@ do {								\
 #define write_lock(lock)	_raw_write_lock(lock)
 #define read_lock(lock)		_raw_read_lock(lock)
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define write_lock_nested(lock, subclass)	_raw_write_lock_nested(lock, subclass)
+#else
+#define write_lock_nested(lock, subclass)	_raw_write_lock(lock)
+#endif
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 
 #define read_lock_irqsave(lock, flags)			\
diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index abfb53ab11be..e0c866177c03 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -17,6 +17,7 @@
 
 void __lockfunc _raw_read_lock(rwlock_t *lock)		__acquires(lock);
 void __lockfunc _raw_write_lock(rwlock_t *lock)		__acquires(lock);
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)	__acquires(lock);
 void __lockfunc _raw_read_lock_bh(rwlock_t *lock)	__acquires(lock);
 void __lockfunc _raw_write_lock_bh(rwlock_t *lock)	__acquires(lock);
 void __lockfunc _raw_read_lock_irq(rwlock_t *lock)	__acquires(lock);
@@ -46,6 +47,7 @@ _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 
 #ifdef CONFIG_INLINE_WRITE_LOCK
 #define _raw_write_lock(lock) __raw_write_lock(lock)
+#define _raw_write_lock_nested(lock, subclass) __raw_write_lock_nested(lock, subclass)
 #endif
 
 #ifdef CONFIG_INLINE_READ_LOCK_BH
@@ -211,6 +213,13 @@ static inline void __raw_write_lock(rwlock_t *lock)
 	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 }
 
+static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+	preempt_disable();
+	rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
+}
+
 #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline void __raw_write_unlock(rwlock_t *lock)
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index d0d188861ad6..b8ba00ccccde 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -59,6 +59,7 @@
 #define _raw_spin_lock_nested(lock, subclass)	__LOCK(lock)
 #define _raw_read_lock(lock)			__LOCK(lock)
 #define _raw_write_lock(lock)			__LOCK(lock)
+#define _raw_write_lock_nested(lock, subclass)	__LOCK(lock)
 #define _raw_spin_lock_bh(lock)			__LOCK_BH(lock)
 #define _raw_read_lock_bh(lock)			__LOCK_BH(lock)
 #define _raw_write_lock_bh(lock)		__LOCK_BH(lock)
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index c5830cfa379a..22969ec69288 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -300,6 +300,12 @@ void __lockfunc _raw_write_lock(rwlock_t *lock)
 	__raw_write_lock(lock);
 }
 EXPORT_SYMBOL(_raw_write_lock);
+
+void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass)
+{
+	__raw_write_lock_nested(lock, subclass);
+}
+EXPORT_SYMBOL(_raw_write_lock_nested);
 #endif
 
 #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
-- 
2.34.0.rc1.387.gb447b232ab-goog
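For reference, a minimal usage sketch of the new primitive (struct item,
item_init() and lock_two_items() are hypothetical names for illustration
and are not part of the patch):

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct item {
	rwlock_t lock;
	/* ... payload ... */
};

static void item_init(struct item *item)
{
	/* every item's lock gets its lock class from this one init site */
	rwlock_init(&item->lock);
}

static void lock_two_items(struct item *a, struct item *b)
{
	write_lock(&a->lock);
	/*
	 * Second write lock of the same lock class; SINGLE_DEPTH_NESTING
	 * tells lockdep that one level of nesting is intentional, so it
	 * does not report possible recursive locking.  The caller must
	 * still guarantee a stable a-before-b order (zsmalloc does that
	 * by isolating the zspages from the class lists before locking).
	 */
	write_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);

	/* ... work on both items ... */

	write_unlock(&b->lock);
	write_unlock(&a->lock);
}

With CONFIG_DEBUG_LOCK_ALLOC disabled, write_lock_nested() falls back to
a plain write_lock(), as the #ifdef added to rwlock.h above shows.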