On Thu, Feb 20, 2025 at 07:18:40PM +0000, Yosry Ahmed wrote:
> On Fri, Feb 14, 2025 at 01:50:23PM +0900, Sergey Senozhatsky wrote:
> > In order to implement preemptible object mapping we need a zspage lock
> > that satisfies several preconditions:
> > - it should be a reader-writer type of lock
> > - it should be possible to hold it from any context, but it should
> >   also be preemptible if the context allows it
> > - we never sleep while acquiring, but we can sleep while holding it
> >   in read mode
> > 
> > An rwsemaphore doesn't suffice due to the atomicity requirement, and
> > rwlock doesn't satisfy the reader-preemptability requirement. It's
> > also worth mentioning that a per-zspage rwsem is a little too
> > memory-heavy (we can easily have double-digit megabytes used on
> > rwsemaphores alone).
> > 
> > Switch over from rwlock_t to a spinlock-based implementation of a
> > reader-writer semaphore that satisfies all of the preconditions.
> > 
> > The spin-lock based zspage_lock was suggested by Hillf Danton.
> > 
> > Suggested-by: Hillf Danton <hdanton@xxxxxxxx>
> > Signed-off-by: Sergey Senozhatsky <senozhatsky@xxxxxxxxxxxx>
> > ---
> >  mm/zsmalloc.c | 246 +++++++++++++++++++++++++++++++++++++++-----------
> >  1 file changed, 192 insertions(+), 54 deletions(-)
> > 
> > diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> > index 2e338cde0d21..bc679a3e1718 100644
> > --- a/mm/zsmalloc.c
> > +++ b/mm/zsmalloc.c
> > @@ -226,6 +226,9 @@ struct zs_pool {
> >  	/* protect zspage migration/compaction */
> >  	rwlock_t lock;
> >  	atomic_t compaction_in_progress;
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +	struct lock_class_key lock_class;
> > +#endif
> >  };
> >  
> >  static inline void zpdesc_set_first(struct zpdesc *zpdesc)
> > @@ -257,6 +260,18 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
> >  	__free_page(page);
> >  }
> >  
> > +#define ZS_PAGE_UNLOCKED	0
> > +#define ZS_PAGE_WRLOCKED	-1
> > +
> > +struct zspage_lock {
> > +	spinlock_t lock;
> > +	int cnt;
> > +
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +	struct lockdep_map dep_map;
> > +#endif
> > +};
> > +
> >  struct zspage {
> >  	struct {
> >  		unsigned int huge:HUGE_BITS;
> > @@ -269,7 +284,7 @@ struct zspage {
> >  	struct zpdesc *first_zpdesc;
> >  	struct list_head list; /* fullness list */
> >  	struct zs_pool *pool;
> > -	rwlock_t lock;
> > +	struct zspage_lock zsl;
> >  };
> >  
> >  struct mapping_area {
> > @@ -279,6 +294,148 @@ struct mapping_area {
> >  	enum zs_mapmode vm_mm; /* mapping mode */
> >  };
> >  
> > +static void zspage_lock_init(struct zspage *zspage)
> > +{
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > +	lockdep_init_map(&zspage->zsl.dep_map, "zspage->lock",
> > +			 &zspage->pool->lock_class, 0);
> > +#endif
> > +
> > +	spin_lock_init(&zspage->zsl.lock);
> > +	zspage->zsl.cnt = ZS_PAGE_UNLOCKED;
> > +}
> > +
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> 
> Instead of the #ifdef and repeating all the functions, can't we do
> something like:
> 
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> #define zspage_lock_map(zsl) (&zsl->dep_map)
> #else
> #define zspage_lock_map(zsl) /* empty or NULL */
> #endif
> 
> Then we can just have one version of the functions and use
> zspage_lock_map() instead of zsl->dep_map, right?
> 
> > +static inline void __read_lock(struct zspage *zspage)
> > +{
> > +	struct zspage_lock *zsl = &zspage->zsl;
> > +
> > +	rwsem_acquire_read(&zsl->dep_map, 0, 0, _RET_IP_);
> > +
> > +	spin_lock(&zsl->lock);
> > +	zsl->cnt++;
> 
> Shouldn't we check if the lock is write locked? Never mind, we keep the
> spinlock held on write locking.
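
For illustration, a de-duplicated __read_lock() built around that helper
could look like the sketch below. This is untested and only illustrative:
zspage_lock_map() is the hypothetical helper from the suggestion above,
the NULL fallback works because the lockdep annotations
(rwsem_acquire_read() and friends) compile to no-ops when
CONFIG_DEBUG_LOCK_ALLOC is off, so the argument is never evaluated, and
the trailing spin_unlock() is an assumption about how the quoted
(truncated) function continues:

#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define zspage_lock_map(zsl)	(&(zsl)->dep_map)
#else
#define zspage_lock_map(zsl)	NULL
#endif

static inline void __read_lock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	rwsem_acquire_read(zspage_lock_map(zsl), 0, 0, _RET_IP_);

	spin_lock(&zsl->lock);
	/*
	 * ->cnt cannot be ZS_PAGE_WRLOCKED here: a writer keeps
	 * zsl->lock held for the entire write-locked section, so a
	 * reader would still be spinning on spin_lock() above.
	 */
	zsl->cnt++;
	spin_unlock(&zsl->lock);
}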
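
And for completeness, here is one way the write side can work, which is
what makes the "never mind" above hold: the writer returns with
zsl->lock still held, so a concurrent reader blocks on spin_lock()
instead of having to check ->cnt for ZS_PAGE_WRLOCKED. Again an
untested sketch with hypothetical names (__write_trylock() and
__write_unlock()); the trylock semantics are an assumption, since the
quoted patch only shows the read path:

static inline bool __write_trylock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	spin_lock(&zsl->lock);
	if (zsl->cnt == ZS_PAGE_UNLOCKED) {
		zsl->cnt = ZS_PAGE_WRLOCKED;
		rwsem_acquire(zspage_lock_map(zsl), 0, 1, _RET_IP_);
		/* Keep zsl->lock held until __write_unlock(). */
		return true;
	}

	spin_unlock(&zsl->lock);
	return false;
}

static inline void __write_unlock(struct zspage *zspage)
{
	struct zspage_lock *zsl = &zspage->zsl;

	rwsem_release(zspage_lock_map(zsl), _RET_IP_);

	zsl->cnt = ZS_PAGE_UNLOCKED;
	spin_unlock(&zsl->lock);
}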