On Sun, Nov 17, 2024 at 12:09:28AM -0800, Suren Baghdasaryan wrote: > Back when per-vma locks were introduced, vm_lock was moved out of > vm_area_struct in [1] because of the performance regression caused by > false cacheline sharing. Recent investigation [2] revealed that the > regression is limited to a rather old Broadwell microarchitecture and > even there it can be mitigated by disabling adjacent cacheline > prefetching, see [3]. > Splitting a single logical structure into multiple ones leads to more > complicated management, extra pointer dereferences and overall less > maintainable code. When that split-away part is a lock, it complicates > things even further. With no performance benefits, there are no reasons > for this split. Merging the vm_lock back into vm_area_struct also allows > vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. > Move vm_lock back into vm_area_struct, aligning it at the cacheline > boundary and changing the cache to be cacheline-aligned as well. Thanks! > With kernel compiled using defconfig, this causes VMA memory consumption > to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: > > slabinfo before: > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vma_lock ... 40 102 1 : ... > vm_area_struct ... 160 51 2 : ... > > slabinfo after moving vm_lock: > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vm_area_struct ... 256 32 2 : ... > > Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, > which is 5.5MB per 100000 VMAs. Note that the size of this structure is > dependent on the kernel configuration and typically the original size is > higher than 160 bytes. Therefore these calculations are close to the > worst case scenario. A more realistic vm_area_struct usage before this > change is: > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vma_lock ... 40 102 1 : ... > vm_area_struct ... 176 46 2 : ... 
> > Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, > which is 3.9MB per 100000 VMAs. > This memory consumption growth can be addressed later by optimizing the > vm_lock. > > [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@xxxxxxxxxx/ > [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ > [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@xxxxxxxxxxxxxx/ > > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx> LGTM, and briefly tested in VM and looking good so: Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > --- > include/linux/mm.h | 28 ++++++++++-------- > include/linux/mm_types.h | 6 ++-- > kernel/fork.c | 49 ++++---------------------------- > tools/testing/vma/vma_internal.h | 33 +++++---------------- > 4 files changed, 32 insertions(+), 84 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 1ba2e480ae63..737c003b0a1e 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -684,6 +684,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} > #endif /* CONFIG_NUMA_BALANCING */ > > #ifdef CONFIG_PER_VMA_LOCK > +static inline void vma_lock_init(struct vm_area_struct *vma) > +{ > + init_rwsem(&vma->vm_lock.lock); > + vma->vm_lock_seq = UINT_MAX; > +} > + > /* > * Try to read-lock a vma. The function is allowed to occasionally yield false > * locked result to avoid performance overhead, in which case we fall back to > @@ -701,7 +707,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) > if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) > return false; > > - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) > + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) > return false; > > /* > @@ -716,7 +722,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) > * This pairs with RELEASE semantics in vma_end_write_all(). 
> */ > if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { > - up_read(&vma->vm_lock->lock); > + up_read(&vma->vm_lock.lock); > return false; > } > return true; > @@ -731,7 +737,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) > static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) > { > mmap_assert_locked(vma->vm_mm); > - down_read_nested(&vma->vm_lock->lock, subclass); > + down_read_nested(&vma->vm_lock.lock, subclass); > } > > /* > @@ -743,13 +749,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int > static inline void vma_start_read_locked(struct vm_area_struct *vma) > { > mmap_assert_locked(vma->vm_mm); > - down_read(&vma->vm_lock->lock); > + down_read(&vma->vm_lock.lock); > } > > static inline void vma_end_read(struct vm_area_struct *vma) > { > rcu_read_lock(); /* keeps vma alive till the end of up_read */ > - up_read(&vma->vm_lock->lock); > + up_read(&vma->vm_lock.lock); > rcu_read_unlock(); > } > > @@ -778,7 +784,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) > if (__is_vma_write_locked(vma, &mm_lock_seq)) > return; > > - down_write(&vma->vm_lock->lock); > + down_write(&vma->vm_lock.lock); > /* > * We should use WRITE_ONCE() here because we can have concurrent reads > * from the early lockless pessimistic check in vma_start_read(). > @@ -786,7 +792,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) > * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
> */ > WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); > - up_write(&vma->vm_lock->lock); > + up_write(&vma->vm_lock.lock); > } > > static inline void vma_assert_write_locked(struct vm_area_struct *vma) > @@ -798,7 +804,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) > > static inline void vma_assert_locked(struct vm_area_struct *vma) > { > - if (!rwsem_is_locked(&vma->vm_lock->lock)) > + if (!rwsem_is_locked(&vma->vm_lock.lock)) > vma_assert_write_locked(vma); > } > > @@ -831,6 +837,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, > > #else /* CONFIG_PER_VMA_LOCK */ > > +static inline void vma_lock_init(struct vm_area_struct *vma) {} > static inline bool vma_start_read(struct vm_area_struct *vma) > { return false; } > static inline void vma_end_read(struct vm_area_struct *vma) {} > @@ -865,10 +872,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) > > extern const struct vm_operations_struct vma_dummy_vm_ops; > > -/* > - * WARNING: vma_init does not initialize vma->vm_lock. > - * Use vm_area_alloc()/vm_area_free() if vma needs locking. > - */ > static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > { > memset(vma, 0, sizeof(*vma)); > @@ -877,6 +880,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > INIT_LIST_HEAD(&vma->anon_vma_chain); > vma_mark_detached(vma, false); > vma_numab_state_init(vma); > + vma_lock_init(vma); > } > > /* Use when VMA is not part of the VMA tree and needs no locking */ > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 80fef38d9d64..5c4bfdcfac72 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -716,8 +716,6 @@ struct vm_area_struct { > * slowpath. > */ > unsigned int vm_lock_seq; > - /* Unstable RCU readers are allowed to read this. 
*/ > - struct vma_lock *vm_lock; > #endif > > /* > @@ -770,6 +768,10 @@ struct vm_area_struct { > struct vma_numab_state *numab_state; /* NUMA Balancing state */ > #endif > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > +#ifdef CONFIG_PER_VMA_LOCK > + /* Unstable RCU readers are allowed to read this. */ > + struct vma_lock vm_lock ____cacheline_aligned_in_smp; > +#endif > } __randomize_layout; > > #ifdef CONFIG_NUMA > diff --git a/kernel/fork.c b/kernel/fork.c > index 0061cf2450ef..7823797e31d2 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep; > /* SLAB cache for mm_struct structures (tsk->mm) */ > static struct kmem_cache *mm_cachep; > > -#ifdef CONFIG_PER_VMA_LOCK > - > -/* SLAB cache for vm_area_struct.lock */ > -static struct kmem_cache *vma_lock_cachep; > - > -static bool vma_lock_alloc(struct vm_area_struct *vma) > -{ > - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); > - if (!vma->vm_lock) > - return false; > - > - init_rwsem(&vma->vm_lock->lock); > - vma->vm_lock_seq = UINT_MAX; > - > - return true; > -} > - > -static inline void vma_lock_free(struct vm_area_struct *vma) > -{ > - kmem_cache_free(vma_lock_cachep, vma->vm_lock); > -} > - > -#else /* CONFIG_PER_VMA_LOCK */ > - > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } > -static inline void vma_lock_free(struct vm_area_struct *vma) {} > - > -#endif /* CONFIG_PER_VMA_LOCK */ > - > struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) > { > struct vm_area_struct *vma; > @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) > return NULL; > > vma_init(vma, mm); > - if (!vma_lock_alloc(vma)) { > - kmem_cache_free(vm_area_cachep, vma); > - return NULL; > - } > > return vma; > } > @@ -496,10 +463,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > * will be reinitialized. 
> */ > data_race(memcpy(new, orig, sizeof(*new))); > - if (!vma_lock_alloc(new)) { > - kmem_cache_free(vm_area_cachep, new); > - return NULL; > - } > + vma_lock_init(new); > INIT_LIST_HEAD(&new->anon_vma_chain); > vma_numab_state_init(new); > dup_anon_vma_name(orig, new); > @@ -511,7 +475,6 @@ void __vm_area_free(struct vm_area_struct *vma) > { > vma_numab_state_free(vma); > free_anon_vma_name(vma); > - vma_lock_free(vma); > kmem_cache_free(vm_area_cachep, vma); > } > > @@ -522,7 +485,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) > vm_rcu); > > /* The vma should not be locked while being destroyed. */ > - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); > + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); > __vm_area_free(vma); > } > #endif > @@ -3168,11 +3131,9 @@ void __init proc_caches_init(void) > sizeof(struct fs_struct), 0, > SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, > NULL); > - > - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); > -#ifdef CONFIG_PER_VMA_LOCK > - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); > -#endif > + vm_area_cachep = KMEM_CACHE(vm_area_struct, > + SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| > + SLAB_ACCOUNT); > mmap_init(); > nsproxy_cache_init(); > } > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h > index 1d9fc97b8e80..11c2c38ca4e8 100644 > --- a/tools/testing/vma/vma_internal.h > +++ b/tools/testing/vma/vma_internal.h > @@ -230,10 +230,10 @@ struct vm_area_struct { > /* > * Can only be written (using WRITE_ONCE()) while holding both: > * - mmap_lock (in write mode) > - * - vm_lock->lock (in write mode) > + * - vm_lock.lock (in write mode) > * Can be read reliably while holding one of: > * - mmap_lock (in read or write mode) > - * - vm_lock->lock (in read or write mode) > + * - vm_lock.lock (in read or write mode) > * Can be read unreliably (using READ_ONCE()) for pessimistic bailout > * while holding nothing (except RCU to 
keep the VMA struct allocated). > * > @@ -242,7 +242,7 @@ struct vm_area_struct { > * slowpath. > */ > unsigned int vm_lock_seq; > - struct vma_lock *vm_lock; > + struct vma_lock vm_lock; > #endif > > /* > @@ -408,17 +408,10 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) > return mas_find(&vmi->mas, ULONG_MAX); > } > > -static inline bool vma_lock_alloc(struct vm_area_struct *vma) > +static inline void vma_lock_init(struct vm_area_struct *vma) > { > - vma->vm_lock = calloc(1, sizeof(struct vma_lock)); > - > - if (!vma->vm_lock) > - return false; > - > - init_rwsem(&vma->vm_lock->lock); > + init_rwsem(&vma->vm_lock.lock); > vma->vm_lock_seq = UINT_MAX; > - > - return true; > } > > static inline void vma_assert_write_locked(struct vm_area_struct *); > @@ -439,6 +432,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) > vma->vm_ops = &vma_dummy_vm_ops; > INIT_LIST_HEAD(&vma->anon_vma_chain); > vma_mark_detached(vma, false); > + vma_lock_init(vma); > } > > static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) > @@ -449,10 +443,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) > return NULL; > > vma_init(vma, mm); > - if (!vma_lock_alloc(vma)) { > - free(vma); > - return NULL; > - } > > return vma; > } > @@ -465,10 +455,7 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) > return NULL; > > memcpy(new, orig, sizeof(*new)); > - if (!vma_lock_alloc(new)) { > - free(new); > - return NULL; > - } > + vma_lock_init(new); > INIT_LIST_HEAD(&new->anon_vma_chain); > > return new; > @@ -638,14 +625,8 @@ static inline void mpol_put(struct mempolicy *) > { > } > > -static inline void vma_lock_free(struct vm_area_struct *vma) > -{ > - free(vma->vm_lock); > -} > - > static inline void __vm_area_free(struct vm_area_struct *vma) > { > - vma_lock_free(vma); > free(vma); > } > > -- > 2.47.0.338.g60cca15819-goog >