* Suren Baghdasaryan <surenb@xxxxxxxxxx> [241226 12:07]:
> rw_semaphore is a sizable structure of 40 bytes and consumes
> considerable space for each vm_area_struct. However vma_lock has
> two important specifics which can be used to replace rw_semaphore
> with a simpler structure:
> 1. Readers never wait. They try to take the vma_lock and fall back to
> mmap_lock if that fails.
> 2. Only one writer at a time will ever try to write-lock a vma_lock
> because writers first take mmap_lock in write mode.
> Because of these requirements, full rw_semaphore functionality is not
> needed and we can replace rw_semaphore and the vma->detached flag with
> a refcount (vm_refcnt).
> When vma is in detached state, vm_refcnt is 0 and only a call to
> vma_mark_attached() can take it out of this state. Note that unlike
> before, now we enforce both vma_mark_attached() and vma_mark_detached()
> to be done only after vma has been write-locked. vma_mark_attached()
> changes vm_refcnt to 1 to indicate that it has been attached to the vma
> tree. When a reader takes read lock, it increments vm_refcnt, unless the
> top usable bit of vm_refcnt (0x40000000) is set, indicating presence of
> a writer. When writer takes write lock, it both increments vm_refcnt and
> sets the top usable bit to indicate its presence. If there are readers,
> writer will wait using newly introduced mm->vma_writer_wait. Since all
> writers take mmap_lock in write mode first, there can be only one writer
> at a time. The last reader to release the lock will signal the writer
> to wake up.
> refcount might overflow if there are many competing readers, in which case
> read-locking will fail. Readers are expected to handle such failures.

I find the above a bit hard to parse.  What I understand is:
1. All accesses increment the ref count.
2. Readers cannot increment the ref count unless the writer bit is 0
   (no writer present).
3. Writers must wait for the ref count to reach 2 (the tree + writer
   reference) before proceeding.
4. Increment overflow must be handled by the readers.
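
For my own sanity, here is the reader side of that model as an untested
sketch built from the helpers this patch introduces.
vma_read_trylock_sketch() is my name, not something in the patch, and
this is only to check my understanding:

static bool vma_read_trylock_sketch(struct vm_area_struct *vma)
{
	int oldcnt;

	/*
	 * 0 means detached, and VMA_REF_LIMIT leaves room for the
	 * writer's +1, so overflow surfaces as a failed increment
	 * (point 4 above).
	 */
	if (!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
					     VMA_REF_LIMIT))
		return false;		/* caller falls back to mmap_lock */

	/* Point 2: back out if a writer had already set the high bit. */
	if (oldcnt & VMA_LOCK_OFFSET) {
		vma_refcount_put(vma);
		return false;
	}

	return true;
}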
>
> Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
> Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> ---
>  include/linux/mm.h               | 100 +++++++++++++++++++++----------
>  include/linux/mm_types.h         |  22 ++++---
>  kernel/fork.c                    |  13 ++--
>  mm/init-mm.c                     |   1 +
>  mm/memory.c                      |  68 +++++++++++++++++----
>  tools/testing/vma/linux/atomic.h |   5 ++
>  tools/testing/vma/vma_internal.h |  66 +++++++++++---------
>  7 files changed, 185 insertions(+), 90 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ea4c4228b125..99f4720d7e51 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -32,6 +32,7 @@
>  #include <linux/memremap.h>
>  #include <linux/slab.h>
>  #include <linux/cacheinfo.h>
> +#include <linux/rcuwait.h>
>
>  struct mempolicy;
>  struct anon_vma;
> @@ -697,12 +698,34 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
>  #endif /* CONFIG_NUMA_BALANCING */
>
>  #ifdef CONFIG_PER_VMA_LOCK
> -static inline void vma_lock_init(struct vm_area_struct *vma)
> +static inline void vma_lockdep_init(struct vm_area_struct *vma)
>  {
> -	init_rwsem(&vma->vm_lock.lock);
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	static struct lock_class_key lockdep_key;
> +
> +	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
> +#endif
> +}
> +
> +static inline void vma_init_lock(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> +	if (reset_refcnt)
> +		refcount_set(&vma->vm_refcnt, 0);
>  	vma->vm_lock_seq = UINT_MAX;
>  }
> +
> +static inline void vma_refcount_put(struct vm_area_struct *vma)
> +{
> +	int refcnt;
> +
> +	if (!__refcount_dec_and_test(&vma->vm_refcnt, &refcnt)) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +
> +		if (refcnt & VMA_LOCK_OFFSET)

Couldn't we only wake on refcnt == VMA_LOCK_OFFSET + 2?  Right now you
will wake on every departing reader, I think?  We know refcnt is only
going down if VMA_LOCK_OFFSET is set.

Also, maybe a #define for VMA_LOCK_WRITER_ONLY or some better name?

> +			rcuwait_wake_up(&vma->vm_mm->vma_writer_wait);
> +	}
> +}
> +
>  /*
>   * Try to read-lock a vma. The function is allowed to occasionally yield false
>   * locked result to avoid performance overhead, in which case we fall back to
> @@ -710,6 +733,8 @@ static inline void vma_lock_init(struct vm_area_struct *vma)
>   */
>  static inline bool vma_start_read(struct vm_area_struct *vma)
>  {
> +	int oldcnt;
> +
>  	/*
>  	 * Check before locking. A race might cause false locked result.
>  	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
> @@ -720,13 +745,20 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
>  		return false;
>
> -	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
> +
> +	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> +	/* Limit at VMA_REF_LIMIT to leave one count for a writer */
> +	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> +						      VMA_REF_LIMIT))) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
>  		return false;
> +	}
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>
>  	/*
> -	 * Overflow might produce false locked result.
> +	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
>  	 * False unlocked result is impossible because we modify and check
> -	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
> +	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
>  	 * modification invalidates all existing locks.
>  	 *
>  	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
> @@ -734,10 +766,12 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>  	 * after it has been unlocked.
>  	 * This pairs with RELEASE semantics in vma_end_write_all().
>  	 */
> -	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> -		up_read(&vma->vm_lock.lock);
> +	if (unlikely(oldcnt & VMA_LOCK_OFFSET ||
> +		     vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> +		vma_refcount_put(vma);
>  		return false;
>  	}
> +
>  	return true;
>  }
>
> @@ -749,8 +783,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
>   */
>  static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
>  {
> +	int oldcnt;
> +
>  	mmap_assert_locked(vma->vm_mm);
> -	down_read_nested(&vma->vm_lock.lock, subclass);
> +	rwsem_acquire_read(&vma->vmlock_dep_map, subclass, 0, _RET_IP_);
> +	/* Limit at VMA_REF_LIMIT to leave one count for a writer */
> +	if (unlikely(!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
> +						      VMA_REF_LIMIT))) {
> +		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +		return false;
> +	}
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
>  	return true;
>  }
>
> @@ -762,15 +805,13 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int
>   */
>  static inline bool vma_start_read_locked(struct vm_area_struct *vma)
>  {
> -	mmap_assert_locked(vma->vm_mm);
> -	down_read(&vma->vm_lock.lock);
> -	return true;
> +	return vma_start_read_locked_nested(vma, 0);
>  }
>
>  static inline void vma_end_read(struct vm_area_struct *vma)
>  {
>  	rcu_read_lock(); /* keeps vma alive till the end of up_read */
> -	up_read(&vma->vm_lock.lock);
> +	vma_refcount_put(vma);
>  	rcu_read_unlock();
>  }
>
> @@ -813,36 +854,33 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma)
>
>  static inline void vma_assert_locked(struct vm_area_struct *vma)
>  {
> -	if (!rwsem_is_locked(&vma->vm_lock.lock))
> +	if (refcount_read(&vma->vm_refcnt) <= 1)
>  		vma_assert_write_locked(vma);
>  }
>
> +/*
> + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
> + * assertions should be made either under mmap_write_lock or when the object
> + * has been isolated under mmap_write_lock, ensuring no competing writers.
> + */
>  static inline void vma_assert_attached(struct vm_area_struct *vma)
>  {
> -	VM_BUG_ON_VMA(vma->detached, vma);
> +	VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma);
>  }
>
>  static inline void vma_assert_detached(struct vm_area_struct *vma)
>  {
> -	VM_BUG_ON_VMA(!vma->detached, vma);
> +	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma);
>  }
>
>  static inline void vma_mark_attached(struct vm_area_struct *vma)
>  {
> -	vma->detached = false;
> -}
> -
> -static inline void vma_mark_detached(struct vm_area_struct *vma)
> -{
> -	/* When detaching vma should be write-locked */
>  	vma_assert_write_locked(vma);
> -	vma->detached = true;
> +	vma_assert_detached(vma);
> +	refcount_set(&vma->vm_refcnt, 1);
>  }
>
> -static inline bool is_vma_detached(struct vm_area_struct *vma)
> -{
> -	return vma->detached;
> -}
> +void vma_mark_detached(struct vm_area_struct *vma);
>
>  static inline void release_fault_lock(struct vm_fault *vmf)
>  {
> @@ -865,7 +903,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>
>  #else /* CONFIG_PER_VMA_LOCK */
>
> -static inline void vma_lock_init(struct vm_area_struct *vma) {}
> +static inline void vma_lockdep_init(struct vm_area_struct *vma) {}
> +static inline void vma_init_lock(struct vm_area_struct *vma, bool reset_refcnt) {}
>  static inline bool vma_start_read(struct vm_area_struct *vma)
>  		{ return false; }
>  static inline void vma_end_read(struct vm_area_struct *vma) {}
> @@ -908,12 +947,9 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	vma->vm_mm = mm;
>  	vma->vm_ops = &vma_dummy_vm_ops;
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	/* vma is not locked, can't use vma_mark_detached() */
> -	vma->detached = true;
> -#endif
>  	vma_numab_state_init(vma);
> -	vma_lock_init(vma);
> +	vma_lockdep_init(vma);
> +	vma_init_lock(vma, false);
>  }
>
>  /* Use when VMA is not part of the VMA tree and needs no locking */
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 6573d95f1d1e..b5312421dec6 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -19,6 +19,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/seqlock.h>
>  #include <linux/percpu_counter.h>
> +#include <linux/types.h>
>
>  #include <asm/mmu.h>
>
> @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
>  }
>  #endif
>
> -struct vma_lock {
> -	struct rw_semaphore lock;
> -};
> +#define VMA_LOCK_OFFSET	0x40000000
> +#define VMA_REF_LIMIT	(VMA_LOCK_OFFSET - 2)
>
>  struct vma_numab_state {
>  	/*
> @@ -709,19 +709,13 @@ struct vm_area_struct {
>  	};
>
>  #ifdef CONFIG_PER_VMA_LOCK
> -	/*
> -	 * Flag to indicate areas detached from the mm->mm_mt tree.
> -	 * Unstable RCU readers are allowed to read this.
> -	 */
> -	bool detached;
> -
>  	/*
>  	 * Can only be written (using WRITE_ONCE()) while holding both:
>  	 *  - mmap_lock (in write mode)
> -	 *  - vm_lock->lock (in write mode)
> +	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
>  	 * Can be read reliably while holding one of:
>  	 *  - mmap_lock (in read or write mode)
> -	 *  - vm_lock->lock (in read or write mode)
> +	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
>  	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
>  	 * while holding nothing (except RCU to keep the VMA struct allocated).
>  	 *
> @@ -784,7 +778,10 @@ struct vm_area_struct {
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
>  #ifdef CONFIG_PER_VMA_LOCK
>  	/* Unstable RCU readers are allowed to read this. */
> -	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> +	refcount_t vm_refcnt ____cacheline_aligned_in_smp;
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	struct lockdep_map vmlock_dep_map;
> +#endif
>  #endif
>  } __randomize_layout;
>
> @@ -919,6 +916,7 @@ struct mm_struct {
>  		 * by mmlist_lock
>  		 */
>  #ifdef CONFIG_PER_VMA_LOCK
> +		struct rcuwait vma_writer_wait;
>  		/*
>  		 * This field has lock-like semantics, meaning it is sometimes
>  		 * accessed with ACQUIRE/RELEASE semantics.
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d4c75428ccaf..7a0800d48112 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  	 * will be reinitialized.
>  	 */
>  	data_race(memcpy(new, orig, sizeof(*new)));
> -	vma_lock_init(new);
> +	vma_init_lock(new, true);
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
> -#ifdef CONFIG_PER_VMA_LOCK
> -	/* vma is not locked, can't use vma_mark_detached() */
> -	new->detached = true;
> -#endif
>  	vma_numab_state_init(new);
>  	dup_anon_vma_name(orig, new);
>
> @@ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>
>  void __vm_area_free(struct vm_area_struct *vma)
>  {
> +	/* The vma should be detached while being destroyed. */
> +	vma_assert_detached(vma);
>  	vma_numab_state_free(vma);
>  	free_anon_vma_name(vma);
>  	kmem_cache_free(vm_area_cachep, vma);
> @@ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
>  	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
>  						  vm_rcu);
>
> -	/* The vma should not be locked while being destroyed. */
> -	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
>  	__vm_area_free(vma);
>  }
>  #endif
> @@ -1223,6 +1219,9 @@ static inline void mmap_init_lock(struct mm_struct *mm)
>  {
>  	init_rwsem(&mm->mmap_lock);
>  	mm_lock_seqcount_init(mm);
> +#ifdef CONFIG_PER_VMA_LOCK
> +	rcuwait_init(&mm->vma_writer_wait);
> +#endif
>  }
>
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 6af3ad675930..4600e7605cab 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -40,6 +40,7 @@ struct mm_struct init_mm = {
>  	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  #ifdef CONFIG_PER_VMA_LOCK
> +	.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait),
>  	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq),
>  #endif
>  	.user_ns	= &init_user_ns,
> diff --git a/mm/memory.c b/mm/memory.c
> index 236fdecd44d6..2def47b5dff0 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6328,9 +6328,39 @@ struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
>  #endif
>
>  #ifdef CONFIG_PER_VMA_LOCK
> +static inline bool __vma_enter_locked(struct vm_area_struct *vma, unsigned int tgt_refcnt)
> +{
> +	/*
> +	 * If vma is detached then only vma_mark_attached() can raise the
> +	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
> +	 */
> +	if (!refcount_inc_not_zero(&vma->vm_refcnt))
> +		return false;

Can't the write lock overflow the ref count too?
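
To make the question concrete: I would have expected the writer's
increment to be bounded the same way the readers' is, something like
this untested sketch (assuming the limited helper is usable in this
context):

	int oldcnt;

	/*
	 * Untested: bound the writer's +1 the way the reader path does,
	 * so a pathological reader count cannot push the counter into
	 * the VMA_LOCK_OFFSET bit.
	 */
	if (!__refcount_inc_not_zero_limited(&vma->vm_refcnt, &oldcnt,
					     VMA_REF_LIMIT))
		return false;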
> +
> +	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
> +	/* vma is attached, set the writer present bit */
> +	refcount_add(VMA_LOCK_OFFSET, &vma->vm_refcnt);
> +	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
> +			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
> +			   TASK_UNINTERRUPTIBLE);
> +	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
> +
> +	return true;
> +}
> +
> +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
> +{
> +	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET + 1, &vma->vm_refcnt);
> +	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
> +}
> +
>  void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>  {
> -	down_write(&vma->vm_lock.lock);
> +	bool locked;
> +
> +	/* Wait until refcnt is (VMA_LOCK_OFFSET + 2) => attached with no readers */
> +	locked = __vma_enter_locked(vma, VMA_LOCK_OFFSET + 2);

Does it need to take a ref count at all?  Could we just set the write
bit and wait for it to become 1 instead?  That is, 1 would represent
detached or writer is about to attach/detach it.

If we do need it to be ref counted for the writer, we could set the
write bit and then wait for the ref to be 1 before incrementing it to 2
(rough sketch at the bottom of this mail)?  I think this would be safer
as we know there is only one writer and the readers can only decrease
the count after the write bit is set.

> +
>  	/*
>  	 * We should use WRITE_ONCE() here because we can have concurrent reads
>  	 * from the early lockless pessimistic check in vma_start_read().
> @@ -6338,10 +6368,36 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
>  	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>  	 */
>  	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> -	up_write(&vma->vm_lock.lock);
> +
> +	if (locked) {
> +		bool detached;
> +
> +		__vma_exit_locked(vma, &detached);
> +		VM_BUG_ON_VMA(detached, vma); /* vma should remain attached */
> +	}
>  }
>  EXPORT_SYMBOL_GPL(__vma_start_write);
>
> +void vma_mark_detached(struct vm_area_struct *vma)
> +{
> +	vma_assert_write_locked(vma);
> +	vma_assert_attached(vma);
> +
> +	/* We are the only writer, so no need to use vma_refcount_put(). */
> +	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
> +		/*
> +		 * Wait until refcnt is (VMA_LOCK_OFFSET + 1) => detached with
> +		 * no readers
> +		 */
> +		if (__vma_enter_locked(vma, VMA_LOCK_OFFSET + 1)) {
> +			bool detached;
> +
> +			__vma_exit_locked(vma, &detached);
> +			VM_BUG_ON_VMA(!detached, vma);
> +		}
> +	}
> +}
> +
>  /*
>   * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
>   * stable and not isolated. If the VMA is not found or is being modified the
> @@ -6354,7 +6410,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  	struct vm_area_struct *vma;
>
>  	rcu_read_lock();
> -retry:
>  	vma = mas_walk(&mas);
>  	if (!vma)
>  		goto inval;
> @@ -6362,13 +6417,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  	if (!vma_start_read(vma))
>  		goto inval;
>
> -	/* Check if the VMA got isolated after we found it */
> -	if (is_vma_detached(vma)) {
> -		vma_end_read(vma);
> -		count_vm_vma_lock_event(VMA_LOCK_MISS);
> -		/* The area was replaced with another one */
> -		goto retry;
> -	}
>  	/*
>  	 * At this point, we have a stable reference to a VMA: The VMA is
>  	 * locked and we know it hasn't already been isolated.
> diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h
> index e01f66f98982..2e2021553196 100644
> --- a/tools/testing/vma/linux/atomic.h
> +++ b/tools/testing/vma/linux/atomic.h
> @@ -9,4 +9,9 @@
>  #define atomic_set(x, y) do {} while (0)
>  #define U8_MAX UCHAR_MAX
>
> +#ifndef atomic_cmpxchg_relaxed
> +#define atomic_cmpxchg_relaxed uatomic_cmpxchg
> +#define atomic_cmpxchg_release uatomic_cmpxchg
> +#endif /* atomic_cmpxchg_relaxed */
> +
>  #endif /* _LINUX_ATOMIC_H */
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 2a624f9304da..1e8cd2f013fa 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -25,7 +25,7 @@
>  #include <linux/maple_tree.h>
>  #include <linux/mm.h>
>  #include <linux/rbtree.h>
> -#include <linux/rwsem.h>
> +#include <linux/refcount.h>
>
>  extern unsigned long stack_guard_gap;
>  #ifdef CONFIG_MMU
> @@ -132,10 +132,6 @@ typedef __bitwise unsigned int vm_fault_t;
>   */
>  #define pr_warn_once pr_err
>
> -typedef struct refcount_struct {
> -	atomic_t refs;
> -} refcount_t;
> -
>  struct kref {
>  	refcount_t refcount;
>  };
> @@ -228,15 +224,12 @@ struct mm_struct {
>  	unsigned long def_flags;
>  };
>
> -struct vma_lock {
> -	struct rw_semaphore lock;
> -};
> -
> -
>  struct file {
>  	struct address_space *f_mapping;
>  };
>
> +#define VMA_LOCK_OFFSET	0x40000000
> +
>  struct vm_area_struct {
>  	/* The first cache line has the info for VMA tree walking. */
>
> @@ -264,16 +257,13 @@ struct vm_area_struct {
>  	};
>
>  #ifdef CONFIG_PER_VMA_LOCK
> -	/* Flag to indicate areas detached from the mm->mm_mt tree */
> -	bool detached;
> -
>  	/*
>  	 * Can only be written (using WRITE_ONCE()) while holding both:
>  	 *  - mmap_lock (in write mode)
> -	 *  - vm_lock.lock (in write mode)
> +	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
>  	 * Can be read reliably while holding one of:
>  	 *  - mmap_lock (in read or write mode)
> -	 *  - vm_lock.lock (in read or write mode)
> +	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
>  	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
>  	 * while holding nothing (except RCU to keep the VMA struct allocated).
>  	 *
> @@ -282,7 +272,6 @@ struct vm_area_struct {
>  	 * slowpath.
>  	 */
>  	unsigned int vm_lock_seq;
> -	struct vma_lock vm_lock;
>  #endif
>
>  	/*
> @@ -335,6 +324,10 @@ struct vm_area_struct {
>  	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	/* Unstable RCU readers are allowed to read this. */
> +	refcount_t vm_refcnt;
> +#endif
>  } __randomize_layout;
>
>  struct vm_fault {};
> @@ -459,23 +452,41 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
>  	return mas_find(&vmi->mas, ULONG_MAX);
>  }
>
> -static inline void vma_lock_init(struct vm_area_struct *vma)
> +/*
> + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
> + * assertions should be made either under mmap_write_lock or when the object
> + * has been isolated under mmap_write_lock, ensuring no competing writers.
> + */
> +static inline void vma_assert_attached(struct vm_area_struct *vma)
>  {
> -	init_rwsem(&vma->vm_lock.lock);
> -	vma->vm_lock_seq = UINT_MAX;
> +	VM_BUG_ON_VMA(!refcount_read(&vma->vm_refcnt), vma);
>  }
>
> -static inline void vma_mark_attached(struct vm_area_struct *vma)
> +static inline void vma_assert_detached(struct vm_area_struct *vma)
>  {
> -	vma->detached = false;
> +	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt), vma);
>  }
>
>  static inline void vma_assert_write_locked(struct vm_area_struct *);
> +static inline void vma_mark_attached(struct vm_area_struct *vma)
> +{
> +	vma_assert_write_locked(vma);
> +	vma_assert_detached(vma);
> +	refcount_set(&vma->vm_refcnt, 1);
> +}
> +
>  static inline void vma_mark_detached(struct vm_area_struct *vma)
>  {
> -	/* When detaching vma should be write-locked */
>  	vma_assert_write_locked(vma);
> -	vma->detached = true;
> +	vma_assert_attached(vma);
> +
> +	/* We are the only writer, so no need to use vma_refcount_put(). */
> +	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
> +		/*
> +		 * Reader must have temporarily raised vm_refcnt but it will
> +		 * drop it without using the vma since vma is write-locked.
> +		 */
> +	}
>  }
>
>  extern const struct vm_operations_struct vma_dummy_vm_ops;
> @@ -488,9 +499,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
>  	vma->vm_mm = mm;
>  	vma->vm_ops = &vma_dummy_vm_ops;
>  	INIT_LIST_HEAD(&vma->anon_vma_chain);
> -	/* vma is not locked, can't use vma_mark_detached() */
> -	vma->detached = true;
> -	vma_lock_init(vma);
> +	vma->vm_lock_seq = UINT_MAX;
>  }
>
>  static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> @@ -513,10 +522,9 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
>  		return NULL;
>
>  	memcpy(new, orig, sizeof(*new));
> -	vma_lock_init(new);
> +	refcount_set(&new->vm_refcnt, 0);
> +	new->vm_lock_seq = UINT_MAX;
>  	INIT_LIST_HEAD(&new->anon_vma_chain);
> -	/* vma is not locked, can't use vma_mark_detached() */
> -	new->detached = true;
>
>  	return new;
>  }
> --
> 2.47.1.613.gc27f4b7a9f-goog
>
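
For reference, this is roughly what I meant above by setting the write
bit before taking the writer's reference.  Untested, it changes the
wait target that __vma_enter_locked() uses, and
__vma_enter_locked_sketch() is my name; it also assumes the vma is
already known to be attached on entry:

static bool __vma_enter_locked_sketch(struct vm_area_struct *vma)
{
	/*
	 * Publish the writer first; new readers now fail their
	 * VMA_LOCK_OFFSET check and back off to mmap_lock.
	 */
	refcount_add(VMA_LOCK_OFFSET, &vma->vm_refcnt);

	/*
	 * Only readers remain and they can only decrement, so wait for
	 * them to drain down to the bare tree reference.
	 */
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == VMA_LOCK_OFFSET + 1,
			   TASK_UNINTERRUPTIBLE);

	/* Single writer, no readers: safe to take our reference now. */
	refcount_inc(&vma->vm_refcnt);
	return true;
}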