On Mon, Jan 16, 2023 at 6:07 AM Hillf Danton <hdanton@xxxxxxxx> wrote:
>
> On Mon, 9 Jan 2023 12:53:36 -0800 Suren Baghdasaryan <surenb@xxxxxxxxxx>
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -627,12 +627,16 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
> >          * mm->mm_lock_seq can't be concurrently modified.
> >          */
> >         mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> > -       if (vma->vm_lock_seq == mm_lock_seq)
> > +       if (vma->vm_lock->lock_seq == mm_lock_seq)
> >                 return;
>
> lock acquire for write to info lockdep.

Thanks for the review, Hillf!
Good idea. Will add in the next version.

> >
> > -       down_write(&vma->vm_lock->lock);
> > -       vma->vm_lock_seq = mm_lock_seq;
> > -       up_write(&vma->vm_lock->lock);
> > +       if (atomic_cmpxchg(&vma->vm_lock->count, 0, -1))
> > +               wait_event(vma->vm_mm->vma_writer_wait,
> > +                          atomic_cmpxchg(&vma->vm_lock->count, 0, -1) == 0);
> > +       vma->vm_lock->lock_seq = mm_lock_seq;
> > +       /* Write barrier to ensure lock_seq change is visible before count */
> > +       smp_wmb();
> > +       atomic_set(&vma->vm_lock->count, 0);
> >  }
> >
> >  /*
> > @@ -643,20 +647,28 @@ static inline void vma_write_lock(struct vm_area_struct *vma)
> >  static inline bool vma_read_trylock(struct vm_area_struct *vma)
> >  {
> >         /* Check before locking. A race might cause false locked result. */
> > -       if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > +       if (vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
> >                 return false;
>
> Add mb to pair with the above wmb like
>
>         if (READ_ONCE(vma->vm_lock->lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
>                 smp_acquire__after_ctrl_dep();
>                 return false;
>         }

The wmb above is to ensure the ordering between the updates of lock_seq
and vm_lock->count (lock_seq is updated first and vm_lock->count only
after that). The first access to vm_lock->count in this function is
atomic_inc_unless_negative(), and it is an atomic RMW operation with a
return value. According to the documentation, such functions are fully
ordered, therefore I think we already have an implicit full memory
barrier between the reads of lock_seq and vm_lock->count here. Am I
wrong?
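To spell out the ordering I am relying on, here is a small userspace
model of the two paths using C11 atomics. This is an illustration only:
the release fence stands in for smp_wmb(), the seq_cst RMW stands in
for the fully ordered atomic_inc_unless_negative(), the variables
mirror the count, lock_seq and mm_lock_seq fields in this patchset,
and the function names are made up:

/* Illustration only: a userspace model of the pairing, not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int count;        /* models vma->vm_lock->count */
static atomic_int lock_seq;     /* models vma->vm_lock->lock_seq */
static atomic_int mm_lock_seq;  /* models mm->mm_lock_seq */

/* Writer unlock path: publish lock_seq strictly before releasing count. */
static void writer_publish(int seq)
{
        atomic_store_explicit(&lock_seq, seq, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);        /* smp_wmb() */
        atomic_store_explicit(&count, 0, memory_order_relaxed);
}

/*
 * Reader path: the fully ordered RMW on count is the "mb" on this side.
 * If it observes the writer's store of 0, the later lock_seq read cannot
 * see the pre-update value.
 */
static bool reader_trylock(void)
{
        int c = atomic_load_explicit(&count, memory_order_relaxed);

        /* Models atomic_inc_unless_negative(). */
        do {
                if (c < 0)
                        return false;   /* write-locked */
        } while (!atomic_compare_exchange_weak_explicit(&count, &c, c + 1,
                                memory_order_seq_cst, memory_order_relaxed));

        if (atomic_load_explicit(&lock_seq, memory_order_relaxed) ==
            atomic_load_explicit(&mm_lock_seq, memory_order_relaxed)) {
                /* Write-locked for the current mm_lock_seq; undo and fail. */
                atomic_fetch_sub_explicit(&count, 1, memory_order_seq_cst);
                return false;
        }
        return true;
}

int main(void)
{
        writer_publish(1);              /* writer drops the vma lock */
        atomic_store(&mm_lock_seq, 2);  /* mm sequence moves on */
        return reader_trylock() ? 0 : 1;
}

If the RMW on count observes the writer's final store to count, the
release fence guarantees that the earlier lock_seq update is visible to
the recheck that follows, which is exactly the pairing in question.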
> >
> > -       if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > +       if (unlikely(!atomic_inc_unless_negative(&vma->vm_lock->count)))
> >                 return false;
> >
> > +       /* If atomic_t overflows, restore and fail to lock. */
> > +       if (unlikely(atomic_read(&vma->vm_lock->count) < 0)) {
> > +               if (atomic_dec_and_test(&vma->vm_lock->count))
> > +                       wake_up(&vma->vm_mm->vma_writer_wait);
> > +               return false;
> > +       }
> > +
> >         /*
> >          * Overflow might produce false locked result.
> >          * False unlocked result is impossible because we modify and check
> >          * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
> >          * modification invalidates all existing locks.
> >          */
> > -       if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> > -               up_read(&vma->vm_lock->lock);
> > +       if (unlikely(vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
> > +               if (atomic_dec_and_test(&vma->vm_lock->count))
> > +                       wake_up(&vma->vm_mm->vma_writer_wait);
> >                 return false;
> >         }
>
> Simpler way to detect write lock owner and count overflow like
>
>         int count = atomic_read(&vma->vm_lock->count);
>         for (;;) {
>                 int new = count + 1;
>
>                 if (count < 0 || new < 0)
>                         return false;
>
>                 new = atomic_cmpxchg(&vma->vm_lock->count, count, new);
>                 if (new == count)
>                         break;
>                 count = new;
>                 cpu_relax();
>         }
>
> (wake up waiting readers after taking the lock;
> but the write lock owner before this read trylock should be
> responsible for waking waiters up.)
>
> lock acquire for read.

This scheme might cause readers to wait, which is not an exact
replacement for down_read_trylock(). The requirement to wake up waiting
readers also complicates things, and since we can always fall back to
mmap_lock, that complication is unnecessary IMHO. I could use part of
your suggestion like this:

        int new = count + 1;

        if (count < 0 || new < 0)
                return false;

        new = atomic_cmpxchg(&vma->vm_lock->count, count, new);
        if (new != count)
                return false;

Compared to doing atomic_inc_unless_negative() first, as I did
originally, this scheme opens a slightly wider window for the writer to
get in the middle and cause the reader to fail locking, but I don't
think it would result in any visible regression.

> >         return true;
> > @@ -664,7 +676,8 @@ static inline bool vma_read_trylock(struct vm_area_struct *vma)
> >
> >  static inline void vma_read_unlock(struct vm_area_struct *vma)
> >  {
>
> lock release for read.

Ack.

> > -       up_read(&vma->vm_lock->lock);
> > +       if (atomic_dec_and_test(&vma->vm_lock->count))
> > +               wake_up(&vma->vm_mm->vma_writer_wait);
> >  }
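To make the comparison above concrete, the whole trylock with that
change would look roughly like this. This is an untested sketch only,
reusing the lock_seq, count and vma_writer_wait fields from this
patchset and keeping the lock_seq recheck and the wakeup on the failure
path as posted:

static inline bool vma_read_trylock(struct vm_area_struct *vma)
{
        int count, new;

        /* Check before locking. A race might cause false locked result. */
        if (vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
                return false;

        count = atomic_read(&vma->vm_lock->count);
        new = count + 1;

        /* Fail if write-locked (negative) or if the count would overflow. */
        if (count < 0 || new < 0)
                return false;

        /* Somebody else changed count in between; fall back to mmap_lock. */
        if (atomic_cmpxchg(&vma->vm_lock->count, count, new) != count)
                return false;

        /*
         * Overflow might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock->lock_seq under vma->vm_lock protection and
         * mm->mm_lock_seq modification invalidates all existing locks.
         */
        if (unlikely(vma->vm_lock->lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
                if (atomic_dec_and_test(&vma->vm_lock->count))
                        wake_up(&vma->vm_mm->vma_writer_wait);
                return false;
        }
        return true;
}

The only behavioral difference from the posted version is that any
change to count between the atomic_read() and the atomic_cmpxchg()
makes the reader give up and fall back to mmap_lock instead of
retrying.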