On Tue, Sep 6, 2022 at 6:47 AM Laurent Dufour <ldufour@xxxxxxxxxxxxx> wrote:
>
> Le 01/09/2022 à 19:34, Suren Baghdasaryan a écrit :
> > Introduce a per-VMA rw_semaphore to be used during page fault handling
> > instead of mmap_lock. Because there are cases when multiple VMAs need
> > to be exclusively locked during VMA tree modifications, instead of the
> > usual lock/unlock pattern we mark a VMA as locked by taking per-VMA lock
> > exclusively and setting vma->lock_seq to the current mm->lock_seq. When
> > mmap_write_lock holder is done with all modifications and drops mmap_lock,
> > it will increment mm->lock_seq, effectively unlocking all VMAs marked as
> > locked.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> Despite a minor comment below,
>
> Reviewed-by: Laurent Dufour <laurent.dufour@xxxxxxxxxx>

Thanks for the reviews, Laurent! I'll need some time to double-check all
the VMA locking locations that you spotted as potentially unnecessary.
Admittedly, I was a bit paranoid when writing this patchset and tried not
to miss any potential race, so some of those locks might indeed be
unnecessary. I will reply to each of your comments once I confirm whether
locking is needed in each case.
Thanks,
Suren.

>
> > ---
> >  include/linux/mm.h        | 78 +++++++++++++++++++++++++++++++++++++++
> >  include/linux/mm_types.h  |  7 ++++
> >  include/linux/mmap_lock.h | 13 +++++++
> >  kernel/fork.c             |  4 ++
> >  mm/init-mm.c              |  3 ++
> >  5 files changed, 105 insertions(+)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 7d322a979455..476bf936c5f0 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -611,6 +611,83 @@ struct vm_operations_struct {
> >                                          unsigned long addr);
> >  };
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_init_lock(struct vm_area_struct *vma)
> > +{
> > +        init_rwsem(&vma->lock);
> > +        vma->vm_lock_seq = -1;
> > +}
> > +
> > +static inline void vma_mark_locked(struct vm_area_struct *vma)
> > +{
> > +        int mm_lock_seq;
> > +
> > +        mmap_assert_write_locked(vma->vm_mm);
> > +
> > +        /*
> > +         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> > +         * mm->mm_lock_seq can't be concurrently modified.
> > +         */
> > +        mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
> > +        if (vma->vm_lock_seq == mm_lock_seq)
> > +                return;
> > +
> > +        down_write(&vma->lock);
> > +        vma->vm_lock_seq = mm_lock_seq;
> > +        up_write(&vma->lock);
> > +}
> > +
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +{
> > +        if (unlikely(down_read_trylock(&vma->lock) == 0))
> > +                return false;
> > +
> > +        /*
> > +         * Overflow might produce false locked result but it's not critical.
>
> It might be good to make it explicit here that in the case of a false
> locked result, the caller is assumed to fall back to read-locking the mm
> entirely before doing its change relative to that VMA.

Ack.
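
Agreed, I'll spell that out in the comment in the next version. To make the
expectation concrete for other reviewers, the caller is supposed to do
roughly the following (an illustrative sketch only, not code from this
series; the wrapper function is made up for this email and the lockless VMA
lookup it relies on comes in a later patch):

static void fault_example(struct mm_struct *mm, struct vm_area_struct *vma,
                          unsigned long address)
{
        if (vma && vma_read_trylock(vma)) {
                /* Fault handled under vma->lock only, no mmap_lock taken. */
                (void)handle_mm_fault(vma, address, FAULT_FLAG_USER, NULL);
                vma_read_unlock(vma);
                return;
        }

        /*
         * A false "locked" result ends up here: fall back to read-locking
         * the mm entirely, i.e. today's fault handling path.
         */
        mmap_read_lock(mm);
        vma = find_vma(mm, address);
        if (vma)
                (void)handle_mm_fault(vma, address, FAULT_FLAG_USER, NULL);
        mmap_read_unlock(mm);
}

So a false "locked" result only costs us a fallback to the mmap_lock path;
it can never result in a missed write lock.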

> > +         * False unlocked result is critical but is impossible because we
> > +         * modify and check vma->vm_lock_seq under vma->lock protection and
> > +         * mm->mm_lock_seq modification invalidates all existing locks.
> > +         */
> > +        if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
> > +                up_read(&vma->lock);
> > +                return false;
> > +        }
> > +        return true;
> > +}
> > +
> > +static inline void vma_read_unlock(struct vm_area_struct *vma)
> > +{
> > +        up_read(&vma->lock);
> > +}
> > +
> > +static inline void vma_assert_locked(struct vm_area_struct *vma)
> > +{
> > +        lockdep_assert_held(&vma->lock);
> > +        VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
> > +}
> > +
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos)
> > +{
> > +        mmap_assert_write_locked(vma->vm_mm);
> > +        /*
> > +         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> > +         * mm->mm_lock_seq can't be concurrently modified.
> > +         */
> > +        VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
> > +}
> > +
> > +#else /* CONFIG_PER_VMA_LOCK */
> > +
> > +static inline void vma_init_lock(struct vm_area_struct *vma) {}
> > +static inline void vma_mark_locked(struct vm_area_struct *vma) {}
> > +static inline bool vma_read_trylock(struct vm_area_struct *vma)
> > +                { return false; }
> > +static inline void vma_read_unlock(struct vm_area_struct *vma) {}
> > +static inline void vma_assert_locked(struct vm_area_struct *vma) {}
> > +static inline void vma_assert_write_locked(struct vm_area_struct *vma, int pos) {}
> > +
> > +#endif /* CONFIG_PER_VMA_LOCK */
> > +
> >  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >  {
> >          static const struct vm_operations_struct dummy_vm_ops = {};
> > @@ -619,6 +696,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> >          vma->vm_mm = mm;
> >          vma->vm_ops = &dummy_vm_ops;
> >          INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +        vma_init_lock(vma);
> >  }
> >
> >  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index bed25ef7c994..6a03f59c1e78 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -486,6 +486,10 @@ struct vm_area_struct {
> >          struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
> >  #endif
> >          struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +        struct rw_semaphore lock;
> > +        int vm_lock_seq;
> > +#endif
> >  } __randomize_layout;
> >
> >  struct kioctx_table;
> > @@ -567,6 +571,9 @@ struct mm_struct {
> >                                            * init_mm.mmlist, and are protected
> >                                            * by mmlist_lock
> >                                            */
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +        int mm_lock_seq;
> > +#endif
> >
> >
> >          unsigned long hiwater_rss; /* High-watermark of RSS usage */
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index e49ba91bb1f0..a391ae226564 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
> >          VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
> >  }
> >
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +static inline void vma_mark_unlocked_all(struct mm_struct *mm)
> > +{
> > +        mmap_assert_write_locked(mm);
> > +        /* No races during update due to exclusive mmap_lock being held */
> > +        WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
> > +}
> > +#else
> > +static inline void vma_mark_unlocked_all(struct mm_struct *mm) {}
> > +#endif
> > +
> >  static inline void mmap_init_lock(struct mm_struct *mm)
> >  {
> >          init_rwsem(&mm->mmap_lock);
> > @@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
> >  static inline void mmap_write_unlock(struct mm_struct *mm)
> >  {
> >          __mmap_lock_trace_released(mm, true);
> > +        vma_mark_unlocked_all(mm);
> >          up_write(&mm->mmap_lock);
> >  }
> >
> >  static inline void mmap_write_downgrade(struct mm_struct *mm)
> >  {
> >          __mmap_lock_trace_acquire_returned(mm, false, true);
> > +        vma_mark_unlocked_all(mm);
> >          downgrade_write(&mm->mmap_lock);
> >  }
> >
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index 614872438393..bfab31ecd11e 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -475,6 +475,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> >           */
> >          *new = data_race(*orig);
> >          INIT_LIST_HEAD(&new->anon_vma_chain);
> > +        vma_init_lock(new);
> >          new->vm_next = new->vm_prev = NULL;
> >          dup_anon_vma_name(orig, new);
> >  }
> > @@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> >          seqcount_init(&mm->write_protect_seq);
> >          mmap_init_lock(mm);
> >          INIT_LIST_HEAD(&mm->mmlist);
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +        WRITE_ONCE(mm->mm_lock_seq, 0);
> > +#endif
> >          mm_pgtables_bytes_init(mm);
> >          mm->map_count = 0;
> >          mm->locked_vm = 0;
> > diff --git a/mm/init-mm.c b/mm/init-mm.c
> > index fbe7844d0912..8399f90d631c 100644
> > --- a/mm/init-mm.c
> > +++ b/mm/init-mm.c
> > @@ -37,6 +37,9 @@ struct mm_struct init_mm = {
> >          .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
> >          .arg_lock       =  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
> >          .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +        .mm_lock_seq = 0,
> > +#endif
> >          .user_ns        = &init_user_ns,
> >          .cpu_bitmap     = CPU_BITS_NONE,
> >  #ifdef CONFIG_IOMMU_SVA
>
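
One more note to make the scheme from the commit message concrete for other
reviewers: the write side never calls a per-VMA unlock. A rough sketch of
the expected usage (illustrative only; the function below is made up for
this email and is not part of the series):

static void vma_write_example(struct mm_struct *mm, struct vm_area_struct *vma)
{
        mmap_write_lock(mm);
        /* Readers now fail vma_read_trylock() on this VMA. */
        vma_mark_locked(vma);

        /* ... modify the VMA or the VMA tree ... */

        /*
         * Dropping mmap_lock calls vma_mark_unlocked_all(): the
         * mm->mm_lock_seq increment there releases every VMA marked
         * during this write section at once.
         */
        mmap_write_unlock(mm);
}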