On Sun, Oct 27, 2024 at 5:57 PM Andrii Nakryiko
<andrii.nakryiko@xxxxxxxxx> wrote:
>
> On Thu, Oct 24, 2024 at 1:52 PM Suren Baghdasaryan <surenb@xxxxxxxxxx> wrote:
> >
> > Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock
> > variants to increment it, in line with the usual seqcount usage pattern.
> > This lets us check whether the mmap_lock is write-locked by checking
> > mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be
> > used when implementing mmap_lock speculation functions.
> > As a result vm_lock_seq is also changed to be unsigned to match the type
> > of mm_lock_seq.sequence.
> >
> > Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> > ---
> > Applies over mm-unstable
> >
> > This conversion was discussed at [1] and these patches will likely be
> > incorporated into the next version of Andrii's patchset.
>
> I got a notification that this patch set was applied to mm-unstable by
> Andrew. But I was wondering if Andrew and Peter would agree to move
> the patches into tip's perf/core branch, given this is a dependency of
> my pending uprobe series ([0]) and, as far as I'm aware, there is no
> urgent need for this API in mm tree(s). If we route this through
> mm-unstable, we either need a stable tag from mm tree for Peter to
> merge into perf/core (a bit of a hassle for both Andrew and Peter, I'm
> sure), or I'd have to wait for 5-6 weeks until after the next merge window
> closes, which would be a huge bummer, given I'd been at this for a
> while already with basically done patches, and would prefer to get my
> changes in sooner.
>
> So, I'd very much prefer to just route these changes through
> perf/core, if mm folks don't oppose this? In fact, I'll go ahead and
> send my patches with Suren's patches included with the assumption
> that we can reroute all this. Thanks for understanding!
>
> P.S. And yeah, Suren's patches apply cleanly to perf/core just as
> well, I checked.

Sent all of that as v4, see [2]

[2] https://lore.kernel.org/linux-trace-kernel/20241028010818.2487581-1-andrii@xxxxxxxxxx/

>
> [0] https://lore.kernel.org/linux-trace-kernel/20241010205644.3831427-1-andrii@xxxxxxxxxx/
>
> > The issue of the seqcount_t.sequence being an unsigned rather than
> > unsigned long will be addressed separately in collaboration with Jann Horn.
> >
> > [1] https://lore.kernel.org/all/20241010205644.3831427-2-andrii@xxxxxxxxxx/
> >
> >  include/linux/mm.h               | 12 +++----
> >  include/linux/mm_types.h         |  7 ++--
> >  include/linux/mmap_lock.h        | 58 +++++++++++++++++++++-----------
> >  kernel/fork.c                    |  5 +--
> >  mm/init-mm.c                     |  2 +-
> >  tools/testing/vma/vma.c          |  4 +--
> >  tools/testing/vma/vma_internal.h |  4 +--
> >  7 files changed, 56 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 4ef8cf1043f1..77644118b200 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -698,7 +698,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >          * we don't rely on for anything - the mm_lock_seq read against which we
> >          * need ordering is below.
> >          */
> > -       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq))
> > +       if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
> >                 return false;
> >
> >         if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
> > @@ -715,7 +715,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
> >          * after it has been unlocked.
> >          * This pairs with RELEASE semantics in vma_end_write_all().
> >          */
> > -       if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) {
> > +       if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
> >                 up_read(&vma->vm_lock->lock);
> >                 return false;
> >         }
> > @@ -730,7 +730,7 @@ static inline void vma_end_read(struct vm_area_struct *vma)
> >  }
> >
> >  /* WARNING! Can only be used if mmap_lock is expected to be write-locked */
> > -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> > +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
> >  {
> >         mmap_assert_write_locked(vma->vm_mm);
> >
> > @@ -738,7 +738,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> >          * current task is holding mmap_write_lock, both vma->vm_lock_seq and
> >          * mm->mm_lock_seq can't be concurrently modified.
> >          */
> > -       *mm_lock_seq = vma->vm_mm->mm_lock_seq;
> > +       *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
> >         return (vma->vm_lock_seq == *mm_lock_seq);
> >  }
> >
> > @@ -749,7 +749,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq)
> >   */
> >  static inline void vma_start_write(struct vm_area_struct *vma)
> >  {
> > -       int mm_lock_seq;
> > +       unsigned int mm_lock_seq;
> >
> >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> >                 return;
> > @@ -767,7 +767,7 @@ static inline void vma_start_write(struct vm_area_struct *vma)
> >
> >  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
> >  {
> > -       int mm_lock_seq;
> > +       unsigned int mm_lock_seq;
> >
> >         VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
> >  }
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index ff8627acbaa7..80fef38d9d64 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -715,7 +715,7 @@ struct vm_area_struct {
> >          * counter reuse can only lead to occasional unnecessary use of the
> >          * slowpath.
> >          */
> > -       int vm_lock_seq;
> > +       unsigned int vm_lock_seq;
> >         /* Unstable RCU readers are allowed to read this. */
> >         struct vma_lock *vm_lock;
> >  #endif
> > @@ -887,6 +887,9 @@ struct mm_struct {
> >                  * Roughly speaking, incrementing the sequence number is
> >                  * equivalent to releasing locks on VMAs; reading the sequence
> >                  * number can be part of taking a read lock on a VMA.
> > +                * Incremented every time mmap_lock is write-locked/unlocked.
> > +                * Initialized to 0, therefore odd values indicate mmap_lock
> > +                * is write-locked and even values that it's released.
> >                  *
> >                  * Can be modified under write mmap_lock using RELEASE
> >                  * semantics.
> > @@ -895,7 +898,7 @@ struct mm_struct {
> >                  * Can be read with ACQUIRE semantics if not holding write
> >                  * mmap_lock.
> >                  */
> > -               int mm_lock_seq;
> > +               seqcount_t mm_lock_seq;
> >  #endif
> >
> >
> > diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> > index de9dc20b01ba..6b3272686860 100644
> > --- a/include/linux/mmap_lock.h
> > +++ b/include/linux/mmap_lock.h
> > @@ -71,39 +71,38 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm)
> >  }
> >
> >  #ifdef CONFIG_PER_VMA_LOCK
> > -/*
> > - * Drop all currently-held per-VMA locks.
> > - * This is called from the mmap_lock implementation directly before releasing
> > - * a write-locked mmap_lock (or downgrading it to read-locked).
> > - * This should normally NOT be called manually from other places.
> > - * If you want to call this manually anyway, keep in mind that this will release
> > - * *all* VMA write locks, including ones from further up the stack.
> > - */
> > -static inline void vma_end_write_all(struct mm_struct *mm)
> > +static inline void mm_lock_seqcount_init(struct mm_struct *mm)
> >  {
> > -       mmap_assert_write_locked(mm);
> > -       /*
> > -        * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
> > -        * mmap_lock being held.
> > -        * We need RELEASE semantics here to ensure that preceding stores into
> > -        * the VMA take effect before we unlock it with this store.
> > -        * Pairs with ACQUIRE semantics in vma_start_read().
> > -        */
> > -       smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
> > +       seqcount_init(&mm->mm_lock_seq);
> > +}
> > +
> > +static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
> > +{
> > +       do_raw_write_seqcount_begin(&mm->mm_lock_seq);
> > +}
> > +
> > +static inline void mm_lock_seqcount_end(struct mm_struct *mm)
> > +{
> > +       do_raw_write_seqcount_end(&mm->mm_lock_seq);
> >  }
> > +
> >  #else
> > -static inline void vma_end_write_all(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
> > +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}
> >  #endif
> >
> >  static inline void mmap_init_lock(struct mm_struct *mm)
> >  {
> >         init_rwsem(&mm->mmap_lock);
> > +       mm_lock_seqcount_init(mm);
> >  }
> >
> >  static inline void mmap_write_lock(struct mm_struct *mm)
> >  {
> >         __mmap_lock_trace_start_locking(mm, true);
> >         down_write(&mm->mmap_lock);
> > +       mm_lock_seqcount_begin(mm);
> >         __mmap_lock_trace_acquire_returned(mm, true, true);
> >  }
> >
> > @@ -111,6 +110,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
> >  {
> >         __mmap_lock_trace_start_locking(mm, true);
> >         down_write_nested(&mm->mmap_lock, subclass);
> > +       mm_lock_seqcount_begin(mm);
> >         __mmap_lock_trace_acquire_returned(mm, true, true);
> >  }
> >
> > @@ -120,10 +120,30 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm)
> >
> >         __mmap_lock_trace_start_locking(mm, true);
> >         ret = down_write_killable(&mm->mmap_lock);
> > +       if (!ret)
> > +               mm_lock_seqcount_begin(mm);
> >         __mmap_lock_trace_acquire_returned(mm, true, ret == 0);
> >         return ret;
> >  }
> >
> > +/*
> > + * Drop all currently-held per-VMA locks.
> > + * This is called from the mmap_lock implementation directly before releasing
> > + * a write-locked mmap_lock (or downgrading it to read-locked).
> > + * This should normally NOT be called manually from other places.
> > + * If you want to call this manually anyway, keep in mind that this will release
> > + * *all* VMA write locks, including ones from further up the stack.
> > + */
> > +static inline void vma_end_write_all(struct mm_struct *mm)
> > +{
> > +       mmap_assert_write_locked(mm);
> > +       /*
> > +        * Nobody can concurrently modify mm->mm_lock_seq due to exclusive
> > +        * mmap_lock being held.
> > +        */
> > +       mm_lock_seqcount_end(mm);
> > +}
> > +
> >  static inline void mmap_write_unlock(struct mm_struct *mm)
> >  {
> >         __mmap_lock_trace_released(mm, true);
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index fd528fb5e305..0cae6fc651f0 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -447,7 +447,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
> >                 return false;
> >
> >         init_rwsem(&vma->vm_lock->lock);
> > -       vma->vm_lock_seq = -1;
> > +       vma->vm_lock_seq = UINT_MAX;
> >
> >         return true;
> >  }
> > @@ -1260,9 +1260,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> >         seqcount_init(&mm->write_protect_seq);
> >         mmap_init_lock(mm);
> >         INIT_LIST_HEAD(&mm->mmlist);
> > -#ifdef CONFIG_PER_VMA_LOCK
> > -       mm->mm_lock_seq = 0;
> > -#endif
> >         mm_pgtables_bytes_init(mm);
> >         mm->map_count = 0;
> >         mm->locked_vm = 0;
> > diff --git a/mm/init-mm.c b/mm/init-mm.c
> > index 24c809379274..6af3ad675930 100644
> > --- a/mm/init-mm.c
> > +++ b/mm/init-mm.c
> > @@ -40,7 +40,7 @@ struct mm_struct init_mm = {
> >         .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
> >         .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
> >  #ifdef CONFIG_PER_VMA_LOCK
> > -       .mm_lock_seq = 0,
> > +       .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
> >  #endif
> >         .user_ns = &init_user_ns,
> >         .cpu_bitmap = CPU_BITS_NONE,
> > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > index 8fab5e13c7c3..9bcf1736bf18 100644
> > --- a/tools/testing/vma/vma.c
> > +++ b/tools/testing/vma/vma.c
> > @@ -89,7 +89,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> >          * begun. Linking to the tree will have caused this to be incremented,
> >          * which means we will get a false positive otherwise.
> >          */
> > -       vma->vm_lock_seq = -1;
> > +       vma->vm_lock_seq = UINT_MAX;
> >
> >         return vma;
> >  }
> > @@ -214,7 +214,7 @@ static bool vma_write_started(struct vm_area_struct *vma)
> >         int seq = vma->vm_lock_seq;
> >
> >         /* We reset after each check. */
> > -       vma->vm_lock_seq = -1;
> > +       vma->vm_lock_seq = UINT_MAX;
> >
> >         /* The vma_start_write() stub simply increments this value. */
> >         return seq > -1;
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index e76ff579e1fd..1d9fc97b8e80 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -241,7 +241,7 @@ struct vm_area_struct {
> >          * counter reuse can only lead to occasional unnecessary use of the
> >          * slowpath.
> >          */
> > -       int vm_lock_seq;
> > +       unsigned int vm_lock_seq;
> >         struct vma_lock *vm_lock;
> >  #endif
> >
> > @@ -416,7 +416,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma)
> >                 return false;
> >
> >         init_rwsem(&vma->vm_lock->lock);
> > -       vma->vm_lock_seq = -1;
> > +       vma->vm_lock_seq = UINT_MAX;
> >
> >         return true;
> >  }
> >
> > base-commit: 9c111059234a949a4d3442a413ade19cc65ab927
> > --
> > 2.47.0.163.g1226f6d8fa-goog
> >
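
To make the intended usage concrete for anyone skimming the thread: the sketch
below is illustrative only and is not part of the patch above. The helper names
are invented for this example (see [0]/[2] for the actual speculation patches);
it just shows how a lockless reader could use the new mm_lock_seq seqcount,
relying on odd = mmap_lock write-locked, even = unlocked:

#include <linux/mm_types.h>
#include <linux/seqlock.h>

/* Hypothetical helper names, for illustration only. */
static inline bool mmap_lock_speculation_start(struct mm_struct *mm,
					       unsigned int *seq)
{
	/* An odd sequence means mmap_lock is currently write-locked. */
	*seq = raw_read_seqcount(&mm->mm_lock_seq);
	return !(*seq & 1);
}

static inline bool mmap_lock_speculation_retry(struct mm_struct *mm,
					       unsigned int seq)
{
	/*
	 * Returns true if the sequence changed since
	 * mmap_lock_speculation_start(), i.e. mmap_lock was
	 * write-locked in the meantime and the speculative
	 * work must be discarded or retried.
	 */
	return read_seqcount_retry(&mm->mm_lock_seq, seq);
}

A caller would sample the sequence with the first helper, fall back to
mmap_read_lock() if it returns false, do its speculative lookup, and then treat
a true return from the second helper as "discard and retry (or take the lock)".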