On Thu, Apr 27, 2017 at 05:52:44PM +0200, Laurent Dufour wrote:
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>
> Manage the VMAs with SRCU such that we can do a lockless VMA lookup.
>
> We put the fput(vma->vm_file) in the SRCU callback, this keeps files
> valid during speculative faults, this is possible due to the delayed
> fput work by Al Viro -- do we need srcu_barrier() in unmount
> someplace?
>
> We guard the mm_rb tree with a seqlock (XXX could be a seqcount but
> we'd have to disable preemption around the write side in order to make
> the retry loop in __read_seqcount_begin() work) such that we can know
> if the rb tree walk was correct.  We cannot trust the result of a
> lockless tree walk in the face of concurrent tree rotations; although
> we can rely on the termination of such walks -- tree rotations
> guarantee the end result is a tree again after all.
>
> Furthermore, we rely on the WMB implied by the
> write_seqlock/count_begin() to separate the VMA initialization and the
> publishing stores, analogous to the RELEASE in rcu_assign_pointer().
> We also rely on the RMB from read_seqretry() to separate the vma load
> from further loads like the smp_read_barrier_depends() in regular
> RCU.
>
> We must not touch the vmacache while doing SRCU lookups as that is not
> properly serialized against changes.  We update gap information after
> publishing the VMA, but A) we don't use that and B) the seqlock
> read side would fix that anyhow.
>
> We clear vma->vm_rb for nodes removed from the vma tree such that we
> can easily detect such 'dead' nodes; we rely on the WMB from
> write_sequnlock() to separate the tree removal and clearing the node.
>
> Provide find_vma_srcu() which wraps the required magic.
>
> XXX: mmap()/munmap() heavy workloads might suffer from the global lock
> in call_srcu() -- this is fixable with a 'better' SRCU implementation.

An alleged 'better' SRCU implementation is now in -tip at branch
tip/core/rcu.  This implementation maintains per-CPU callback lists,
which eliminates the previous global lock.  The goal is to get this
'better' SRCU into mainline during the upcoming merge window.
							Thanx, Paul

> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
>  include/linux/mm_types.h |   2 +
>  kernel/fork.c            |   1 +
>  mm/init-mm.c             |   1 +
>  mm/internal.h            |  18 +++++++++
>  mm/mmap.c                | 100 +++++++++++++++++++++++++++++++++++------------
>  5 files changed, 96 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index daa5fbba9349..f276973b0f91 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -359,6 +359,7 @@ struct vm_area_struct {
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
>  	seqcount_t vm_sequence;
> +	struct rcu_head vm_rcu_head;
>  };
>  
>  struct core_thread {
> @@ -397,6 +398,7 @@ struct kioctx_table;
>  struct mm_struct {
>  	struct vm_area_struct *mmap;		/* list of VMAs */
>  	struct rb_root mm_rb;
> +	seqlock_t mm_seq;
>  	u32 vmacache_seqnum;			/* per-thread vmacache */
>  #ifdef CONFIG_MMU
>  	unsigned long (*get_unmapped_area) (struct file *filp,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 11c5c8ab827c..352cf3fd6c19 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -753,6 +753,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	mm->mmap = NULL;
>  	mm->mm_rb = RB_ROOT;
>  	mm->vmacache_seqnum = 0;
> +	seqlock_init(&mm->mm_seq);
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 975e49f00f34..2b1fa061684f 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -16,6 +16,7 @@
>  
>  struct mm_struct init_mm = {
>  	.mm_rb		= RB_ROOT,
> +	.mm_seq		= __SEQLOCK_UNLOCKED(init_mm.mm_seq),
>  	.pgd		= swapper_pg_dir,
>  	.mm_users	= ATOMIC_INIT(2),
>  	.mm_count	= ATOMIC_INIT(1),
> diff --git a/mm/internal.h b/mm/internal.h
> index 7aa2ea0a8623..69df80ebc93d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -40,6 +40,24 @@ void page_writeback_init(void);
>  
>  int do_swap_page(struct vm_fault *vmf);
>  
> +extern struct srcu_struct vma_srcu;
> +
> +extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr);
> +
> +static inline bool vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
> +{
> +	int ret = RB_EMPTY_NODE(&vma->vm_rb);
> +	unsigned seq = ACCESS_ONCE(vma->vm_sequence.sequence);
> +
> +	/*
> +	 * Matches both the wmb in write_seqlock_{begin,end}() and
> +	 * the wmb in vma_rb_erase().
> +	 */
> +	smp_rmb();
> +
> +	return ret || seq != sequence;
> +}
> +
>  void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
>  		unsigned long floor, unsigned long ceiling);
>  
> diff --git a/mm/mmap.c b/mm/mmap.c
> index cb41659bc9f9..44e19aa31315 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -159,6 +159,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
>  	}
>  }
>  
> +DEFINE_SRCU(vma_srcu);
> +
> +static void __free_vma(struct rcu_head *head)
> +{
> +	struct vm_area_struct *vma =
> +		container_of(head, struct vm_area_struct, vm_rcu_head);
> +
> +	if (vma->vm_file)
> +		fput(vma->vm_file);
> +	kmem_cache_free(vm_area_cachep, vma);
> +}
> +
> +static void free_vma(struct vm_area_struct *vma)
> +{
> +	call_srcu(&vma_srcu, &vma->vm_rcu_head, __free_vma);
> +}
> +
>  /*
>   * Close a vm structure and free it, returning the next.
>   */
> @@ -169,10 +186,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
>  	might_sleep();
>  	if (vma->vm_ops && vma->vm_ops->close)
>  		vma->vm_ops->close(vma);
> -	if (vma->vm_file)
> -		fput(vma->vm_file);
>  	mpol_put(vma_policy(vma));
> -	kmem_cache_free(vm_area_cachep, vma);
> +	free_vma(vma);
>  	return next;
>  }
>  
> @@ -394,26 +409,37 @@ static void vma_gap_update(struct vm_area_struct *vma)
>  }
>  
>  static inline void vma_rb_insert(struct vm_area_struct *vma,
> -				 struct rb_root *root)
> +				 struct mm_struct *mm)
>  {
> +	struct rb_root *root = &mm->mm_rb;
> +
>  	/* All rb_subtree_gap values must be consistent prior to insertion */
>  	validate_mm_rb(root, NULL);
>  
>  	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
>  }
>  
> -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
> +static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
>  {
> +	struct rb_root *root = &mm->mm_rb;
>  	/*
>  	 * Note rb_erase_augmented is a fairly large inline function,
>  	 * so make sure we instantiate it only once with our desired
>  	 * augmented rbtree callbacks.
>  	 */
> +	write_seqlock(&mm->mm_seq);
>  	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
> +	write_sequnlock(&mm->mm_seq); /* wmb */
> +
> +	/*
> +	 * Ensure the removal is complete before clearing the node.
> +	 * Matched by vma_is_dead()/handle_speculative_fault().
> +	 */
> +	RB_CLEAR_NODE(&vma->vm_rb);
>  }
>  
>  static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
> -						struct rb_root *root,
> +						struct mm_struct *mm,
>  						struct vm_area_struct *ignore)
>  {
>  	/*
> @@ -421,21 +447,21 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
>  	 * with the possible exception of the "next" vma being erased if
>  	 * next->vm_start was reduced.
>  	 */
> -	validate_mm_rb(root, ignore);
> +	validate_mm_rb(&mm->mm_rb, ignore);
>  
> -	__vma_rb_erase(vma, root);
> +	__vma_rb_erase(vma, mm);
>  }
>  
>  static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
> -					 struct rb_root *root)
> +					 struct mm_struct *mm)
>  {
>  	/*
>  	 * All rb_subtree_gap values must be consistent prior to erase,
>  	 * with the possible exception of the vma being erased.
>  	 */
> -	validate_mm_rb(root, vma);
> +	validate_mm_rb(&mm->mm_rb, vma);
>  
> -	__vma_rb_erase(vma, root);
> +	__vma_rb_erase(vma, mm);
>  }
>  
>  /*
> @@ -552,10 +578,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
>  	 * immediately update the gap to the correct value. Finally we
>  	 * rebalance the rbtree after all augmented values have been set.
>  	 */
> +	write_seqlock(&mm->mm_seq);
>  	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
>  	vma->rb_subtree_gap = 0;
>  	vma_gap_update(vma);
> -	vma_rb_insert(vma, &mm->mm_rb);
> +	vma_rb_insert(vma, mm);
> +	write_sequnlock(&mm->mm_seq);
>  }
>  
>  static void __vma_link_file(struct vm_area_struct *vma)
> @@ -631,7 +659,7 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm,
>  {
>  	struct vm_area_struct *next;
>  
> -	vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
> +	vma_rb_erase_ignore(vma, mm, ignore);
>  	next = vma->vm_next;
>  	if (has_prev)
>  		prev->vm_next = next;
> @@ -883,21 +911,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	}
>  
>  	if (remove_next) {
> -		if (file) {
> +		if (file)
>  			uprobe_munmap(next, next->vm_start, next->vm_end);
> -			fput(file);
> -		}
>  		if (next->anon_vma)
>  			anon_vma_merge(vma, next);
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
> -		kmem_cache_free(vm_area_cachep, next);
> -		write_seqcount_end(&next->vm_sequence);
> +		free_vma(next);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
>  		 * up the code too much to do both in one go.
>  		 */
> +		write_seqcount_end(&next->vm_sequence);
>  		if (remove_next != 3) {
>  			/*
>  			 * If "next" was removed and vma->vm_end was
> @@ -2103,16 +2129,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
>  EXPORT_SYMBOL(get_unmapped_area);
>  
>  /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
> -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> +static struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr)
>  {
>  	struct rb_node *rb_node;
>  	struct vm_area_struct *vma;
>  
> -	/* Check the cache first. */
> -	vma = vmacache_find(mm, addr);
> -	if (likely(vma))
> -		return vma;
> -
>  	rb_node = mm->mm_rb.rb_node;
>  
>  	while (rb_node) {
> @@ -2129,13 +2150,40 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  			rb_node = rb_node->rb_right;
>  	}
>  
> +	return vma;
> +}
> +
> +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +
> +	/* Check the cache first. */
> +	vma = vmacache_find(mm, addr);
> +	if (likely(vma))
> +		return vma;
> +
> +	vma = __find_vma(mm, addr);
>  	if (vma)
>  		vmacache_update(addr, vma);
>  	return vma;
>  }
> -
>  EXPORT_SYMBOL(find_vma);
>  
> +struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned int seq;
> +
> +	WARN_ON_ONCE(!srcu_read_lock_held(&vma_srcu));
> +
> +	do {
> +		seq = read_seqbegin(&mm->mm_seq);
> +		vma = __find_vma(mm, addr);
> +	} while (read_seqretry(&mm->mm_seq, seq));
> +
> +	return vma;
> +}
> +
>  /*
>   * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
>   */
> @@ -2490,7 +2538,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
>  	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
>  	vma->vm_prev = NULL;
>  	do {
> -		vma_rb_erase(vma, &mm->mm_rb);
> +		vma_rb_erase(vma, mm);
>  		mm->map_count--;
>  		tail_vma = vma;
>  		vma = vma->vm_next;
> -- 
> 2.7.4
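
As a rough illustration of how the helpers above are meant to be consumed
on the read side, here is a minimal sketch of a speculative lookup, along
the lines of the handle_speculative_fault() referenced in the comments.
That caller lives elsewhere in the series, so the function name, the
-EAGAIN fall-back convention and the elided fault handling below are
assumptions for illustration only; vma_srcu, find_vma_srcu() and
vma_is_dead() are the helpers introduced by this patch.

/* Sketch only -- not part of this patch. */
static int speculative_lookup_sketch(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned int seq;
	int idx, ret = -EAGAIN;			/* -EAGAIN: retry under mmap_sem */

	idx = srcu_read_lock(&vma_srcu);	/* keeps the VMA and vma->vm_file alive */

	vma = find_vma_srcu(mm, addr);		/* seqlock-retried lockless rb-tree walk */
	if (!vma || addr < vma->vm_start)
		goto out;

	/* Snapshot the per-VMA sequence count before reading other VMA fields. */
	seq = raw_read_seqcount(&vma->vm_sequence);
	if (seq & 1)				/* odd: a writer is updating this VMA */
		goto out;

	/* ... speculatively handle the fault against 'vma' here ... */

	/* Before committing, check the VMA was not unlinked or changed meanwhile. */
	if (vma_is_dead(vma, seq))
		goto out;

	ret = 0;
out:
	srcu_read_unlock(&vma_srcu, idx);
	return ret;
}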