On Tue, Apr 16, 2019 at 03:45:02PM +0200, Laurent Dufour wrote:
> If a thread is remapping an area while another one is faulting on the
> destination area, the SPF handler may fetch the vma from the RB tree
> before the pte has been moved by the other thread. This means that the
> moved ptes will overwrite those created by the page fault handler,
> leading to a leaked page.
> 
> 	CPU 1				CPU2
> 	enter mremap()
> 	unmap the dest area
> 	copy_vma()			Enter speculative page fault handler
> 	   >> at this time the dest area is present in the RB tree
> 					fetch the vma matching dest area
> 					create a pte as the VMA matched
> 					Exit the SPF handler
> 					<data written in the new page>
> 	move_ptes()
> 	  > it is assumed that the dest area is empty,
> 	  > the move ptes overwrite the page mapped by the CPU2.
> 
> To prevent that, when the VMA matching the dest area is extended or
> created by copy_vma(), it should be marked as not available to the SPF
> handler. The usual way to do so is to rely on vm_write_begin()/end().
> This is already done in __vma_adjust(), called by copy_vma() (through
> vma_merge()). But __vma_adjust() calls vm_write_end() before returning,
> which creates a window for another thread.
> This patch adds a new parameter to vma_merge() which is passed down to
> __vma_adjust(). The assumption is that copy_vma() returns a vma which
> should be released by calling vm_raw_write_end() by the caller once the
> ptes have been moved.
> 
> Signed-off-by: Laurent Dufour <ldufour@xxxxxxxxxxxxx>

Reviewed-by: Jérôme Glisse <jglisse@xxxxxxxxxx>

Small comment about a comment below, but it can be fixed with a fixup
patch; nothing earth-shattering.
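To make the CPU 1 / CPU 2 diagram above a bit more concrete for other
readers, the userspace pattern it describes is roughly the sketch below.
This is only an illustration of the access pattern, not a reproducer I
have run: whether the window is ever hit depends on SPF being enabled
and on timing, and all names in it are made up.

	/* cc -O2 -pthread spf-mremap-pattern.c */
	#define _GNU_SOURCE
	#include <pthread.h>
	#include <string.h>
	#include <sys/mman.h>

	#define SZ (2UL << 20)

	static char *dst;		/* destination of the mremap() */

	static void *toucher(void *arg)
	{
		(void)arg;
		/* CPU2: fault on the dest area while it is being remapped */
		memset(dst, 0xaa, SZ);
		return NULL;
	}

	int main(void)
	{
		char *src = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		pthread_t t;

		dst = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		memset(src, 0x55, SZ);	/* populate the source ptes */

		pthread_create(&t, NULL, toucher, NULL);
		/* CPU1: unmap the dest area and move src over it */
		mremap(src, SZ, SZ, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
		pthread_join(t, NULL);
		return 0;
	}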
> ---
>  include/linux/mm.h | 24 ++++++++++++++++-----
>  mm/mmap.c          | 53 +++++++++++++++++++++++++++++++++++-----------
>  mm/mremap.c        | 13 ++++++++++++
>  3 files changed, 73 insertions(+), 17 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 906b9e06f18e..5d45b7d8718d 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2343,18 +2343,32 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
>  
>  /* mmap.c */
>  extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
> +
>  extern int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
> -	struct vm_area_struct *expand);
> +	struct vm_area_struct *expand, bool keep_locked);
> +
>  static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
>  {
> -	return __vma_adjust(vma, start, end, pgoff, insert, NULL);
> +	return __vma_adjust(vma, start, end, pgoff, insert, NULL, false);
>  }
> -extern struct vm_area_struct *vma_merge(struct mm_struct *,
> +
> +extern struct vm_area_struct *__vma_merge(struct mm_struct *mm,
> +	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
> +	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
> +	pgoff_t pgoff, struct mempolicy *mpol,
> +	struct vm_userfaultfd_ctx uff, bool keep_locked);
> +
> +static inline struct vm_area_struct *vma_merge(struct mm_struct *mm,
>  	struct vm_area_struct *prev, unsigned long addr, unsigned long end,
> -	unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
> -	struct mempolicy *, struct vm_userfaultfd_ctx);
> +	unsigned long vm_flags, struct anon_vma *anon, struct file *file,
> +	pgoff_t off, struct mempolicy *pol, struct vm_userfaultfd_ctx uff)
> +{
> +	return __vma_merge(mm, prev, addr, end, vm_flags, anon, file, off,
> +			   pol, uff, false);
> +}
> +
>  extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
>  extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
>  	unsigned long addr, int new_below);
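Just to spell out the new API surface from the hunk above for other
reviewers (an illustrative recap of calls already in this patch, not new
code): every existing call site keeps its behaviour because the inline
vma_merge() wrapper forwards keep_locked == false, and copy_vma() below
becomes the only caller passing true.

	/* Pre-existing callers are unchanged; the wrapper forwards false: */
	vma_merge(mm, prev, addr, end, vm_flags, anon, file, off, pol, uff);

	/* copy_vma() alone asks for the returned VMA to stay marked
	 * against the SPF handler:
	 */
	new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
			      vma->anon_vma, vma->vm_file, pgoff,
			      vma_policy(vma), vma->vm_userfaultfd_ctx,
			      true /* keep_locked */);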
> diff --git a/mm/mmap.c b/mm/mmap.c
> index b77ec0149249..13460b38b0fb 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -714,7 +714,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm,
>   */
>  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
> -	struct vm_area_struct *expand)
> +	struct vm_area_struct *expand, bool keep_locked)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
> @@ -830,8 +830,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  
>  			importer->anon_vma = exporter->anon_vma;
>  			error = anon_vma_clone(importer, exporter);
> -			if (error)
> +			if (error) {
> +				if (next && next != vma)
> +					vm_raw_write_end(next);
> +				vm_raw_write_end(vma);
>  				return error;
> +			}
>  		}
>  	}
>  again:
> @@ -1025,7 +1029,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  
>  	if (next && next != vma)
>  		vm_raw_write_end(next);
> -	vm_raw_write_end(vma);
> +	if (!keep_locked)
> +		vm_raw_write_end(vma);
>  
>  	validate_mm(mm);
>  
> @@ -1161,12 +1166,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
>   * parameter) may establish ptes with the wrong permissions of NNNN
>   * instead of the right permissions of XXXX.
>   */
> -struct vm_area_struct *vma_merge(struct mm_struct *mm,
> +struct vm_area_struct *__vma_merge(struct mm_struct *mm,
>  			struct vm_area_struct *prev, unsigned long addr,
>  			unsigned long end, unsigned long vm_flags,
>  			struct anon_vma *anon_vma, struct file *file,
>  			pgoff_t pgoff, struct mempolicy *policy,
> -			struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
> +			struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> +			bool keep_locked)
>  {
>  	pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
>  	struct vm_area_struct *area, *next;
> @@ -1214,10 +1220,11 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
>  							/* cases 1, 6 */
>  			err = __vma_adjust(prev, prev->vm_start,
>  					 next->vm_end, prev->vm_pgoff, NULL,
> -					 prev);
> +					 prev, keep_locked);
>  		} else					/* cases 2, 5, 7 */
>  			err = __vma_adjust(prev, prev->vm_start,
> -					 end, prev->vm_pgoff, NULL, prev);
> +					 end, prev->vm_pgoff, NULL, prev,
> +					 keep_locked);
>  		if (err)
>  			return NULL;
>  		khugepaged_enter_vma_merge(prev, vm_flags);
> @@ -1234,10 +1241,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
>  					     vm_userfaultfd_ctx)) {
>  		if (prev && addr < prev->vm_end)	/* case 4 */
>  			err = __vma_adjust(prev, prev->vm_start,
> -					 addr, prev->vm_pgoff, NULL, next);
> +					 addr, prev->vm_pgoff, NULL, next,
> +					 keep_locked);
>  		else {					/* cases 3, 8 */
>  			err = __vma_adjust(area, addr, next->vm_end,
> -					 next->vm_pgoff - pglen, NULL, next);
> +					 next->vm_pgoff - pglen, NULL, next,
> +					 keep_locked);
>  			/*
>  			 * In case 3 area is already equal to next and
>  			 * this is a noop, but in case 8 "area" has
> @@ -3259,9 +3268,20 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>  
>  	if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
>  		return NULL;	/* should never get here */
> -	new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
> -			vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> -			vma->vm_userfaultfd_ctx);
> +
> +	/* There is 3 cases to manage here in
> +	 *     AAAA            AAAA              AAAA              AAAA
> +	 * PPPP....      PPPP......NNNN      PPPP....NNNN       PP........NN
> +	 * PPPPPPPP(A)   PPPP..NNNNNNNN(B)   PPPPPPPPPPPP(1)       NULL
> +	 *                                   PPPPPPPPNNNN(2)
> +	 *                                   PPPPNNNNNNNN(3)
> +	 *
> +	 * new_vma == prev in case A,1,2
> +	 * new_vma == next in case B,3
> +	 */
> +	new_vma = __vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
> +			      vma->anon_vma, vma->vm_file, pgoff,
> +			      vma_policy(vma), vma->vm_userfaultfd_ctx, true);
>  	if (new_vma) {
>  		/*
>  		 * Source vma may have been merged into new_vma
> @@ -3299,6 +3319,15 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>  			get_file(new_vma->vm_file);
>  		if (new_vma->vm_ops && new_vma->vm_ops->open)
>  			new_vma->vm_ops->open(new_vma);
> +		/*
> +		 * As the VMA is linked right now, it may be hit by the
> +		 * speculative page fault handler. But we don't want it to
> +		 * to start mapping page in this area until the caller has
> +		 * potentially move the pte from the moved VMA. To prevent
> +		 * that we protect it right now, and let the caller unprotect
> +		 * it once the move is done.
> +		 */

It would be better to say:

		/*
		 * Block speculative page faults on the new VMA before
		 * "linking" it, as once it is linked it may be hit by the
		 * speculative page fault handler. But we don't want it to
		 * start mapping pages in this area until the caller has
		 * potentially moved the ptes from the moved VMA. To prevent
		 * that, we protect it before linking and let the caller
		 * unprotect it once the move is done.
		 */

> +		vm_raw_write_begin(new_vma);
>  		vma_link(mm, new_vma, prev, rb_link, rb_parent);
>  		*need_rmap_locks = false;
>  	}
> diff --git a/mm/mremap.c b/mm/mremap.c
> index fc241d23cd97..ae5c3379586e 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -357,6 +357,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  	if (!new_vma)
>  		return -ENOMEM;
>  
> +	/* new_vma is returned protected by copy_vma, to prevent speculative
> +	 * page fault to be done in the destination area before we move the pte.
> +	 * Now, we must also protect the source VMA since we don't want pages
> +	 * to be mapped in our back while we are copying the PTEs.
> +	 */
> +	if (vma != new_vma)
> +		vm_raw_write_begin(vma);
> +
>  	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
>  				     need_rmap_locks);
>  	if (moved_len < old_len) {
> @@ -373,6 +381,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  		 */
>  		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
>  				 true);
> +		if (vma != new_vma)
> +			vm_raw_write_end(vma);
>  		vma = new_vma;
>  		old_len = new_len;
>  		old_addr = new_addr;
> @@ -381,7 +391,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
>  		mremap_userfaultfd_prep(new_vma, uf);
>  		arch_remap(mm, old_addr, old_addr + old_len,
>  			   new_addr, new_addr + new_len);
> +		if (vma != new_vma)
> +			vm_raw_write_end(vma);
>  	}
> +	vm_raw_write_end(new_vma);
>  
>  	/* Conceal VM_ACCOUNT so old reservation is not undone */
>  	if (vm_flags & VM_ACCOUNT) {
> -- 
> 2.21.0
> 
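One closing note on the locking, mostly for whoever touches move_vma()
later: with this patch the write markings pair up across copy_vma() and
move_vma(). A condensed view of the resulting protocol, with error
handling and the partial-move undo path omitted (this is only a summary
of the code above, not a suggested change):

	/* move_vma(), condensed */
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	/* new_vma comes back with vm_raw_write_begin() already applied,
	 * either kept held through __vma_merge(..., keep_locked == true)
	 * or taken just before vma_link(), so the SPF handler cannot map
	 * pages in the destination area yet.
	 */
	if (vma != new_vma)
		vm_raw_write_begin(vma);	/* protect the source too */

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr,
				     old_len, need_rmap_locks);
	...
	if (vma != new_vma)
		vm_raw_write_end(vma);		/* source may fault again */
	vm_raw_write_end(new_vma);		/* destination is now populated */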