On Mon, 5 Aug 2024 13:13:57 +0100 Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> wrote: > In commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be > removed in mergeability test") we relaxed the VMA merge rules for VMAs > possessing a vm_ops->close() hook, permitting this operation in instances > where we wouldn't delete the VMA as part of the merge operation. > > This was later corrected in commit fc0c8f9089c2 ("mm, mmap: fix vma_merge() > case 7 with vma_ops->close") to account for a subtle case that the previous > commit had not taken into account. > > In both instances, we first rely on is_mergeable_vma() to determine whether > we might be dealing with a VMA that might be removed, taking advantage of > the fact that a 'previous' VMA will never be deleted, only VMAs that follow > it. > > The second patch corrects the instance where a merge of the previous VMA > into a subsequent one did not correctly check whether the subsequent VMA > had a vm_ops->close() handler. > > Both changes prevent merge cases that are actually permissible (for > instance a merge of a VMA into a following VMA with a vm_ops->close(), but > with no previous VMA, which would result in the next VMA being extended, > not deleted). > > In addition, both changes fail to consider the case where a VMA that would > otherwise be merged with the previous and next VMA might have > vm_ops->close(), on the assumption that for this to be the case, all three > would have to have the same vma->vm_file to be mergeable and thus the same > vm_ops. > > And in addition both changes operate at 50,000 feet, trying to guess > whether a VMA will be deleted. > > As we have majorly refactored the VMA merge operation and de-duplicated > code to the point where we know precisely where deletions will occur, this > patch removes the aforementioned checks altogether and instead explicitly > checks whether a VMA will be deleted. 
> > In cases where a reduced merge is still possible (where we merge both > previous and next VMA but the next VMA has a vm_ops->close hook, meaning we > could just merge the previous and current VMA), we do so, otherwise the > merge is not permitted. > > We take advantage of our userland testing to assert that this functions > correctly - replacing the previous limited vm_ops->close() tests with tests > for every single case where we delete a VMA. > > We also update all testing for both new and modified VMAs to set > vma->vm_ops->close() in every single instance where this would not prevent > the merge, to assert that we never do so. > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@xxxxxxxxxx> > --- > mm/vma.c | 69 ++++++++----- > tools/testing/vma/vma.c | 213 ++++++++++++++++++++++++---------------- > 2 files changed, 173 insertions(+), 109 deletions(-) > > diff --git a/mm/vma.c b/mm/vma.c > index c55ae035f5d6..9c779fc65ba8 100644 > --- a/mm/vma.c > +++ b/mm/vma.c > @@ -10,14 +10,6 @@ > static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) > { > struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; > - /* > - * If the vma has a ->close operation then the driver probably needs to > - * release per-vma resources, so we don't attempt to merge those if the > - * caller indicates the current vma may be removed as part of the merge, > - * which is the case if we are attempting to merge the next VMA into > - * this one. > - */ > - bool may_remove_vma = merge_next; See my comment on PATCH 02/10. You're removing the local variable here, so maybe it need not be introduced in the first place? 
> if (!mpol_equal(vmg->policy, vma_policy(vma))) > return false; > @@ -33,8 +25,6 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex > return false; > if (vma->vm_file != vmg->file) > return false; > - if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) > - return false; > if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) > return false; > if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) > @@ -606,6 +596,12 @@ static int commit_merge(struct vma_merge_struct *vmg, > return 0; > } > > +/* We can only remove VMAs when merging if they do not have a close hook. */ > +static bool can_merge_remove_vma(struct vm_area_struct *vma) > +{ > + return !vma->vm_ops || !vma->vm_ops->close; > +} > + > /* > * vma_merge_modified - Attempt to merge VMAs based on a VMA having its > * attributes modified. > @@ -710,9 +706,30 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg) > > /* If we span the entire VMA, a merge implies it will be deleted. */ > merge_will_delete_vma = left_side && right_side; > - /* If we merge both VMAs, then next is also deleted. */ > + > + /* > + * If we need to remove vma in its entirety but are unable to do so, > + * we have no sensible recourse but to abort the merge. > + */ > + if (merge_will_delete_vma && !can_merge_remove_vma(vma)) > + return NULL; > + > + /* > + * If we merge both VMAs, then next is also deleted. This implies > + * merge_will_delete_vma also. > + */ > merge_will_delete_next = merge_both; > > + /* > + * If we cannot delete next, then we can reduce the operation to merging > + * prev and vma (thereby deleting vma). > + */ > + if (merge_will_delete_next && !can_merge_remove_vma(next)) { > + merge_will_delete_next = false; > + merge_right = false; > + merge_both = false; > + } > + > /* No matter what happens, we will be adjusting vma. 
*/ > vma_start_write(vma); > > @@ -756,21 +773,12 @@ static struct vm_area_struct *vma_merge_modified(struct vma_merge_struct *vmg) > vmg->start = prev->vm_start; > vmg->pgoff = prev->vm_pgoff; > > - if (merge_will_delete_vma) { > - /* > - * can_vma_merge_after() assumed we would not be > - * removing vma, so it skipped the check for > - * vm_ops->close, but we are removing vma. > - */ > - if (vma->vm_ops && vma->vm_ops->close) > - err = -EINVAL; > - } else { > + if (!merge_will_delete_vma) { > adjust = vma; > adj_start = end - vma->vm_start; > } > > - if (!err) > - err = dup_anon_vma(prev, vma, &anon_dup); > + err = dup_anon_vma(prev, vma, &anon_dup); > } else { /* merge_right */ > /* > * |<----->| OR > @@ -886,6 +894,8 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > unsigned long end = vmg->end; > pgoff_t pgoff = vmg->pgoff; > pgoff_t pglen = PHYS_PFN(end - start); > + bool merge_next = false; > + struct anon_vma *anon_vma = vmg->anon_vma; Calling this "anon_vma" feels a bit too generic. IIUC you want to save the original vmg->anon_vma in case the VMA turns out to be unmergeable with the next VMA after vmg->anon_vma has already been modified. What about calling it "orig_anon_vma"? Petr T > > VM_WARN_ON(vmg->vma); > > @@ -916,8 +926,9 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > vmg->end = next->vm_end; > vmg->vma = next; > vmg->pgoff = next->vm_pgoff - pglen; > - > vmg->anon_vma = next->anon_vma; > + > + merge_next = true; > } > > /* If we can merge with the previous VMA, adjust vmg accordingly. */ > @@ -925,6 +936,16 @@ struct vm_area_struct *vma_merge_new_vma(struct vma_merge_struct *vmg) > vmg->start = prev->vm_start; > vmg->vma = prev; > vmg->pgoff = prev->vm_pgoff; > + > + /* > + * If this merge would result in removal of the next VMA but we > + * are not permitted to do so, reduce the operation to merging > + * prev and vma. 
> + */ > + if (merge_next && !can_merge_remove_vma(next)) { > + vmg->end = end; > + vmg->anon_vma = anon_vma; > + } > } else if (prev) { > vma_iter_next_range(vmg->vmi); > } > @@ -978,6 +999,8 @@ int vma_expand(struct vma_merge_struct *vmg) > int ret; > > remove_next = true; > + /* This should already have been checked by this point. */ > + VM_WARN_ON(!can_merge_remove_vma(next)); > vma_start_write(next); > ret = dup_anon_vma(vma, next, &anon_dup); > if (ret) > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c > index e465dc22e2d0..0c0a6ffcfc98 100644 > --- a/tools/testing/vma/vma.c > +++ b/tools/testing/vma/vma.c > @@ -327,6 +327,9 @@ static bool test_vma_merge_new_vma(void) > struct anon_vma_chain dummy_anon_vma_chain_d = { > .anon_vma = &dummy_anon_vma, > }; > + const struct vm_operations_struct vm_ops = { > + .close = dummy_close, > + }; > int count; > struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; > bool merged; > @@ -370,6 +373,7 @@ static bool test_vma_merge_new_vma(void) > * 0123456789abc > * AA*B DD CC > */ > + vma_a->vm_ops = &vm_ops; /* This should have no impact. */ > vma_b->anon_vma = &dummy_anon_vma; > vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); > ASSERT_EQ(vma, vma_a); > @@ -406,6 +410,7 @@ static bool test_vma_merge_new_vma(void) > * AAAAA *DD CC > */ > vma_d->anon_vma = &dummy_anon_vma; > + vma_d->vm_ops = &vm_ops; /* This should have no impact. */ > vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); > ASSERT_EQ(vma, vma_d); > /* Prepend. */ > @@ -423,6 +428,7 @@ static bool test_vma_merge_new_vma(void) > * 0123456789abc > * AAAAA*DDD CC > */ > + vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ > vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); > ASSERT_EQ(vma, vma_a); > /* Merge with A, delete D. 
*/ > @@ -573,120 +579,145 @@ static bool test_vma_merge_with_close(void) > struct vma_merge_struct vmg = { > .vmi = &vmi, > }; > - struct vm_operations_struct vm_ops = {}; > - struct vm_area_struct *vma_next = > - alloc_and_link_vma(&mm, 0x2000, 0x3000, 2, flags); > - struct vm_area_struct *vma; > + const struct vm_operations_struct vm_ops = { > + .close = dummy_close, > + }; > + struct vm_area_struct *vma_prev, *vma_next, *vma; > > /* > - * When we merge VMAs we sometimes have to delete others as part of the > - * operation. > - * > - * Considering the two possible adjacent VMAs to which a VMA can be > - * merged: > - * > - * [ prev ][ vma ][ next ] > - * > - * In no case will we need to delete prev. If the operation is > - * mergeable, then prev will be extended with one or both of vma and > - * next deleted. > - * > - * As a result, during initial mergeability checks, only > - * can_vma_merge_before() (which implies the VMA being merged with is > - * 'next' as shown above) bothers to check to see whether the next VMA > - * has a vm_ops->close() callback that will need to be called when > - * removed. > - * > - * If it does, then we cannot merge as the resources that the close() > - * operation potentially clears down are tied only to the existing VMA > - * range and we have no way of extending those to the nearly merged one. > - * > - * We must consider two scenarios: > - * > - * A. > + * When merging VMAs we are not permitted to remove any VMA that has a > + * vm_ops->close() hook. > * > - * vm_ops->close: - - !NULL > - * [ prev ][ vma ][ next ] > - * > - * Where prev may or may not be present/mergeable. > - * > - * This is picked up by a specific check in can_vma_merge_before(). > - * > - * B. > - * > - * vm_ops->close: - !NULL > - * [ prev ][ vma ] > - * > - * Where prev and vma are present and mergeable. > - * > - * This is picked up by a specific check in vma_merge_modified(). 
> - * > - * IMPORTANT NOTE: We make the assumption that the following case: > + * This is because executing this hook may clear state that is pertinent > + * to the VMA range as a whole. > + */ > + > + /* > + * The only case of a new VMA merge that results in a VMA being deleted > + * is one where both the previous and next VMAs are merged - in this > + * instance the next VMA is deleted, and the previous VMA is extended. > * > - * - !NULL NULL > - * [ prev ][ vma ][ next ] > + * If we are unable to do so, we reduce the operation to simply > + * extending the prev VMA and not merging next. > * > - * Cannot occur, because vma->vm_ops being the same implies the same > - * vma->vm_file, and therefore this would mean that next->vm_ops->close > - * would be set too, and thus scenario A would pick this up. > + * 0123456789 > + * PPP**NNNN > + * -> > + * 0123456789 > + * PPPPPPNNN > */ > > - ASSERT_NE(vma_next, NULL); > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); > + vma_next->vm_ops = &vm_ops; > + > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); > + ASSERT_EQ(vma_merge_new_vma(&vmg), vma_prev); > + ASSERT_EQ(vma_prev->vm_start, 0); > + ASSERT_EQ(vma_prev->vm_end, 0x5000); > + ASSERT_EQ(vma_prev->vm_pgoff, 0); > + > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); > > /* > - * SCENARIO A > + * When modifying an existing VMA there are further cases where we > + * delete VMAs. > + * > + * <> > + * 0123456789 > + * PPPVV > * > - * 0123 > - * *N > + * In this instance, if vma has a close hook, the merge simply cannot > + * proceed. > */ > > - /* Make the next VMA have a close() callback. 
*/ > - vm_ops.close = dummy_close; > - vma_next->vm_ops = (const struct vm_operations_struct *)&vm_ops; > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); > + vma->vm_ops = &vm_ops; > > - /* Our proposed VMA has characteristics that would otherwise be merged. */ > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); > + vmg.prev = vma_prev; > + vmg.vma = vma; > > - /* The next VMA having a close() operator should cause the merge to fail.*/ > - ASSERT_EQ(vma_merge_new_vma(&vmg), NULL); > + ASSERT_EQ(vma_merge_modified(&vmg), NULL); > > - /* Now create the VMA so we can merge via modified flags */ > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); > - vma = alloc_and_link_vma(&mm, 0x1000, 0x2000, 1, flags); > - vmg.vma = vma; > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); > > /* > - * The VMA being modified in a way that would otherwise merge should > - * also fail. > + * This case is mirrored if merging with next. > + * > + * <> > + * 0123456789 > + * VVNNNN > + * > + * In this instance, if vma has a close hook, the merge simply cannot > + * proceed. > */ > + > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); > + vma->vm_ops = &vm_ops; > + > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); > + vmg.vma = vma; > + > ASSERT_EQ(vma_merge_modified(&vmg), NULL); > > - /* SCENARIO B > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); > + > + /* > + * Finally, we consider two variants of the case where we modify a VMA > + * to merge with both the previous and next VMAs. > * > - * 0123 > - * P* > + * The first variant is where vma has a close hook. In this instance, no > + * merge can proceed. > * > - * In order for this scenario to trigger, the VMA currently being > - * modified must also have a .close(). > + * <> > + * 0123456789 > + * PPPVVNNNN > */ > > - /* Reset VMG state. 
*/ > - vmg_set_range(&vmg, 0x1000, 0x2000, 1, flags); > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); > + vma->vm_ops = &vm_ops; > + > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); > + vmg.prev = vma_prev; > + vmg.vma = vma; > + > + ASSERT_EQ(vma_merge_modified(&vmg), NULL); > + > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 3); > + > /* > - * Make next unmergeable, and don't let the scenario A check pick this > - * up, we want to reproduce scenario B only. > + * The second variant is where next has a close hook. In this instance, > + * we reduce the operation to a merge between prev and vma. > + * > + * <> > + * 0123456789 > + * PPPVVNNNN > + * -> > + * 0123456789 > + * PPPPPNNNN > */ > - vma_next->vm_ops = NULL; > - vma_next->__vm_flags &= ~VM_MAYWRITE; > - /* Allocate prev. */ > - vmg.prev = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); > - /* Assign a vm_ops->close() function to VMA explicitly. */ > - vma->vm_ops = (const struct vm_operations_struct *)&vm_ops; > + > + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); > + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); > + vma_next->vm_ops = &vm_ops; > + > + vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); > + vmg.prev = vma_prev; > vmg.vma = vma; > - /* Make sure merge does not occur. 
*/ > - ASSERT_EQ(vma_merge_modified(&vmg), NULL); > > - cleanup_mm(&mm, &vmi); > + ASSERT_EQ(vma_merge_modified(&vmg), vma_prev); > + ASSERT_EQ(vma_prev->vm_start, 0); > + ASSERT_EQ(vma_prev->vm_end, 0x5000); > + ASSERT_EQ(vma_prev->vm_pgoff, 0); > + > + ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); > + > return true; > } > > @@ -699,6 +730,9 @@ static bool test_vma_merge_modified(void) > struct vma_merge_struct vmg = { > .vmi = &vmi, > }; > + const struct vm_operations_struct vm_ops = { > + .close = dummy_close, > + }; > > /* > * Merge right case - partial span. > @@ -711,7 +745,9 @@ static bool test_vma_merge_modified(void) > * VNNNNNN > */ > vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); > + vma->vm_ops = &vm_ops; /* This should have no impact. */ > vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); > + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ > vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); > vmg.vma = vma; > vmg.prev = vma; > @@ -743,6 +779,7 @@ static bool test_vma_merge_modified(void) > */ > vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); > vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); > + vma_next->vm_ops = &vm_ops; /* This should have no impact. */ > vmg_set_range(&vmg, 0x2000, 0x6000, 2, flags); > vmg.vma = vma; > vma->anon_vma = &dummy_anon_vma; > @@ -768,7 +805,9 @@ static bool test_vma_merge_modified(void) > * PPPPPPV > */ > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); > + vma->vm_ops = &vm_ops; /* This should have no impact. */ > vmg_set_range(&vmg, 0x3000, 0x6000, 3, flags); > vmg.prev = vma_prev; > vmg.vma = vma; > @@ -800,6 +839,7 @@ static bool test_vma_merge_modified(void) > * PPPPPPP > */ > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. 
*/ > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); > vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); > vmg.prev = vma_prev; > @@ -827,6 +867,7 @@ static bool test_vma_merge_modified(void) > * PPPPPPPPPP > */ > vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); > + vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ > vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); > vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); > vmg_set_range(&vmg, 0x3000, 0x7000, 3, flags); > -- > 2.45.2 >