On Mon, Aug 29, 2016 at 05:35:48PM +0200, Andrea Arcangeli wrote: > Hello Kirill, > > On Mon, Aug 29, 2016 at 03:42:33PM +0300, Kirill A. Shutemov wrote: > > @@ -898,13 +899,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, > > /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ > > if (ret & VM_FAULT_RETRY) { > > down_read(&mm->mmap_sem); > > - if (hugepage_vma_revalidate(mm, address)) { > > + if (hugepage_vma_revalidate(mm, address, &vma)) { > > /* vma is no longer available, don't continue to swapin */ > > trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); > > return false; > > } > > /* check if the pmd is still valid */ > > - if (mm_find_pmd(mm, address) != pmd) > > + if (mm_find_pmd(mm, address) != pmd || vma != fe.vma) > > return false; > > } > > if (ret & VM_FAULT_ERROR) { > > You check if the vma changed if the mmap_sem was released by the > VM_FAULT_RETRY case but not below: > > /* > * Prevent all access to pagetables with the exception of > * gup_fast later handled by the ptep_clear_flush and the VM > > @@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm, > > * handled by the anon_vma lock + PG_lock. > > */ > > down_write(&mm->mmap_sem); > > - result = hugepage_vma_revalidate(mm, address); > > + result = hugepage_vma_revalidate(mm, address, &vma); > > if (result) > > goto out; > > /* check if the pmd is still valid */ > if (mm_find_pmd(mm, address) != pmd) > goto out; > > Here you go ahead without care if the vma has changed as long as the > "vma" pointer was updated to the new one, and the pmd is still present > and stable (present and not huge) and all vma details matched as > before. > > Either we care that the vma changed in both places or we don't in > either of the two places. > > The idea was that even if the vma changed it doesn't matter because > it's still good to proceed for a collapse if all revalidation check > pass. > > What we failed at, was in refreshing the pointer of the vma to the new > one after the vma revalidation passed, so that the code that goes > ahead uses the right vma pointer and not the stale one we got > initially. > > Now it may give a perception that it is safer to check fa.vma != vma > but in reality it is not, because the vma may be freed and reallocated > in exactly the same address... > > So I think the vma != fe.vma check shall be removed because no matter > what the safety of the vma revalidate cannot come from checking if the > pointer has not changed and it must come from something else. [ Finally back to this. ] Here's updated version. >From 14d748bd8a7eb003efc10b1e5d5b8a644e7181b1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> Date: Mon, 29 Aug 2016 15:32:50 +0300 Subject: [PATCH] khugepaged: fix use-after-free in collapse_huge_page() hugepage_vma_revalidate() tries to re-check if we still should try to collapse small pages into huge one after the re-acquiring mmap_sem. The problem Dmitry Vyukov reported[1] is that the vma found by hugepage_vma_revalidate() can be suitable for huge pages, but not the same vma we had before dropping mmap_sem. And dereferencing original vma can lead to fun results.. Let's use vma hugepage_vma_revalidate() found instead of assuming it's the same as what we had before the lock was dropped. [1] http://lkml.kernel.org/r/CACT4Y+Z3gigBvhca9kRJFcjX0G70V_nRhbwKBU+yGoESBDKi9Q@xxxxxxxxxxxxxx Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> Reported-by: Dmitry Vyukov <dvyukov@xxxxxxxxxx> --- mm/khugepaged.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f401e9dfcc0c..728d7790dc2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) * value (scan code). */ -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + struct vm_area_struct **vmap) { struct vm_area_struct *vma; unsigned long hstart, hend; @@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; - vma = find_vma(mm, address); + *vmap = vma = find_vma(mm, address); if (!vma) return SCAN_VMA_NULL; @@ -898,7 +899,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address)) { + if (hugepage_vma_revalidate(mm, address, &fe.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma, int node, int referenced) { pmd_t *pmd, _pmd; @@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated = 0, result = 0; struct mem_cgroup *memcg; + struct vm_area_struct *vma; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t gfp; @@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm, } down_read(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) { mem_cgroup_cancel_charge(new_page, memcg, true); up_read(&mm->mmap_sem); @@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address); + result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; /* check if the pmd is still valid */ @@ -1202,7 +1203,7 @@ out_unmap: if (ret) { node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma, node, referenced); + collapse_huge_page(mm, address, hpage, node, referenced); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>