On Thu, Aug 23, 2018 at 01:59:16PM -0700, Mike Kravetz wrote:
> The page migration code employs try_to_unmap() to try and unmap the
> source page. This is accomplished by using rmap_walk to find all
> vmas where the page is mapped. This search stops when page mapcount
> is zero. For shared PMD huge pages, the page map count is always 1
> no matter the number of mappings. Shared mappings are tracked via
> the reference count of the PMD page. Therefore, try_to_unmap stops
> prematurely and does not completely unmap all mappings of the source
> page.
>
> This problem can result in data corruption as writes to the original
> source page can happen after contents of the page are copied to the
> target page. Hence, data is lost.
>
> This problem was originally seen as DB corruption of shared global
> areas after a huge page was soft offlined due to ECC memory errors.
> DB developers noticed they could reproduce the issue by (hotplug)
> offlining memory used to back huge pages. A simple testcase can
> reproduce the problem by creating a shared PMD mapping (note that
> this must be at least PUD_SIZE in size and PUD_SIZE aligned (1GB on
> x86)), and using migrate_pages() to migrate process pages between
> nodes while continually writing to the huge pages being migrated.
>
> To fix, have the try_to_unmap_one routine check for huge PMD sharing
> by calling huge_pmd_unshare for hugetlbfs huge pages. If it is a
> shared mapping it will be 'unshared' which removes the page table
> entry and drops the reference on the PMD page. After this, flush
> caches and TLB.
>
> mmu notifiers are called before locking page tables, but we can not
> be sure of PMD sharing until page tables are locked. Therefore,
> check for the possibility of PMD sharing before locking so that
> notifiers can prepare for the worst possible case.
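(Aside for anyone who wants to reproduce this locally: a rough userspace
sketch along the lines of the testcase described above could look like the
code below. The hugetlbfs mount point, the fixed mapping address, the node
numbers and the iteration count are illustrative assumptions, not taken from
Mike's actual test program; it also assumes at least two NUMA nodes and
roughly 512 free 2MB huge pages.)

/* pmd_share_migrate.c - hypothetical reproducer sketch, not Mike's testcase */
#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#define PUD_SZ   (1UL << 30)               /* 1GB on x86_64 */
#define MAP_HINT ((void *)(0x40UL << 30))  /* assumed free, PUD_SIZE aligned */

int main(void)
{
	/* Assumes hugetlbfs is mounted at /dev/hugepages. */
	int fd = open("/dev/hugepages/pmd_share_test", O_CREAT | O_RDWR, 0600);
	char *addr;
	pid_t child;
	int i;

	if (fd < 0 || ftruncate(fd, PUD_SZ)) {
		perror("hugetlbfs file");
		return 1;
	}

	/* Shared mapping, PUD_SIZE in size and PUD_SIZE aligned, as required. */
	addr = mmap(MAP_HINT, PUD_SZ, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_FIXED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(addr, 0, PUD_SZ);	/* populate the parent's page tables */

	/* A second process mapping the same range lets PMD sharing kick in. */
	child = fork();
	if (child == 0) {
		for (;;)		/* keep writing while pages are migrated */
			memset(addr, 0x5a, PUD_SZ);
	}

	/* Bounce the child's pages between nodes 0 and 1. */
	for (i = 0; i < 100; i++) {
		unsigned long from = 1UL << (i & 1);
		unsigned long to   = 1UL << (~i & 1);

		syscall(SYS_migrate_pages, child, 8 * sizeof(from), &from, &to);
	}

	kill(child, SIGKILL);
	wait(NULL);
	/* Without the fix, writes racing with migration can be silently lost. */
	return 0;
}

The two key ingredients are the PUD_SIZE aligned MAP_SHARED hugetlb mapping
(so the PMD page can be shared between the two mms) and continuous writes
while migrate_pages() is unmapping and copying the source pages.
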
> 
> Fixes: 39dde65c9940 ("shared page table for hugetlb page")
> Cc: stable@xxxxxxxxxxxxxxx
> Signed-off-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>

Thanks Mike,

Reviewed-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx>

> ---
>  include/linux/hugetlb.h | 14 ++++++++++++++
>  mm/hugetlb.c            | 40 +++++++++++++++++++++++++++++++++++++--
>  mm/rmap.c               | 42 ++++++++++++++++++++++++++++++++++++++---
>  3 files changed, 91 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 36fa6a2a82e3..4ee95d8c8413 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
>  pte_t *huge_pte_offset(struct mm_struct *mm,
>  		       unsigned long addr, unsigned long sz);
>  int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
> +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
> +				unsigned long *start, unsigned long *end);
>  struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
>  			      int write);
>  struct page *follow_huge_pd(struct vm_area_struct *vma,
> @@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
>  	return 0;
>  }
> 
> +static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
> +					pte_t *ptep)
> +{
> +	return 0;
> +}
> +
> +static inline void adjust_range_if_pmd_sharing_possible(
> +				struct vm_area_struct *vma,
> +				unsigned long *start, unsigned long *end)
> +{
> +}
> +
>  #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)	({ BUG(); 0; })
>  #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
>  #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3103099f64fd..a73c5728e961 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4548,6 +4548,9 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
>  	return saddr;
>  }
> 
> +#define _range_in_vma(vma, start, end) \
> +	((vma)->vm_start <= (start) && (end) <= (vma)->vm_end)
> +
>  static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
>  {
>  	unsigned long base = addr & PUD_MASK;
> @@ -4556,12 +4559,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
>  	/*
>  	 * check on proper vm_flags and page table alignment
>  	 */
> -	if (vma->vm_flags & VM_MAYSHARE &&
> -	    vma->vm_start <= base && end <= vma->vm_end)
> +	if (vma->vm_flags & VM_MAYSHARE && _range_in_vma(vma, base, end))
>  		return true;
>  	return false;
>  }
> 
> +/*
> + * Determine if start,end range within vma could be mapped by shared pmd.
> + * If yes, adjust start and end to cover range associated with possible
> + * shared pmd mappings.
> + */
> +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
> +				unsigned long *start, unsigned long *end)
> +{
> +	unsigned long check_addr = *start;
> +
> +	if (!(vma->vm_flags & VM_MAYSHARE))
> +		return;
> +
> +	for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
> +		unsigned long a_start = check_addr & PUD_MASK;
> +		unsigned long a_end = a_start + PUD_SIZE;
> +
> +		/*
> +		 * If sharing is possible, adjust start/end if necessary.
> +		 */
> +		if (_range_in_vma(vma, a_start, a_end)) {
> +			if (a_start < *start)
> +				*start = a_start;
> +			if (a_end > *end)
> +				*end = a_end;
> +		}
> +	}
> +}
> +
>  /*
>   * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
>   * and returns the corresponding pte. While this is not necessary for the
> @@ -4659,6 +4690,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
>  {
>  	return 0;
>  }
> +
> +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
> +				unsigned long *start, unsigned long *end)
> +{
> +}
>  #define want_pmd_share()	(0)
>  #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index eb477809a5c0..1e79fac3186b 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>  	}
> 
>  	/*
> -	 * We have to assume the worse case ie pmd for invalidation. Note that
> -	 * the page can not be free in this function as call of try_to_unmap()
> -	 * must hold a reference on the page.
> +	 * For THP, we have to assume the worse case ie pmd for invalidation.
> +	 * For hugetlb, it could be much worse if we need to do pud
> +	 * invalidation in the case of pmd sharing.
> +	 *
> +	 * Note that the page can not be free in this function as call of
> +	 * try_to_unmap() must hold a reference on the page.
>  	 */
>  	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
> +	if (PageHuge(page)) {
> +		/*
> +		 * If sharing is possible, start and end will be adjusted
> +		 * accordingly.
> +		 */
> +		adjust_range_if_pmd_sharing_possible(vma, &start, &end);
> +	}
>  	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
> 
>  	while (page_vma_mapped_walk(&pvmw)) {
> @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
>  		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
>  		address = pvmw.address;
> 
> +		if (PageHuge(page)) {
> +			if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
> +				/*
> +				 * huge_pmd_unshare unmapped an entire PMD
> +				 * page. There is no way of knowing exactly
> +				 * which PMDs may be cached for this mm, so
> +				 * we must flush them all. start/end were
> +				 * already adjusted above to cover this range.
> +				 */
> +				flush_cache_range(vma, start, end);
> +				flush_tlb_range(vma, start, end);
> +				mmu_notifier_invalidate_range(mm, start, end);
> +
> +				/*
> +				 * The ref count of the PMD page was dropped
> +				 * which is part of the way map counting
> +				 * is done for shared PMDs. Return 'true'
> +				 * here. When there is no other sharing,
> +				 * huge_pmd_unshare returns false and we will
> +				 * unmap the actual page and drop map count
> +				 * to zero.
> +				 */
> +				page_vma_mapped_walk_done(&pvmw);
> +				break;
> +			}
> +		}
> 
>  		if (IS_ENABLED(CONFIG_MIGRATION) &&
>  		    (flags & TTU_MIGRATION) &&
> -- 
> 2.17.1
> 
> 