On Tue, Jan 28, 2014 at 08:57:44AM -0700, Khalid Aziz wrote: > Backport from upstream commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 This seems to be a clean cherry-pick for the 3.5 kernel. I am queuing it both for the 3.5 and the 3.11 kernels. Thanks! Cheers, -- Luis > Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database > caused by THP") can cause dereference of a dangling pointer if > split_huge_page runs during PageHuge() if there are updates to the > tail_page->private field. > > Also it is repeating compound_head twice for hugetlbfs and it is running > compound_head+compound_trans_head for THP when a single one is needed in > both cases. > > The new code within the PageSlab() check doesn't need to verify that the > THP page size is never bigger than the smallest hugetlbfs page size, to > avoid memory corruption. > > A longstanding theoretical race condition was found while fixing the > above (see the change right after the skip_unlock label, that is > relevant for the compound_lock path too). > > By re-establishing the _mapcount tail refcounting for all compound > pages, this also fixes the below problem: > > echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages > > BUG: Bad page state in process bash pfn:59a01 > page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 > page flags: 0x1c00000000008000(tail) > Modules linked in: > CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25 > Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > Call Trace: > dump_stack+0x55/0x76 > bad_page+0xd5/0x130 > free_pages_prepare+0x213/0x280 > __free_pages+0x36/0x80 > update_and_free_page+0xc1/0xd0 > free_pool_huge_page+0xc2/0xe0 > set_max_huge_pages.part.58+0x14c/0x220 > nr_hugepages_store_common.isra.60+0xd0/0xf0 > nr_hugepages_store+0x13/0x20 > kobj_attr_store+0xf/0x20 > sysfs_write_file+0x189/0x1e0 > vfs_write+0xc5/0x1f0 > SyS_write+0x55/0xb0 > system_call_fastpath+0x16/0x1b > > Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx> > Signed-off-by: Andrea Arcangeli <aarcange@xxxxxxxxxx> > Tested-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx> > Cc: Pravin Shelar <pshelar@xxxxxxxxxx> > Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> > Cc: Ben Hutchings <bhutchings@xxxxxxxxxxxxxx> > Cc: Christoph Lameter <cl@xxxxxxxxx> > Cc: Johannes Weiner <jweiner@xxxxxxxxxx> > Cc: Mel Gorman <mgorman@xxxxxxx> > Cc: Rik van Riel <riel@xxxxxxxxxx> > Cc: Andi Kleen <andi@xxxxxxxxxxxxxx> > Cc: Minchan Kim <minchan@xxxxxxxxxx> > Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> > --- > include/linux/hugetlb.h | 6 +++ > mm/hugetlb.c | 17 ++++++++ > mm/swap.c | 107 ++++++++++++++++++++++++++++++++++------------- > 3 files changed, 101 insertions(+), 29 deletions(-) > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 6baa73d..4f42a9c 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -24,6 +24,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); > void hugepage_put_subpool(struct hugepage_subpool *spool); > > int PageHuge(struct page *page); > +int PageHeadHuge(struct page *page_head); > > void reset_vma_resv_huge_pages(struct vm_area_struct *vma); > int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); > @@ -85,6 +86,11 @@ static inline int PageHuge(struct page *page) > return 0; > } > > +static inline int PageHeadHuge(struct page *page_head) > +{ > + return 0; > +} > + > static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) > { > } > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index af20b77..7111f2f 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -679,6 +679,23 @@ int PageHuge(struct page *page) > } > EXPORT_SYMBOL_GPL(PageHuge); > > +/* > + * PageHeadHuge() only returns true for hugetlbfs head page, but not for > + * normal or transparent huge pages. > + */ > +int PageHeadHuge(struct page *page_head) > +{ > + compound_page_dtor *dtor; > + > + if (!PageHead(page_head)) > + return 0; > + > + dtor = get_compound_page_dtor(page_head); > + > + return dtor == free_huge_page; > +} > +EXPORT_SYMBOL_GPL(PageHeadHuge); > + > pgoff_t __basepage_index(struct page *page) > { > struct page *page_head = compound_head(page); > diff --git a/mm/swap.c b/mm/swap.c > index f689e9a..a8feea6 100644 > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -77,18 +77,6 @@ static void __put_compound_page(struct page *page) > > static void put_compound_page(struct page *page) > { > - /* > - * hugetlbfs pages cannot be split from under us. If this is a > - * hugetlbfs page, check refcount on head page and release the page if > - * the refcount becomes zero. > - */ > - if (PageHuge(page)) { > - page = compound_head(page); > - if (put_page_testzero(page)) > - __put_compound_page(page); > - return; > - } > - > if (unlikely(PageTail(page))) { > /* __split_huge_page_refcount can run under us */ > struct page *page_head = compound_trans_head(page); > @@ -96,6 +84,35 @@ static void put_compound_page(struct page *page) > if (likely(page != page_head && > get_page_unless_zero(page_head))) { > unsigned long flags; > + > + if (PageHeadHuge(page_head)) { > + if (likely(PageTail(page))) { > + /* > + * __split_huge_page_refcount > + * cannot race here. > + */ > + VM_BUG_ON(!PageHead(page_head)); > + atomic_dec(&page->_mapcount); > + if (put_page_testzero(page_head)) > + VM_BUG_ON(1); > + if (put_page_testzero(page_head)) > + __put_compound_page(page_head); > + return; > + } else { > + /* > + * __split_huge_page_refcount > + * run before us, "page" was a > + * THP tail. The split > + * page_head has been freed > + * and reallocated as slab or > + * hugetlbfs page of smaller > + * order (only possible if > + * reallocated as slab on > + * x86). > + */ > + goto skip_lock; > + } > + } > /* > * page_head wasn't a dangling pointer but it > * may not be a head page anymore by the time > @@ -107,9 +124,29 @@ static void put_compound_page(struct page *page) > /* __split_huge_page_refcount run before us */ > compound_unlock_irqrestore(page_head, flags); > VM_BUG_ON(PageHead(page_head)); > - if (put_page_testzero(page_head)) > - __put_single_page(page_head); > - out_put_single: > +skip_lock: > + if (put_page_testzero(page_head)) { > + /* > + * The head page may have been > + * freed and reallocated as a > + * compound page of smaller > + * order and then freed again. > + * All we know is that it > + * cannot have become: a THP > + * page, a compound page of > + * higher order, a tail page. > + * That is because we still > + * hold the refcount of the > + * split THP tail and > + * page_head was the THP head > + * before the split. > + */ > + if (PageHead(page_head)) > + __put_compound_page(page_head); > + else > + __put_single_page(page_head); > + } > +out_put_single: > if (put_page_testzero(page)) > __put_single_page(page); > return; > @@ -173,21 +210,34 @@ bool __get_page_tail(struct page *page) > */ > unsigned long flags; > bool got = false; > - struct page *page_head; > + struct page *page_head = compound_trans_head(page); > > - /* > - * If this is a hugetlbfs page it cannot be split under us. Simply > - * increment refcount for the head page. > - */ > - if (PageHuge(page)) { > - page_head = compound_head(page); > - atomic_inc(&page_head->_count); > - got = true; > - goto out; > - } > - > - page_head = compound_trans_head(page); > if (likely(page != page_head && get_page_unless_zero(page_head))) { > + /* Ref to put_compound_page() comment. */ > + if (PageHeadHuge(page_head)) { > + if (likely(PageTail(page))) { > + /* > + * This is a hugetlbfs > + * page. __split_huge_page_refcount > + * cannot race here. > + */ > + VM_BUG_ON(!PageHead(page_head)); > + __get_page_tail_foll(page, false); > + return true; > + } else { > + /* > + * __split_huge_page_refcount run > + * before us, "page" was a THP > + * tail. The split page_head has been > + * freed and reallocated as slab or > + * hugetlbfs page of smaller order > + * (only possible if reallocated as > + * slab on x86). > + */ > + put_page(page_head); > + return false; > + } > + } > /* > * page_head wasn't a dangling pointer but it > * may not be a head page anymore by the time > @@ -204,7 +254,6 @@ bool __get_page_tail(struct page *page) > if (unlikely(!got)) > put_page(page_head); > } > -out: > return got; > } > EXPORT_SYMBOL(__get_page_tail); > -- > 1.7.10.4 > > -- > To unsubscribe from this list: send the line "unsubscribe stable" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html