memory_failure() doesn't handle thp itself at this time and need to split it before doing isolation. Currently thp is split in the middle of hwpoison_user_mappings(), but there're corner cases where memory_failure() wrongly tries to handle thp without splitting. 1) "non anonymous" thp, which is not a normal operating mode of thp, but a memory error could hit a thp before anon_vma is initialized. In such case, split_huge_page() fails and me_huge_page() (intended for hugetlb) is called for thp, which triggers BUG_ON in page_hstate(). 2) !PageLRU case, where hwpoison_user_mappings() returns with SWAP_SUCCESS and the result is the same as case 1. memory_failure() can't avoid splitting, so let's split it more earlier, which also reduces code which are prepared for both of normal page and thp. Signed-off-by: Naoya Horiguchi <n-horiguchi@xxxxxxxxxxxxx> --- mm/memory-failure.c | 88 +++++++++++++++-------------------------------------- 1 file changed, 25 insertions(+), 63 deletions(-) diff --git v4.1-rc3.orig/mm/memory-failure.c v4.1-rc3/mm/memory-failure.c index 9e9d04843d52..331c75b23ba2 100644 --- v4.1-rc3.orig/mm/memory-failure.c +++ v4.1-rc3/mm/memory-failure.c @@ -897,7 +897,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1, forcekill; struct page *hpage = *hpagep; - struct page *ppage; /* * Here we are interested only in user-mapped pages, so skip any @@ -947,59 +946,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* - * ppage: poisoned page - * if p is regular page(4k page) - * ppage == real poisoned page; - * else p is hugetlb or THP, ppage == head page. - */ - ppage = hpage; - - if (PageTransHuge(hpage)) { - /* - * Verify that this isn't a hugetlbfs head page, the check for - * PageAnon is just for avoid tripping a split_huge_page - * internal debug check, as split_huge_page refuses to deal with - * anything that isn't an anon page. PageAnon can't go away fro - * under us because we hold a refcount on the hpage, without a - * refcount on the hpage. split_huge_page can't be safely called - * in the first place, having a refcount on the tail isn't - * enough * to be safe. - */ - if (!PageHuge(hpage) && PageAnon(hpage)) { - if (unlikely(split_huge_page(hpage))) { - /* - * FIXME: if splitting THP is failed, it is - * better to stop the following operation rather - * than causing panic by unmapping. System might - * survive if the page is freed later. - */ - printk(KERN_INFO - "MCE %#lx: failed to split THP\n", pfn); - - BUG_ON(!PageHWPoison(p)); - return SWAP_FAIL; - } - /* - * We pinned the head page for hwpoison handling, - * now we split the thp and we are interested in - * the hwpoisoned raw page, so move the refcount - * to it. Similarly, page lock is shifted. - */ - if (hpage != p) { - if (!(flags & MF_COUNT_INCREASED)) { - put_page(hpage); - get_page(p); - } - lock_page(p); - unlock_page(hpage); - *hpagep = p; - } - /* THP is split, so ppage should be the real poisoned page. */ - ppage = p; - } - } - - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -1008,12 +954,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(ppage, ttu); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(ppage)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1025,7 +971,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); @@ -1071,6 +1017,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page_state *ps; struct page *p; struct page *hpage; + struct page *orig_head; int res; unsigned int nr_pages; unsigned long page_flags; @@ -1086,7 +1033,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - hpage = compound_head(p); + orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; @@ -1149,6 +1096,21 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } } + if (!PageHuge(p) && PageTransHuge(hpage)) { + if (!PageAnon(hpage)) { + pr_info("MCE: %#lx: non anonymous thp", pfn); + put_page(p); + return -EBUSY; + } + if (unlikely(split_huge_page(hpage))) { + pr_info("MCE: %#lx: thp split failed", pfn); + put_page(p); + return -EBUSY; + } + VM_BUG_ON_PAGE(!page_count(p), p); + hpage = compound_head(p); + } + /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others @@ -1158,9 +1120,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageHuge(p)) { - if (!PageLRU(hpage)) - shake_page(hpage, 0); - if (!PageLRU(hpage)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { /* * shake_page could have turned it free. */ @@ -1181,7 +1143,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. */ - if (compound_head(p) != hpage) { + if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto out; -- 2.1.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href