From: Naoya Horiguchi <naoya.horiguchi@xxxxxxx> commit 25182f05ffed0b45602438693e4eed5d7f3ebadd upstream. When hugetlb page fault (under overcommitting situation) and memory_failure() race, VM_BUG_ON_PAGE() is triggered by the following race: CPU0: CPU1: gather_surplus_pages() page = alloc_surplus_huge_page() memory_failure_hugetlb() get_hwpoison_page(page) __get_hwpoison_page(page) get_page_unless_zero(page) zero = put_page_testzero(page) VM_BUG_ON_PAGE(!zero, page) enqueue_huge_page(h, page) put_page(page) __get_hwpoison_page() only checks the page refcount before taking an additional one for memory error handling, which is not enough because there's a time window where compound pages have non-zero refcount during hugetlb page initialization. So make __get_hwpoison_page() check page status a bit more for hugetlb pages with get_hwpoison_huge_page(). Checking hugetlb-specific flags under hugetlb_lock makes sure that the hugetlb page is not transitive. It's notable that another new function, HWPoisonHandlable(), is helpful to prevent a race against other transitive page states (like a generic compound page just before PageHuge becomes true). Link: https://lkml.kernel.org/r/20210603233632.2964832-2-nao.horiguchi@xxxxxxxxx Fixes: ead07f6a867b ("mm/memory-failure: introduce get_hwpoison_page() for consistent refcount handling") Signed-off-by: Naoya Horiguchi <naoya.horiguchi@xxxxxxx> Reported-by: Muchun Song <songmuchun@xxxxxxxxxxxxx> Acked-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx> Cc: Oscar Salvador <osalvador@xxxxxxx> Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Tony Luck <tony.luck@xxxxxxxxx> Cc: <stable@xxxxxxxxxxxxxxx> [5.12+] Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- include/linux/hugetlb.h | 6 ++++++ mm/hugetlb.c | 15 +++++++++++++++ mm/memory-failure.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 2 deletions(-) --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -145,6 +145,7 @@ bool hugetlb_reserve_pages(struct inode long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -330,6 +331,11 @@ static inline bool isolate_huge_page(str return false; } +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + return 0; +} + static inline void putback_active_hugepage(struct page *page) { } --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5664,6 +5664,21 @@ unlock: return ret; } +int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + int ret = 0; + + *hugetlb = false; + spin_lock_irq(&hugetlb_lock); + if (PageHeadHuge(page)) { + *hugetlb = true; + if (HPageFreed(page) || HPageMigratable(page)) + ret = get_page_unless_zero(page); + } + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock(&hugetlb_lock); --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -949,6 +949,17 @@ static int page_action(struct page_state return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +/* + * Return true if a page type of a given page is supported by hwpoison + * mechanism (while handling could fail), otherwise false. This function + * does not return true for hugetlb or device memory pages, so it's assumed + * to be called only in the context where we never have such pages. + */ +static inline bool HWPoisonHandlable(struct page *page) +{ + return PageLRU(page) || __PageMovable(page); +} + /** * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) @@ -959,8 +970,22 @@ static int page_action(struct page_state static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * This check prevents from calling get_hwpoison_unless_zero() + * for any unsupported type of page in order to reduce the risk of + * unexpected races caused by taking a page refcount. + */ + if (!HWPoisonHandlable(head)) + return 0; - if (!PageHuge(head) && PageTransHuge(head)) { + if (PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -1017,7 +1042,7 @@ try_again: ret = -EIO; } } else { - if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p)) { ret = 1; } else { /*