On Thu, 24 Sep 2015 17:05:48 +0200 Vlastimil Babka <vbabka@xxxxxxx> wrote: > Yong Sun has reported the LTP futex_wake04 test to hang a s390x with our > kernel based on 3.12. This is reproducible on upstream 4.1.8 as well. 4.2+ > is OK thanks to removal of emulated hugepages, but we should do something > about the stable kernels here. > > The LTP test is a regression test for commit 13d60f4b6a ("futex: Take > hugepages into account when generating futex_key"), but it turns out that it's > sufficient to just attempt to wait for a single futex on a tail page of a > hugetlbfs page: The software emulated large pages always have been a bit fragile. > The problem is an endless loop in get_futex_key() when > CONFIG_TRANSPARENT_HUGEPAGE is enabled and the s390x machine has emulated > hugepages. The code tries to serialize against __split_huge_page_splitting(), > but __get_user_pages_fast() fails on the hugetlbfs tail page. This happens > because pmd_large() is false for emulated hugepages, so the code will proceed > into gup_pte_range() and fail page_cache_get_speculative() through failing > get_page_unless_zero() as the tail page count is zero. Failing __gup_fast is > supposed to be temporary due to a race, so get_futex_key() will try again > endlessly. > > This attempt for a fix is a bandaid solution and probably incomplete. > Hopefully something better will emerge from the discussion. Fully fixing > emulated hugepages just for stable backports is unlikely due to them being > removed. Also THP refcounting redesign should soon remove the trickery from > get_futex_key(). > > This patch relies on the fact that s390x with emulated hugepages returns false > in has_transparent_hugepage(), so we don't need to do the serialization > trickery and just use the code for !CONFIG_TRANSPARENT_HUGEPAGE. We just need > an extra variable to cache the result of has_transparent_hugepage(), which is > __init and potentially expensive on some architectures. > > However, __get_user_pages_fast() is still broken. The get_user_pages_fast() > wrapper will hide this in the common case. The other user of the __ variant > is kvm, which is mentioned as the reason for removal of emulated hugepages. > The call of page_cache_get_speculative() looks also broken in this scenario > on debug builds because of VM_BUG_ON_PAGE(PageTail(page), page). With > CONFIG_TINY_RCU enabled, there's plain atomic_inc(&page->_count) which also > probably shouldn't happen for a tail page... It boils down to __get_user_pages_fast being broken for emulated large pages, doesn't it? My preferred fix would be to get __get_user_page_fast to work in this case. For 3.12 a patch would look like this (needs more testing though): -- diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index bb0c157..5948b7f 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -370,7 +370,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) #define _SEGMENT_ENTRY_LARGE 0x400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_CO 0x100 /* change-recording override */ +#define _SEGMENT_ENTRY_SWLARGE 0x100 /* SW large page bit */ #define _SEGMENT_ENTRY_SPLIT 0x001 /* THP splitting bit */ #define _SEGMENT_ENTRY_YOUNG 0x002 /* SW segment young bit */ #define _SEGMENT_ENTRY_NONE _SEGMENT_ENTRY_YOUNG @@ -391,8 +391,8 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SPLIT_BIT 0 /* THP splitting bit number */ /* Set of bits not changed in pmd_modify */ -#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \ - | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_CO) +#define _SEGMENT_CHG_MASK (_SEGMENT_ENTRY_ORIGIN | _SEGMENT_ENTRY_LARGE \ + | _SEGMENT_ENTRY_SPLIT | _SEGMENT_ENTRY_SWLARGE) /* Page status table bits for virtualization */ #define PGSTE_ACC_BITS 0xf000000000000000UL @@ -563,12 +563,25 @@ static inline int pmd_none(pmd_t pmd) static inline int pmd_large(pmd_t pmd) { #ifdef CONFIG_64BIT - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; + return (pmd_val(pmd) & + (_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SWLARGE)) != 0; #else return 0; #endif } +static inline pmd_t pmd_swlarge_deref(pmd_t pmd) +{ + unsigned long origin; + + if (pmd_val(pmd) & _SEGMENT_ENTRY_SWLARGE) { + origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN; + pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN; + pmd_val(pmd) |= *(unsigned long *) origin; + } + return pmd; +} + static inline int pmd_prot_none(pmd_t pmd) { return (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) && @@ -578,8 +591,10 @@ static inline int pmd_prot_none(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { #ifdef CONFIG_64BIT - if (pmd_large(pmd)) + if (pmd_large(pmd)) { + pmd = pmd_swlarge_deref(pmd); return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; + } #endif return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; } @@ -1495,8 +1510,6 @@ static inline int pmd_trans_splitting(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t entry) { - if (!(pmd_val(entry) & _SEGMENT_ENTRY_INVALID) && MACHINE_HAS_EDAT1) - pmd_val(entry) |= _SEGMENT_ENTRY_CO; *pmdp = entry; } diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 46d517c..e159735 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -129,7 +129,7 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, } #ifdef CONFIG_64BIT -#define _PMD_PROT_MASK (_SEGMENT_ENTRY_PROTECT | _SEGMENT_ENTRY_CO) +#define _PMD_PROT_MASK (_SEGMENT_ENTRY_PROTECT) #else #define _PMD_PROT_MASK 0 #endif @@ -138,7 +138,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t *pud, unsigned long addr) { unsigned int prot; - pmd_t *pmd; + pmd_t *pmd, pmd_val; int i; for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++) { @@ -146,7 +146,8 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & _PMD_PROT_MASK; + pmd_val = pmd_swlarge_deref(*pmd); + prot = pmd_val(pmd_val) & _PMD_PROT_MASK; note_page(m, st, prot, 3); } else walk_pte_level(m, st, pmd, addr); diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 5d758db..8dd86a5 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -48,8 +48,9 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 1; } -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) +static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd_orig, pmd_t pmd, + unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) { unsigned long mask, result; struct page *head, *page, *tail; @@ -78,7 +79,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 0; } - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { + if (unlikely(pmd_val(pmd_orig) != pmd_val(*pmdp))) { *nr -= refs; while (refs--) put_page(head); @@ -103,7 +104,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; - pmd_t *pmdp, pmd; + pmd_t *pmdp, pmd, pmd_orig; pmdp = (pmd_t *) pudp; #ifdef CONFIG_64BIT @@ -112,7 +113,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, pmdp += pmd_index(addr); #endif do { - pmd = *pmdp; + pmd = pmd_orig = *pmdp; barrier(); next = pmd_addr_end(addr, end); /* @@ -127,8 +128,9 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) + if (!gup_huge_pmd(pmdp, pmd_orig, + pmd_swlarge_deref(pmd), + addr, next, write, pages, nr)) return 0; } else if (!gup_pte_range(pmdp, pmd, addr, next, write, pages, nr)) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 99a68d5..d4a17d9 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -98,22 +98,18 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!MACHINE_HAS_HPAGE) { pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN; pmd_val(pmd) |= pte_page(pte)[1].index; + pmd_val(pmd) |= _SEGMENT_ENTRY_SWLARGE; } else - pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_CO; + pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE; *(pmd_t *) ptep = pmd; } pte_t huge_ptep_get(pte_t *ptep) { - unsigned long origin; pmd_t pmd; pmd = *(pmd_t *) ptep; - if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) { - origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN; - pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN; - pmd_val(pmd) |= *(unsigned long *) origin; - } + pmd = pmd_swlarge_deref(pmd); return __pmd_to_pte(pmd); } diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index bcfb70b..a4f6f2f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -235,8 +235,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) if (!new_page) goto out; pmd_val(*pm_dir) = __pa(new_page) | - _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE | - _SEGMENT_ENTRY_CO; + _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE; address = (address + PMD_SIZE) & PMD_MASK; continue; } -- blue skies, Martin. "Reality continues to ruin my life." - Calvin. -- To unsubscribe from this list: send the line "unsubscribe linux-s390" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html