On Fri, Aug 24, 2018 at 10:35:56AM +0200, Peter Zijlstra wrote: > Anyway, its sorted now; although I'd like to write me a fairly big > comment in asm-generic/tlb.h about things, before I forget again. How's something like so? There's a little page_size thingy in this; mostly because I couldn't be arsed to split it for now. Will has opinions on the page_size thing; I'll let him explain. --- arch/Kconfig | 3 + arch/arm/include/asm/tlb.h | 3 +- arch/ia64/include/asm/tlb.h | 3 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/tlb.h | 17 ------ arch/s390/include/asm/tlb.h | 4 +- arch/sh/include/asm/tlb.h | 4 +- arch/um/include/asm/tlb.h | 4 +- include/asm-generic/tlb.h | 130 ++++++++++++++++++++++++++++++++++++----- mm/huge_memory.c | 4 +- mm/hugetlb.c | 2 +- mm/madvise.c | 2 +- mm/memory.c | 9 ++- 13 files changed, 137 insertions(+), 49 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 6801123932a5..053c44703539 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -365,6 +365,9 @@ config HAVE_RCU_TABLE_FREE config HAVE_RCU_TABLE_INVALIDATE bool +config HAVE_MMU_GATHER_PAGE_SIZE + bool + config ARCH_HAVE_NMI_SAFE_CMPXCHG bool diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f854148c8d7c..d644c3c7c6f3 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -286,8 +286,7 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr #define tlb_migrate_finish(mm) do { } while (0) -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { } diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 516355a774bf..bf8985f5f876 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -282,8 +282,7 @@ do { \ #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index db0b6eebbfa5..4db1072868f7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -217,6 +217,7 @@ config PPC select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP + select HAVE_MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index f0e571b2dc7c..b29a67137acf 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -27,7 +27,6 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change extern void tlb_flush(struct mmu_gather *tlb); @@ -46,22 +45,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ - if (!tlb->page_size) - tlb->page_size = page_size; - else if (tlb->page_size != page_size) { - if (!tlb->fullmm) - tlb_flush_mmu(tlb); - /* - * update the page size after flush for the new - * mmu_gather. - */ - tlb->page_size = page_size; - } -} - #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 457b7ba0fbb6..cf3d64313740 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -180,9 +180,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { } diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 77abe192fb43..af7c9d891cf8 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -127,9 +127,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, return tlb_remove_page(tlb, page); } -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { } diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index dce6db147f24..6463f3ab1767 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -146,9 +146,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b3353e21f3b3..d3573ba10068 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -20,6 +20,108 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> +/* + * Generic MMU-gather implementation. + * + * The mmu_gather data structure is used by the mm code to implement the + * correct and efficient ordering of freeing pages and TLB invalidations. + * + * This correct ordering is: + * + * 1) unhook page + * 2) TLB invalidate page + * 3) free page + * + * That is, we must never free a page before we have ensured there are no live + * translations left to it. Otherwise it might be possible to observe (or + * worse, change) the page content after it has been reused. + * + * The mmu_gather API consists of: + * + * - tlb_gather_mmu() / tlb_finish_mmu(); start and finish a mmu_gather + * + * Finish in particular will issue a (final) TLB invalidate and free + * all (remaining) queued pages. + * + * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA + * + * Defaults to flushing at tlb_end_vma() to reset the range; helps when + * there's large holes between the VMAs. + * + * - tlb_remove_page() / __tlb_remove_page() + * - tlb_remove_page_size() / __tlb_remove_page_size() + * + * __tlb_remove_page_size() is the basic primitive that queues a page for + * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a + * boolean indicating if the queue is (now) full and a call to + * tlb_flush_mmu() is required. + * + * tlb_remove_page() and tlb_remove_page_size() imply the call to + * tlb_flush_mmu() when required and has no return value. + * + * - tlb_change_page_size() + * + * call before __tlb_remove_page*() to set the current page-size; implies a + * possible tlb_flush_mmu() call. + * + * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() / tlb_flush_mmu_free() + * + * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets + * related state, like the range) + * + * tlb_flush_mmu_free() - frees the queued pages; make absolutely + * sure no additional tlb_remove_page() + * calls happen between _tlbonly() and this. + * + * tlb_flush_mmu() - the above two calls. + * + * - mmu_gather::fullmm + * + * A flag set by tlb_gather_mmu() to indicate we're going to free + * the entire mm; this allows a number of optimizations. + * + * XXX list optimizations + * + * - mmu_gather::need_flush_all + * + * A flag that can be set by the arch code if it wants to force + * flush the entire TLB irrespective of the range. For instance + * x86-PAE needs this when changing top-level entries. + * + * And requires the architecture to provide and implement tlb_flush(). + * + * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make + * use of: + * + * - mmu_gather::start / mmu_gather::end + * + * which (when !need_flush_all; fullmm will have start = end = ~0UL) provides + * the range that needs to be flushed to cover the pages to be freed. + * + * Additionally there are a few opt-in features: + * + * HAVE_MMU_GATHER_PAGE_SIZE + * + * This ensures we call tlb_flush() every time tlb_change_page_size() actually + * changes the size and provides mmu_gather::page_size to tlb_flush(). + * + * HAVE_RCU_TABLE_FREE + * + * This provides tlb_remove_table(), to be used instead of tlb_remove_page() + * for page directores (__p*_free_tlb()). This provides separate freeing of + * the page-table pages themselves in a semi-RCU fashion (see comment below). + * Useful if your architecture doesn't use IPIs for remote TLB invalidates + * and therefore doesn't naturally serialize with software page-table walkers. + * + * HAVE_RCU_TABLE_INVALIDATE + * + * This makes HAVE_RCU_TABLE_FREE call tlb_flush_mmu_tlbonly() before freeing + * the page-table pages. Required if you use HAVE_RCU_TABLE_FREE and your + * architecture uses the Linux page-tables natively. + * + */ +#define HAVE_GENERIC_MMU_GATHER + #ifdef CONFIG_HAVE_RCU_TABLE_FREE /* * Semi RCU freeing of the page directories. @@ -87,14 +189,17 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -/* struct mmu_gather is an opaque type used by the mm code for passing around +/* + * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; + #ifdef CONFIG_HAVE_RCU_TABLE_FREE struct mmu_table_batch *batch; #endif + unsigned long start; unsigned long end; /* we are in the middle of an operation to clear @@ -103,15 +208,17 @@ struct mmu_gather { /* we have performed an operation which * requires a complete flush of the tlb */ need_flush_all : 1; +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE + unsigned int page_size; +#endif struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; unsigned int batch_count; - int page_size; + }; -#define HAVE_GENERIC_MMU_GATHER void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -170,21 +277,18 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -#ifndef tlb_remove_check_page_size_change -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, +static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { - /* - * We don't care about page size change, just update - * mmu_gather page size here so that debug checks - * doesn't throw false warning. - */ -#ifdef CONFIG_DEBUG_VM +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE + if (tlb->page_size && tlb->page_size != page_size) { + if (!tlb->fullmm) + tlb_flush_mmu(tlb); + } + tlb->page_size = page_size; #endif } -#endif /* * In the case of tlb vma handling, we can optimise these away in the diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08b544383d74..786758670ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1617,7 +1617,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1693,7 +1693,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c21775f196b..8af346b53a79 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3337,7 +3337,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * This is a hugetlb vma, all the pte entries should point * to huge page. */ - tlb_remove_check_page_size_change(tlb, sz); + tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; diff --git a/mm/madvise.c b/mm/madvise.c index 4d3c922ea1a1..2a9073c652d2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); diff --git a/mm/memory.c b/mm/memory.c index 83aef222f11b..2818e00d1aae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -233,7 +233,9 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE tlb->page_size = 0; +#endif __tlb_reset_range(tlb); } @@ -294,7 +296,10 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); +#endif batch = tlb->active; /* @@ -602,7 +607,7 @@ void free_pgd_range(struct mmu_gather *tlb, * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1293,7 +1298,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *pte; swp_entry_t entry; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);