Scattered across the archs are 3 basic forms of tlb_{start,end}_vma(). Provide two new MMU_GATHER_knobs to enumerate them and remove the per arch tlb_{start,end}_vma() implementations. - MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range() but does *NOT* want to call it for each VMA. - MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the invalidate across multiple VMAs if possible. With these it is possible to capture the three forms: 1) empty stubs; select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS 2) start: flush_cache_range(), end: empty; select MMU_GATHER_MERGE_VMAS 3) start: flush_cache_range(), end: flush_tlb_range(); default Obviously, if the architecture does not have flush_cache_range() then it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE. Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> --- arch/Kconfig | 7 +++++++ arch/csky/include/asm/tlb.h | 13 ------------- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/tlb.h | 10 ---------- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/tlb.h | 2 -- arch/s390/Kconfig | 1 + arch/s390/include/asm/tlb.h | 3 --- arch/sparc/Kconfig | 2 ++ arch/sparc/include/asm/tlb_64.h | 2 -- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlb.h | 3 --- include/asm-generic/tlb.h | 21 +++++++++++++++++++-- 13 files changed, 32 insertions(+), 35 deletions(-) --- a/arch/Kconfig +++ b/arch/Kconfig @@ -438,6 +438,13 @@ config MMU_GATHER_PAGE_SIZE config MMU_GATHER_NO_RANGE bool + select MMU_GATHER_MERGE_VMAS + +config MMU_GATHER_NO_FLUSH_CACHE + bool + +config MMU_GATHER_MERGE_VMAS + bool config MMU_GATHER_NO_GATHER bool --- a/arch/csky/include/asm/tlb.h +++ b/arch/csky/include/asm/tlb.h @@ -4,19 +4,6 @@ #define __ASM_CSKY_TLB_H #include <asm/cacheflush.h> - -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ - } while (0) - #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include <asm-generic/tlb.h> --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -112,6 +112,7 @@ config LOONGARCH select TRACE_IRQFLAGS_SUPPORT select USE_PERCPU_NUMA_NODE_ID select ZONE_DMA32 + select MMU_GATHER_MERGE_VMAS if MMU config 32BIT bool --- a/arch/loongarch/include/asm/tlb.h +++ b/arch/loongarch/include/asm/tlb.h @@ -137,16 +137,6 @@ static inline void invtlb_all(u32 op, u3 ); } -/* - * LoongArch doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. - */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) static void tlb_flush(struct mmu_gather *tlb); --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -256,6 +256,7 @@ config PPC select IRQ_FORCED_THREADING select MMU_GATHER_PAGE_SIZE select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -19,8 +19,6 @@ #include <linux/pagemap.h> -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry #define tlb_flush tlb_flush --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -204,6 +204,7 @@ config S390 select IOMMU_SUPPORT if PCI select MMU_GATHER_NO_GATHER select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PCI select NEED_SG_DMA_LENGTH if PCI --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -27,9 +27,6 @@ static inline void tlb_flush(struct mmu_ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb #define pmd_free_tlb pmd_free_tlb --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -67,6 +67,8 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select MMU_GATHER_RCU_TABLE_FREE if SMP + select MMU_GATHER_MERGE_VMAS + select MMU_GATHER_NO_FLUSH_CACHE select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -22,8 +22,6 @@ void smp_flush_tlb_mm(struct mm_struct * void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *); void flush_tlb_pending(void); -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) #define tlb_flush(tlb) flush_tlb_pending() /* --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,6 +245,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -2,9 +2,6 @@ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -158,9 +158,24 @@ * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * + * MMU_GATHER_NO_FLUSH_CACHE + * + * Indicates the architecture has flush_cache_range() but it needs *NOT* be called + * before unmapping a VMA. + * + * NOTE: strictly speaking we shouldn't have this knob and instead rely on + * flush_cache_range() being a NOP, except Sparc64 seems to be + * different here. + * + * MMU_GATHER_MERGE_VMAS + * + * Indicates the architecture wants to merge ranges over VMAs; typical when + * multiple range invalidates are more expensive than a full invalidate. + * * MMU_GATHER_NO_RANGE * - * Use this if your architecture lacks an efficient flush_tlb_range(). + * Use this if your architecture lacks an efficient flush_tlb_range(). This + * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * @@ -493,14 +508,16 @@ static inline void tlb_start_vma(struct return; tlb_update_vma_flags(tlb, vma); +#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE flush_cache_range(vma, vma->vm_start, vma->vm_end); +#endif } #endif #ifndef tlb_end_vma static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) { - if (tlb->fullmm) + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) return; /*