In order to convert various architectures to the generic TLB code we need to
provide some extra infrastructure to track the range of the flushed page
tables.

There are two mmu_gather cases to consider:

  unmap_region()
    tlb_gather_mmu()
    unmap_vmas()
      for (; vma; vma = vma->vm_next)
        unmap_page_range()
          tlb_start_vma()               -> flush cache range/track vm_flags
          zap_*_range()
            arch_enter_lazy_mmu_mode()
            ptep_get_and_clear_full()   -> batch/track external tlbs
            tlb_remove_tlb_entry()      -> track range/external tlbs
            tlb_remove_page()           -> batch page
            arch_leave_lazy_mmu_mode()  -> flush external tlbs
          tlb_end_vma()
    free_pgtables()
      while (vma)
        unlink_*_vma()
        free_*_range()
          *_free_tlb()                  -> track range/batch page
    tlb_finish_mmu()                    -> flush TLBs and flush everything
  free vmas

and:

  shift_arg_pages()
    tlb_gather_mmu()
    free_*_range()
      *_free_tlb()                      -> track tlb range
    tlb_finish_mmu()                    -> flush things

There are various reasons that we need to flush TLBs _after_ tearing down
the page-tables themselves. For some architectures (x86 among others) this
serializes against (both hardware and software) page table walkers like
gup_fast().

For others (ARM) this is (also) needed to evict stale page-table caches -
ARM LPAE mode apparently caches page tables and concurrent hardware walkers
could re-populate these caches if the final tlb flush were to be from
tlb_end_vma(), since a concurrent walk could still be in progress.

So implement generic range tracking over both clearing the PTEs and tearing
down the page-tables.

Cc: Russell King <rmk@xxxxxxxxxxxxxxxx>
Cc: Tony Luck <tony.luck@xxxxxxxxx>
Cc: Paul Mundt <lethal@xxxxxxxxxxxx>
Cc: Jeff Dike <jdike@xxxxxxxxxxx>
Cc: Richard Weinberger <richard@xxxxxx>
Cc: Hans-Christian Egtvedt <hans-christian.egtvedt@xxxxxxxxx>
Cc: Ralf Baechle <ralf@xxxxxxxxxxxxxx>
Cc: Kyle McMartin <kyle@xxxxxxxxxxx>
Cc: James Bottomley <jejb@xxxxxxxxxxxxxxxx>
Cc: David Miller <davem@xxxxxxxxxxxxx>
Cc: Chris Zankel <chris@xxxxxxxxxx>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
 arch/Kconfig              |    3 
 include/asm-generic/tlb.h |  193 ++++++++++++++++++++++++++++++++++++++++++----
 mm/memory.c               |    3 
 3 files changed, 185 insertions(+), 14 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -244,6 +244,9 @@ config HAVE_HW_PAGE_TABLE_WALKS
 	  linux page-table structure. Therefore we don't need to emit
 	  hardware TLB flush instructions before freeing page-table pages.
 
+config HAVE_MMU_GATHER_RANGE
+	bool
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -5,12 +5,77 @@
  * Copyright 2001 Red Hat, Inc.
  * Based on code from mm/memory.c Copyright Linus Torvalds and others.
  *
- * Copyright 2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
+ * Copyright 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@xxxxxxxxxx>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
+ *
+ * This generic implementation tries to cover all TLB invalidate needs
+ * across our architecture spectrum, please ask before adding a new arch
+ * specific mmu_gather implementation.
+ *
+ * The TLB shootdown code deals with all the fun races an SMP system brings
+ * to the otherwise simple task of unmapping and freeing pages.
+ *
+ * There are two mmu_gather cases to consider, the below shows the various
+ * hooks and how this implementation employs them:
+ *
+ * unmap_region()
+ *   tlb_gather_mmu()
+ *   unmap_vmas()
+ *     for (; vma; vma = vma->vm_next)
+ *       unmap_page_range()
+ *         tlb_start_vma()               -> flush cache range/track vm_flags
+ *         zap_*_range()
+ *           arch_enter_lazy_mmu_mode()
+ *           ptep_get_and_clear_full()   -> batch/track external tlbs
+ *           tlb_remove_tlb_entry()      -> track range/external tlbs
+ *           tlb_remove_page()           -> batch page
+ *           arch_leave_lazy_mmu_mode()  -> flush external tlbs
+ *         tlb_end_vma()
+ *   free_pgtables()
+ *     while (vma)
+ *       unlink_*_vma()
+ *       free_*_range()
+ *         *_free_tlb()                  -> track range/batch page
+ *   tlb_finish_mmu()                    -> flush TLBs and pages
+ * free vmas
+ *
+ * and:
+ *
+ * shift_arg_pages()
+ *   tlb_gather_mmu()
+ *   free_*_range()
+ *     *_free_tlb()                      -> track range/batch page
+ *   tlb_finish_mmu()                    -> flush TLBs and pages
+ *
+ * This code has 3 relevant Kconfig knobs:
+ *
+ * CONFIG_HAVE_MMU_GATHER_RANGE -- In case the architecture has an efficient
+ * flush_tlb_range() implementation this adds range tracking to the
+ * mmu_gather and avoids full mm invalidation where possible.
+ *
+ * There's a number of curious details wrt passing a vm_area_struct, see
+ * our tlb_start_vma() implementation.
+ *
+ * CONFIG_HAVE_RCU_TABLE_FREE -- In case flush_tlb_*() doesn't
+ * serialize software walkers against page-table tear-down. This option
+ * enables a semi-RCU freeing of page-tables such that disabling IRQs
+ * will still provide the required serialization. See the big comment
+ * a page or so down.
+ *
+ * CONFIG_HAVE_HW_PAGE_TABLE_WALKS -- Optimization for architectures with
+ * 'external' hash-table MMUs and similar which don't require a TLB
+ * invalidate before freeing page-tables, always used in conjunction
+ * with CONFIG_HAVE_RCU_TABLE_FREE to provide proper serialization for
+ * software page-table walkers.
+ *
+ * For instance SPARC64 and PPC use arch_{enter,leave}_lazy_mmu_mode()
+ * together with ptep_get_and_clear_full() to wipe their hash-table.
+ *
+ * See arch/Kconfig for more details.
  */
 #ifndef _ASM_GENERIC__TLB_H
 #define _ASM_GENERIC__TLB_H
@@ -37,7 +102,8 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH	\
 	((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
 
-/* struct mmu_gather is an opaque type used by the mm code for passing around
+/*
+ * struct mmu_gather is an opaque type used by the mm code for passing around
  * any data needed by arch specific code for tlb_remove_page.
  */
 struct mmu_gather {
@@ -45,6 +111,10 @@ struct mmu_gather {
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 	struct mmu_table_batch	*batch;
 #endif
+#ifdef CONFIG_HAVE_MMU_GATHER_RANGE
+	unsigned long		start, end;
+	unsigned long		vm_flags;
+#endif
 	unsigned int		need_flush : 1,	/* Did free PTEs */
 				fast_mode  : 1; /* No batching */
 
@@ -83,6 +153,16 @@ struct mmu_gather {
  * pressure. To guarantee progress we fall back to single table freeing, see
  * the implementation of tlb_remove_table_one().
  *
+ * When this option is selected, the arch is expected to use:
+ *
+ *   void tlb_remove_table(struct mmu_gather *tlb, void *table)
+ *
+ * to 'free' page-tables from their respective __{pte,pmd,pud}_free_tlb()
+ * implementations and has to provide an implementation of:
+ *
+ *   void __tlb_remove_table(void *);
+ *
+ * that actually does the free.
  */
 struct mmu_table_batch {
 	struct rcu_head		rcu;
@@ -118,8 +198,90 @@ static inline void tlb_remove_table(stru
 
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
+void tlb_flush_mmu(struct mmu_gather *tlb);
+
 #define HAVE_GENERIC_MMU_GATHER
 
+#ifdef CONFIG_HAVE_MMU_GATHER_RANGE
+
+static inline void tlb_range_init(struct mmu_gather *tlb)
+{
+	tlb->start = TASK_SIZE;
+	tlb->end = 0;
+	tlb->vm_flags = 0;
+}
+
+static inline void
+tlb_track_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end)
+{
+	if (!tlb->fullmm) {
+		tlb->start = min(tlb->start, addr);
+		tlb->end = max(tlb->end, end);
+	}
+}
+
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+	/*
+	 * Fake VMA, some architectures use VM_EXEC to flush I-TLB/I$,
+	 * and some use VM_HUGETLB since they have separate HPAGE TLBs.
+	 */
+	struct vm_area_struct vma = {
+		.vm_mm = tlb->mm,
+		.vm_flags = tlb->vm_flags,
+	};
+
+	flush_tlb_range(&vma, tlb->start, tlb->end);
+	tlb_range_init(tlb);
+}
+
+static inline void
+tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+	if (tlb->fullmm)
+		return;
+
+	/*
+	 * flush_tlb_range() implementations that look at VM_HUGETLB
+	 * (tile, mips-r4k) flush only large pages, so force a flush on
+	 * VM_HUGETLB vma boundaries.
+	 */
+	if ((tlb->vm_flags & VM_HUGETLB) != (vma->vm_flags & VM_HUGETLB))
+		tlb_flush_mmu(tlb);
+
+	/*
+	 * flush_tlb_range() implementations that flush I-TLB also flush
+	 * D-TLB (tile, xtensa, arm), so it's ok to just add VM_EXEC to
+	 * an existing range.
+	 */
+	tlb->vm_flags |= vma->vm_flags & (VM_EXEC|VM_HUGETLB);
+
+	flush_cache_range(vma, vma->vm_start, vma->vm_end);
+}
+
+static inline void
+tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
+#else /* CONFIG_HAVE_MMU_GATHER_RANGE */
+
+static inline void tlb_range_init(struct mmu_gather *tlb)
+{
+}
+
+/*
+ * Macro avoids argument evaluation.
+ */
+#define tlb_track_range(tlb, addr, end) do { } while (0)
+
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+	flush_tlb_mm(tlb->mm);
+}
+
+#endif /* CONFIG_HAVE_MMU_GATHER_RANGE */
+
 static inline int tlb_fast_mode(struct mmu_gather *tlb)
 {
 #ifdef CONFIG_SMP
@@ -134,7 +296,6 @@ static inline int tlb_fast_mode(struct m
 }
 
 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm);
-void tlb_flush_mmu(struct mmu_gather *tlb);
 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end);
 int __tlb_remove_page(struct mmu_gather *tlb, struct page *page);
 
@@ -155,10 +316,11 @@ static inline void tlb_remove_page(struc
  * later optimise away the tlb invalidate.  This helps when userspace is
  * unmapping already-unmapped pages, which happens quite a lot.
  */
-#define tlb_remove_tlb_entry(tlb, ptep, address)		\
+#define tlb_remove_tlb_entry(tlb, ptep, addr)			\
 	do {							\
 		tlb->need_flush = 1;				\
-		__tlb_remove_tlb_entry(tlb, ptep, address);	\
+		tlb_track_range(tlb, addr, addr + PAGE_SIZE);	\
+		__tlb_remove_tlb_entry(tlb, ptep, addr);	\
 	} while (0)
 
 /**
@@ -175,26 +337,31 @@ static inline void tlb_remove_page(struc
 		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);	\
 	} while (0)
 
-#define pte_free_tlb(tlb, ptep, address, end)		\
+#define pte_free_tlb(tlb, ptep, addr, end)		\
 	do {						\
 		tlb->need_flush = 1;			\
-		__pte_free_tlb(tlb, ptep, address);	\
+		tlb_track_range(tlb, addr, end);	\
+		__pte_free_tlb(tlb, ptep, addr);	\
 	} while (0)
 
-#ifndef __ARCH_HAS_4LEVEL_HACK
-#define pud_free_tlb(tlb, pudp, address, end)		\
+#define pmd_free_tlb(tlb, pmdp, addr, end)		\
 	do {						\
 		tlb->need_flush = 1;			\
-		__pud_free_tlb(tlb, pudp, address);	\
+		tlb_track_range(tlb, addr, end);	\
+		__pmd_free_tlb(tlb, pmdp, addr);	\
 	} while (0)
-#endif
 
-#define pmd_free_tlb(tlb, pmdp, address, end)		\
+#ifndef __ARCH_HAS_4LEVEL_HACK
+#define pud_free_tlb(tlb, pudp, addr, end)		\
 	do {						\
 		tlb->need_flush = 1;			\
-		__pmd_free_tlb(tlb, pmdp, address);	\
+		tlb_track_range(tlb, addr, end);	\
+		__pud_free_tlb(tlb, pudp, addr);	\
 	} while (0)
+#endif
 
+#ifndef tlb_migrate_finish
 #define tlb_migrate_finish(mm) do {} while (0)
+#endif
 
 #endif /* _ASM_GENERIC__TLB_H */
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -214,6 +214,7 @@ void tlb_gather_mmu(struct mmu_gather *t
 	tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 	tlb->active     = &tlb->local;
 
+	tlb_range_init(tlb);
 	tlb_table_init(tlb);
 
 	if (fullmm) {
@@ -228,7 +229,7 @@ void tlb_flush_mmu(struct mmu_gather *tl
 
 	if (!tlb->fullmm && tlb->need_flush) {
 		tlb->need_flush = 0;
-		flush_tlb_mm(tlb->mm);
+		tlb_flush(tlb);
 	}
 
 	tlb_table_flush(tlb);
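
Not part of the patch: for readers who want to see the effect of the range
tracking in isolation, here is a stand-alone, user-space sketch of the
accumulate-then-flush logic that tlb_track_range() and tlb_flush() implement
above. All names (demo_gather, demo_*) and constants are made up for
illustration; in the kernel the range lives in the mmu_gather and the flush
ends up in the architecture's flush_tlb_range().

/*
 * Toy model of mmu_gather range tracking: accumulate a conservative
 * [start, end) span while tearing things down, flush it once at the end.
 */
#include <stdio.h>

#define DEMO_TASK_SIZE	0x800000000000UL
#define DEMO_PAGE_SIZE	4096UL

struct demo_gather {
	int fullmm;			/* tearing down the whole mm? */
	unsigned long start, end;	/* accumulated virtual range   */
};

static void demo_range_init(struct demo_gather *tlb)
{
	/* start > end means "nothing tracked yet", as in tlb_range_init() */
	tlb->start = DEMO_TASK_SIZE;
	tlb->end = 0;
}

static void demo_track_range(struct demo_gather *tlb,
			     unsigned long addr, unsigned long end)
{
	/* grow the single conservative range, as tlb_track_range() does */
	if (!tlb->fullmm) {
		tlb->start = tlb->start < addr ? tlb->start : addr;
		tlb->end = tlb->end > end ? tlb->end : end;
	}
}

static void demo_flush(struct demo_gather *tlb)
{
	if (tlb->fullmm)
		printf("flush_tlb_mm(): whole address space\n");
	else if (tlb->end)
		printf("flush_tlb_range(): %#lx - %#lx\n", tlb->start, tlb->end);
	demo_range_init(tlb);
}

int main(void)
{
	struct demo_gather tlb = { .fullmm = 0 };

	demo_range_init(&tlb);

	/* two zapped PTEs plus one freed PTE page worth of address space */
	demo_track_range(&tlb, 0x400000, 0x400000 + DEMO_PAGE_SIZE);
	demo_track_range(&tlb, 0x402000, 0x402000 + DEMO_PAGE_SIZE);
	demo_track_range(&tlb, 0x400000, 0x600000);

	demo_flush(&tlb);	/* one ranged invalidate, not a full-mm flush */
	return 0;
}

Note that the accumulated range is a single conservative span, so sparse
unmaps can over-invalidate; that is still cheaper than the unconditional
flush_tlb_mm() the !CONFIG_HAVE_MMU_GATHER_RANGE case falls back to.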