On Tue, May 16 2023 at 09:27, Russell King wrote:
> On Tue, May 16, 2023 at 10:20:37AM +0200, Thomas Gleixner wrote:
>> On Tue, May 16 2023 at 10:18, Thomas Gleixner wrote:
>>
>> > On Tue, May 16 2023 at 08:37, Thomas Gleixner wrote:
>> >> On Mon, May 15 2023 at 22:31, Russell King wrote:
>> >>>> +	list_for_each_entry(va, list, list) {
>> >>>> +		/* flush range by one by one 'invlpg' */
>> >>>> +		for (addr = va->va_start; addr < va->va_end; addr += PAGE_SIZE)
>> >>>> +			flush_tlb_one_kernel(addr);
>> >>>
>> >>> Isn't this just the same as:
>> >>> 	flush_tlb_kernel_range(va->va_start, va->va_end);
>> >>
>> >> Indeed.
>> >
>> > Actually not. At least not on x86 where it'd end up with 3 IPIs for that
>> > case again, instead of having one which walks the list on each CPU.
>>
>> ARM32 has the same problem when tlb_ops_need_broadcast() is true.
>
> If tlb_ops_need_broadcast() is true, then isn't it one IPI to other
> CPUs to flush the range, and possibly another for the Cortex-A15
> erratum?
>
> I've no idea what flush_tlb_one_kernel() is. I can find no such

The patch is against x86 and that function exists there. At least git
grep claims so. :)

> implementation, there is flush_tlb_kernel_page() though, which I
> think is what you're referring to above. On ARM32, that will issue
> one IPI each time it's called, and possibly another IPI for the
> Cortex-A15 erratum.
>
> Given that, flush_tlb_kernel_range() is still going to be more
> efficient on ARM32 when tlb_ops_need_broadcast() is true than doing
> it page by page.

Something like the untested below? I did not attempt anything to decide
whether a full flush might be worth it, but that's a separate problem.

Thanks,

        tglx
---
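To make the IPI argument above concrete, here is a minimal sketch of the
two strategies being weighed up. The demo_* helpers are made up for
illustration only and are not part of the patch below; the single-broadcast
variant assumes a local_flush_tlb_kernel_range() helper, as ARM32 provides.

/*
 * Illustration only: hypothetical demo_* helpers, not part of the patch.
 */
#include <linux/list.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
#include <asm/tlbflush.h>

/*
 * Variant A: one ranged flush per vmap_area. When TLB operations need
 * to be broadcast, every flush_tlb_kernel_range() call can turn into
 * its own IPI, so N areas cost N broadcasts.
 */
static void demo_flush_per_va(struct list_head *vmap_list)
{
	struct vmap_area *va;

	list_for_each_entry(va, vmap_list, list)
		flush_tlb_kernel_range(va->va_start, va->va_end);
}

/*
 * Variant B: a single broadcast. One on_each_cpu() IPI, and each CPU
 * walks the purge list locally (assumes a local_flush_tlb_kernel_range()
 * helper, as on ARM32).
 */
static void demo_ipi_flush_vas(void *arg)
{
	struct list_head *vmap_list = arg;
	struct vmap_area *va;

	list_for_each_entry(va, vmap_list, list)
		local_flush_tlb_kernel_range(va->va_start, va->va_end);
}

static void demo_flush_once(struct list_head *vmap_list)
{
	on_each_cpu(demo_ipi_flush_vas, vmap_list, 1);
}

The patch below implements the single-broadcast variant for ARM32 and the
equivalent one-IPI list walk with 'invlpg' for x86.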
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -270,6 +270,10 @@ config ARCH_HAS_SET_MEMORY
 config ARCH_HAS_SET_DIRECT_MAP
 	bool
 
+# Select if architecture provides flush_tlb_kernel_vas()
+config ARCH_HAS_FLUSH_TLB_KERNEL_VAS
+	bool
+
 #
 # Select if the architecture provides the arch_dma_set_uncached symbol to
 # either provide an uncached segment alias for a DMA allocation, or
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -10,6 +10,7 @@ config ARM
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
+	select ARCH_HAS_FLUSH_TLB_KERNEL_VAS
 	select ARCH_HAS_KEEPINITRD
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -7,6 +7,7 @@
 #include <linux/preempt.h>
 #include <linux/smp.h>
 #include <linux/uaccess.h>
+#include <linux/vmalloc.h>
 
 #include <asm/smp_plat.h>
 #include <asm/tlbflush.h>
@@ -69,6 +70,19 @@ static inline void ipi_flush_tlb_kernel_
 	local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end);
 }
 
+static inline void local_flush_tlb_kernel_vas(struct list_head *vmap_list)
+{
+	struct vmap_area *va;
+
+	list_for_each_entry(va, vmap_list, list)
+		local_flush_tlb_kernel_range(va->va_start, va->va_end);
+}
+
+static inline void ipi_flush_tlb_kernel_vas(void *arg)
+{
+	local_flush_tlb_kernel_vas(arg);
+}
+
 static inline void ipi_flush_bp_all(void *ignored)
 {
 	local_flush_bp_all();
@@ -244,6 +258,15 @@ void flush_tlb_kernel_range(unsigned lon
 	broadcast_tlb_a15_erratum();
 }
 
+void flush_tlb_kernel_vas(struct list_head *vmap_list, unsigned long num_entries)
+{
+	if (tlb_ops_need_broadcast()) {
+		on_each_cpu(ipi_flush_tlb_kernel_vas, vmap_list, 1);
+	} else
+		local_flush_tlb_kernel_vas(vmap_list);
+	broadcast_tlb_a15_erratum();
+}
+
 void flush_bp_all(void)
 {
 	if (tlb_ops_need_broadcast())
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,7 @@ config X86
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_EARLY_DEBUG		if KGDB
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_FLUSH_TLB_KERNEL_VAS
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/sched/smt.h>
 #include <linux/task_work.h>
+#include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1081,6 +1082,27 @@ void flush_tlb_kernel_range(unsigned lon
 	}
 }
 
+static void do_flush_tlb_vas(void *arg)
+{
+	struct list_head *vmap_list = arg;
+	struct vmap_area *va;
+	unsigned long addr;
+
+	list_for_each_entry(va, vmap_list, list) {
+		/* flush range by one by one 'invlpg' */
+		for (addr = va->va_start; addr < va->va_end; addr += PAGE_SIZE)
+			flush_tlb_one_kernel(addr);
+	}
+}
+
+void flush_tlb_kernel_vas(struct list_head *vmap_list, unsigned long num_entries)
+{
+	if (num_entries > tlb_single_page_flush_ceiling)
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	else
+		on_each_cpu(do_flush_tlb_vas, vmap_list, 1);
+}
+
 /*
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -295,4 +295,6 @@ bool vmalloc_dump_obj(void *object);
 static inline bool vmalloc_dump_obj(void *object) { return false; }
 #endif
 
+void flush_tlb_kernel_vas(struct list_head *list, unsigned long num_entries);
+
 #endif /* _LINUX_VMALLOC_H */
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1724,7 +1724,8 @@ static void purge_fragmented_blocks_allc
  */
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
-	unsigned long resched_threshold;
+	unsigned long resched_threshold, num_entries = 0, num_alias_entries = 0;
+	struct vmap_area alias_va = { .va_start = start, .va_end = end };
 	unsigned int num_purged_areas = 0;
 	struct list_head local_purge_list;
 	struct vmap_area *va, *n_va;
@@ -1736,18 +1737,29 @@ static bool __purge_vmap_area_lazy(unsig
 	list_replace_init(&purge_vmap_area_list, &local_purge_list);
 	spin_unlock(&purge_vmap_area_lock);
 
-	if (unlikely(list_empty(&local_purge_list)))
-		goto out;
+	start = min(start, list_first_entry(&local_purge_list, struct vmap_area, list)->va_start);
+	end = max(end, list_last_entry(&local_purge_list, struct vmap_area, list)->va_end);
 
-	start = min(start,
-		list_first_entry(&local_purge_list,
-			struct vmap_area, list)->va_start);
-
-	end = max(end,
-		list_last_entry(&local_purge_list,
-			struct vmap_area, list)->va_end);
+	if (IS_ENABLED(CONFIG_ARCH_HAS_FLUSH_TLB_KERNEL_VAS)) {
+		list_for_each_entry(va, &local_purge_list, list)
+			num_entries += (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+		if (unlikely(!num_entries))
+			goto out;
+
+		if (alias_va.va_end > alias_va.va_start) {
+			num_alias_entries = (alias_va.va_end - alias_va.va_start) >> PAGE_SHIFT;
+			list_add(&alias_va.list, &local_purge_list);
+		}
+
+		flush_tlb_kernel_vas(&local_purge_list, num_entries + num_alias_entries);
+
+		if (num_alias_entries)
+			list_del(&alias_va.list);
+	} else {
+		flush_tlb_kernel_range(start, end);
+	}
 
-	flush_tlb_kernel_range(start, end);
 	resched_threshold = lazy_max_pages() << 1;
 
 	spin_lock(&free_vmap_area_lock);
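For completeness, wiring a further architecture into this interface comes
down to selecting the new Kconfig symbol and providing flush_tlb_kernel_vas().
The sketch below is a hypothetical example, not part of the posted patch;
its body just falls back to ranged flushes, whereas a real implementation
would batch the list into a single broadcast as the ARM32 and x86 hunks
above do, and could use num_entries to decide on a full flush.

/*
 * Hypothetical example for an architecture "foo" opting in; not part of
 * the posted patch.  arch/foo/Kconfig would also need:
 *
 *	select ARCH_HAS_FLUSH_TLB_KERNEL_VAS
 */
#include <linux/list.h>
#include <linux/vmalloc.h>
#include <asm/tlbflush.h>

void flush_tlb_kernel_vas(struct list_head *vmap_list, unsigned long num_entries)
{
	struct vmap_area *va;

	/*
	 * Placeholder only: one ranged flush per area.  A real
	 * implementation would issue a single cross-CPU broadcast and
	 * walk the list locally, and could compare num_entries against
	 * a threshold to fall back to a full TLB flush instead.
	 */
	list_for_each_entry(va, vmap_list, list)
		flush_tlb_kernel_range(va->va_start, va->va_end);
}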