From: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> I think the flush_tlb_mm_range() code that tries to tune the flush sizes based on the CPU needs to get ripped out for several reasons: 1. It is obviously buggy. It uses mm->total_vm to judge the task's footprint in the TLB. It should certainly be using some measure of RSS, *NOT* ->total_vm since only resident memory can populate the TLB. 2. Haswell, and several other CPUs are missing from the intel_tlb_flushall_shift_set() function. Thus, it has been demonstrated to bitrot quickly in practice. 3. It is plain wrong in my vm: [ 0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0 [ 0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0 [ 0.037444] tlb_flushall_shift: 6 Which leads to it to never use invlpg. 4. The assumptions about TLB refill costs are wrong: http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@xxxxxxxxx (more on this in later patches) 5. I can not reproduce the original data: https://lkml.org/lkml/2012/5/17/59 I believe the sample times were too short. Running the benchmark in a loop yields times that vary quite a bit. Note that this leaves us with a static ceiling of 1 page. This is a conservative, dumb setting, and will be revised in a later patch. Signed-off-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> --- b/arch/x86/include/asm/processor.h | 1 b/arch/x86/kernel/cpu/amd.c | 7 -- b/arch/x86/kernel/cpu/common.c | 13 ----- b/arch/x86/kernel/cpu/intel.c | 26 ---------- b/arch/x86/mm/tlb.c | 91 ++++++------------------------------- 5 files changed, 19 insertions(+), 119 deletions(-) diff -puN arch/x86/include/asm/processor.h~x8x-mm-rip-out-complicated-tlb-flushing arch/x86/include/asm/processor.h --- a/arch/x86/include/asm/processor.h~x8x-mm-rip-out-complicated-tlb-flushing 2014-04-21 11:10:34.813835861 -0700 +++ b/arch/x86/include/asm/processor.h 2014-04-21 11:10:34.823836313 -0700 @@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_I extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; -extern s8 __read_mostly tlb_flushall_shift; /* * CPU type and hardware bug flags. Kept separately for each CPU. diff -puN arch/x86/kernel/cpu/amd.c~x8x-mm-rip-out-complicated-tlb-flushing arch/x86/kernel/cpu/amd.c --- a/arch/x86/kernel/cpu/amd.c~x8x-mm-rip-out-complicated-tlb-flushing 2014-04-21 11:10:34.814835907 -0700 +++ b/arch/x86/kernel/cpu/amd.c 2014-04-21 11:10:34.824836358 -0700 @@ -741,11 +741,6 @@ static unsigned int amd_size_cache(struc } #endif -static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) -{ - tlb_flushall_shift = 6; -} - static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) { u32 ebx, eax, ecx, edx; @@ -793,8 +788,6 @@ static void cpu_detect_tlb_amd(struct cp tlb_lli_2m[ENTRIES] = eax & mask; tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; - - cpu_set_tlb_flushall_shift(c); } static const struct cpu_dev amd_cpu_dev = { diff -puN arch/x86/kernel/cpu/common.c~x8x-mm-rip-out-complicated-tlb-flushing arch/x86/kernel/cpu/common.c --- a/arch/x86/kernel/cpu/common.c~x8x-mm-rip-out-complicated-tlb-flushing 2014-04-21 11:10:34.816835998 -0700 +++ b/arch/x86/kernel/cpu/common.c 2014-04-21 11:10:34.825836403 -0700 @@ -479,26 +479,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; u16 __read_mostly tlb_lld_4m[NR_INFO]; u16 __read_mostly tlb_lld_1g[NR_INFO]; -/* - * tlb_flushall_shift shows the balance point in replacing cr3 write - * with multiple 'invlpg'. It will do this replacement when - * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. - * If tlb_flushall_shift is -1, means the replacement will be disabled. - */ -s8 __read_mostly tlb_flushall_shift = -1; - void cpu_detect_tlb(struct cpuinfo_x86 *c) { if (this_cpu->c_detect_tlb) this_cpu->c_detect_tlb(c); printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" - "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" - "tlb_flushall_shift: %d\n", + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], - tlb_lld_1g[ENTRIES], tlb_flushall_shift); + tlb_lld_1g[ENTRIES]); } void detect_ht(struct cpuinfo_x86 *c) diff -puN arch/x86/kernel/cpu/intel.c~x8x-mm-rip-out-complicated-tlb-flushing arch/x86/kernel/cpu/intel.c --- a/arch/x86/kernel/cpu/intel.c~x8x-mm-rip-out-complicated-tlb-flushing 2014-04-21 11:10:34.818836088 -0700 +++ b/arch/x86/kernel/cpu/intel.c 2014-04-21 11:10:34.825836403 -0700 @@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsig } } -static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) -{ - switch ((c->x86 << 8) + c->x86_model) { - case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 0x61d: /* six-core 45 nm xeon "Dunnington" */ - tlb_flushall_shift = -1; - break; - case 0x63a: /* Ivybridge */ - tlb_flushall_shift = 2; - break; - case 0x61a: /* 45 nm nehalem, "Bloomfield" */ - case 0x61e: /* 45 nm nehalem, "Lynnfield" */ - case 0x625: /* 32 nm nehalem, "Clarkdale" */ - case 0x62c: /* 32 nm nehalem, "Gulftown" */ - case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ - case 0x62f: /* 32 nm Xeon E7 */ - case 0x62a: /* SandyBridge */ - case 0x62d: /* SandyBridge, "Romely-EP" */ - default: - tlb_flushall_shift = 6; - } -} - static void intel_detect_tlb(struct cpuinfo_x86 *c) { int i, j, n; @@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpui for (j = 1 ; j < 16 ; j++) intel_tlb_lookup(desc[j]); } - intel_tlb_flushall_shift_set(c); } static const struct cpu_dev intel_cpu_dev = { diff -puN arch/x86/mm/tlb.c~x8x-mm-rip-out-complicated-tlb-flushing arch/x86/mm/tlb.c --- a/arch/x86/mm/tlb.c~x8x-mm-rip-out-complicated-tlb-flushing 2014-04-21 11:10:34.820836178 -0700 +++ b/arch/x86/mm/tlb.c 2014-04-21 11:10:34.826836449 -0700 @@ -158,13 +158,22 @@ void flush_tlb_current_task(void) preempt_enable(); } +/* + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 cycles, so this caps the maximum overhead + * at _about_ 3,000 cycles. + */ +/* in units of pages */ +unsigned long tlb_single_page_flush_ceiling = 1; + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { int need_flush_others_all = 1; unsigned long addr; - unsigned act_entries, tlb_entries = 0; - unsigned long nr_base_pages; preempt_disable(); if (current->active_mm != mm) @@ -175,25 +184,12 @@ void flush_tlb_mm_range(struct mm_struct goto out; } - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 - || vmflag & VM_HUGETLB) { + if (end == TLB_FLUSH_ALL || vmflag & VM_HUGETLB) { local_flush_tlb(); goto out; } - /* In modern CPU, last level tlb used for both data/ins */ - if (vmflag & VM_EXEC) - tlb_entries = tlb_lli_4k[ENTRIES]; - else - tlb_entries = tlb_lld_4k[ENTRIES]; - - /* Assume all of TLB entries was occupied by this task */ - act_entries = tlb_entries >> tlb_flushall_shift; - act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; - nr_base_pages = (end - start) >> PAGE_SHIFT; - - /* tlb_flushall_shift is on balance point, details in commit log */ - if (nr_base_pages > act_entries) { + if ((end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { @@ -259,68 +255,15 @@ static void do_kernel_range_flush(void * void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - unsigned act_entries; - struct flush_tlb_info info; - - /* In modern CPU, last level tlb used for both data/ins */ - act_entries = tlb_lld_4k[ENTRIES]; /* Balance as user space task's flush, a bit conservative */ - if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || - (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) - + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { on_each_cpu(do_flush_tlb_all, NULL, 1); - else { + } else { + struct flush_tlb_info info; info.flush_start = start; info.flush_end = end; on_each_cpu(do_kernel_range_flush, &info, 1); } } - -#ifdef CONFIG_DEBUG_TLBFLUSH -static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - char buf[32]; - unsigned int len; - - len = sprintf(buf, "%hd\n", tlb_flushall_shift); - return simple_read_from_buffer(user_buf, count, ppos, buf, len); -} - -static ssize_t tlbflush_write_file(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - ssize_t len; - s8 shift; - - len = min(count, sizeof(buf) - 1); - if (copy_from_user(buf, user_buf, len)) - return -EFAULT; - - buf[len] = '\0'; - if (kstrtos8(buf, 0, &shift)) - return -EINVAL; - - if (shift < -1 || shift >= BITS_PER_LONG) - return -EINVAL; - - tlb_flushall_shift = shift; - return count; -} - -static const struct file_operations fops_tlbflush = { - .read = tlbflush_read_file, - .write = tlbflush_write_file, - .llseek = default_llseek, -}; - -static int __init create_tlb_flushall_shift(void) -{ - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, - arch_debugfs_dir, NULL, &fops_tlbflush); - return 0; -} -late_initcall(create_tlb_flushall_shift); -#endif _ -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>