On 06/14/2012 09:26 AM, Alex Shi wrote: > On 06/14/2012 09:10 AM, Alex Shi wrote: > >> On 06/13/2012 10:56 PM, Andi Kleen wrote: >> >>> On Tue, Jun 12, 2012 at 05:06:45PM +0800, Alex Shi wrote: >>>> This patch do flush_tlb_kernel_range by 'invlpg'. The performance pay >>>> and gain was analysed in my patch (x86/flush_tlb: try flush_tlb_single >>>> one by one in flush_tlb_range). Now we move this logical into kernel >>>> part. The pay is multiple 'invlpg' execution cost, that is same. but >>>> the gain(cost reducing of TLB entries refilling) is absolutely >>>> increased. >>> >>> The subtle point is whether INVLPG flushes global pages or not. >>> After some digging I found a sentence in the SDM that says it does. >>> So it may be safe. >> >> >> Many thanks for your time! >> >>> >>> What does it improve? >> >> I just wrote a rough kernel module that allocates some page arrays in the kernel and then maps them to a virtual address with 'vmap'. Then my macro benchmark injects an 'unmap_kernel_range' request through a sysfs interface while doing random memory accesses at user level during that time. On my NHM EP (2P * 4 cores * HT): without this patch, memory access with 4 threads costs ~12ns/time; with this patch, memory access with 4 threads costs ~9ns/time. As the number of threads increases the benefit becomes smaller, and it nearly disappears once the thread number reaches 256. But there is no regression. 
The rough user macro-benchmark and kernel module is here: --- kernel module-- #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/uaccess.h> #include <linux/sysfs.h> #include <linux/hrtimer.h> #include <linux/device.h> #include <linux/cpu.h> MODULE_LICENSE("Dual BSD/GPL"); /* * $cat Makefile * obj-m := modvmalloc.o * * compile command: * #cd linux; make /home/alexs/exec/modules/modvmalloc.ko */ #define NR_PAGES (4) #define NR_BLOCKS (1024) struct block { struct page ** page_array; void *vaddr; int page_count; }; struct block *block; static int blocks = NR_BLOCKS; module_param(blocks, uint, 0400); MODULE_PARM_DESC(blocks, "map unmap blocks number "); static struct page **relay_alloc_page_array(unsigned int nr_pages) { const size_t pa_size = NR_PAGES * sizeof(struct page *); if (pa_size > PAGE_SIZE) return vzalloc(pa_size); return kzalloc(pa_size, GFP_KERNEL); } static void relay_free_page_array(struct page **array) { if (is_vmalloc_addr(array)) vfree(array); else kfree(array); } static void vmap_unmap(void) { //purge_vmap_area_lazy(); //vm_unmap_aliases(); int i; for (i=0; i< blocks; i++) unmap_kernel_range((unsigned long)(block->vaddr), NR_PAGES*PAGE_SIZE); } // --------------- long vmap_num = 0; static ssize_t __vmap_num_store(const char *buf, size_t count, int smt) { long factor = 0; long i; unsigned long start, stop; if (sscanf(buf, "%ld", &factor) != 1) return -EINVAL; vmap_num = factor; start = ktime_to_ns(ktime_get()); vmap_unmap(); stop = ktime_to_ns(ktime_get()); i = blocks; printk(KERN_ERR "vunmap %ld times cost %ld ns/time\n", i, (stop - start)/i); return count; } static ssize_t vmap_num_show(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%ld\n", vmap_num); } static ssize_t vmap_num_store(struct device 
*dev, struct device_attribute *attr, const char *buf, size_t count) { return __vmap_num_store(buf, count, 0); } DEVICE_ATTR(vmap_num, 0644, vmap_num_show, vmap_num_store); int create_sysfs_vmap_num(struct device *dev) { return device_create_file(dev, &dev_attr_vmap_num); } static int mapunmap_init(void){ long i,j,k; create_sysfs_vmap_num(cpu_subsys.dev_root); block = kmalloc(sizeof(struct block)*blocks, GFP_KERNEL); for (k=0; k< blocks; k++) { block[k].page_count = 0; block[k].page_array = relay_alloc_page_array(NR_PAGES); if (!block[k].page_array) return -1; for (i = 0; i < NR_PAGES; i++) { block[k].page_array[i] = alloc_page(GFP_KERNEL); if (unlikely(!block[k].page_array[i])) { printk(KERN_ERR "\talloc page error \n"); goto depopulate; } } if (i!=NR_PAGES) goto depopulate; block[k].page_count = i; block[k].vaddr = vmap(block[k].page_array, NR_PAGES, VM_MAP, PAGE_KERNEL); if (!(block[k].vaddr)) { printk(KERN_ERR "\t\t vmap error !\n"); goto depopulate; } } printk(KERN_INFO "vmalloc module init OK \n"); return 0; depopulate: for (i=0; i< k; i++) if (block[i].page_count !=0) { for (j = 0; j < block[i].page_count; j++) __free_page((block[j].page_array[j])); relay_free_page_array(block[j].page_array); } printk(KERN_INFO "vmalloc module init fail\n"); return -1; } static void mapunmap_exit(void){ long i, j; printk(KERN_INFO "bye! this is test module\n"); device_remove_file(cpu_subsys.dev_root, &dev_attr_vmap_num); for (i=0; i< blocks; i++) if (block[i].page_count !=0) { for (j = 0; j < block[i].page_count; j++) __free_page((block[j].page_array[j])); relay_free_page_array(block[j].page_array); } } module_init(mapunmap_init); module_exit(mapunmap_exit); --- benchmark --- /* maccess.c This is a macrobenchmark for TLB flush range testing. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License. 
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. Copyright (C) Intel 2012 Coypright Alex Shi alex.shi@xxxxxxxxx gcc -o maccess maccess.c -lrt -lpthread -O2 #perf stat -e r881,r882,r884 -e r801,r802,r810,r820,r840,r880,r807 -e rc01 -e r4901,r4902,r4910,r4920,r4940,r4980 -e r5f01 -e rbd01,rdb20 -e r4f02 -e r8004,r8201,r8501,r8502,r8504,r8510,r8520,r8540,r8580 -e rae01,rc820,rc102,rc900 -e r8600 -e rcb10 ./maccess */ #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <sys/mman.h> #include <time.h> #include <sys/types.h> #include <pthread.h> #define FILE_SIZE (1024*1024*1024) #define PAGE_SIZE (4096) #define HPAGE_SIZE (4096*512) #ifndef MAP_HUGETLB #define MAP_HUGETLB 0x40000 #endif long getnsec(clockid_t clockid) { struct timespec ts; if (clock_gettime(clockid, &ts) == -1) perror("clock_gettime failed"); return (long) ts.tv_sec * 1000000000 + (long) ts.tv_nsec; } //data for threads struct data{ int pagenum; void *startaddr; int rw; int loop; }; volatile int * threadstart; //thread for memory accessing void *accessmm(void *data){ struct data *d = data; long *actimes; char x; int i, k; int randn[PAGE_SIZE]; for (i=0;i<PAGE_SIZE; i++) randn[i] = rand(); actimes = malloc(sizeof(long)); while (*threadstart == 0 ) usleep(1); if (d->rw == 0) for (*actimes=0; *threadstart == 1; (*actimes)++) for (k=0; k < d->pagenum; k++) x = *(volatile char *)(d->startaddr + randn[k]%FILE_SIZE); else for (*actimes=0; *threadstart == 1; (*actimes)++) for (k=0; k < d->pagenum; k++) *(char *)(d->startaddr + randn[k]%FILE_SIZE) = 1; return 
actimes; } int main(int argc, char *argv[]) { static char optstr[] = "p:w:ht:s:"; int s = 1; /* */ int p = 512; /* default accessed page number, after maccess */ int er = 0, rw = 0, h = 0, t = 2; /* d: debug; h: use huge page; t thread number */ int pagesize = PAGE_SIZE; /*default for regular page */ volatile char x; long protindex = 0; int i, j, k, c; void *m1, *startaddr; unsigned long *startaddr2[1024*512]; volatile void *tempaddr; clockid_t clockid = CLOCK_MONOTONIC; unsigned long start, stop, mptime, actime; int randn[PAGE_SIZE]; pthread_t pid[1024]; void * res; struct data data; char command[1024]; for (i=0;i<PAGE_SIZE; i++) randn[i] = rand(); while ((c = getopt(argc, argv, optstr)) != EOF) switch (c) { case 's': s = atoi(optarg); break; case 'p': p = atoi(optarg); break; case 'h': h = 1; break; case 'w': rw = atoi(optarg); break; case 't': t = atoi(optarg); break; case '?': er = 1; break; } if (er) { printf("usage: %s %s\n", argv[0], optstr); exit(1); } printf("pid is %d, thread number %d active %d seconds, access page num %d\n", getpid(), t, s, p); if (h == 0){ startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); pagesize = PAGE_SIZE; } else { startaddr = mmap(0, FILE_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB, -1, 0); pagesize = HPAGE_SIZE; } start = getnsec(clockid); //access whole memory, will generate many page faults for (tempaddr = startaddr; tempaddr < startaddr + FILE_SIZE; tempaddr += pagesize) memset((char *)tempaddr, 0, 1); stop = getnsec(clockid); threadstart = malloc(sizeof(int)); *threadstart = 0; data.pagenum = p; data.startaddr = startaddr; data.rw = rw; for (i=0; i< t; i++) if(pthread_create(&pid[i], NULL, accessmm, &data)) perror("pthread create"); //wait for randn[] filling. sleep(1); mptime = actime = 0; sprintf(command, "sudo sh -c 'echo %d > /sys/devices/system/cpu/vmap_num'", s); printf("%s\n", command); start = getnsec(clockid); //kick threads, let them running. 
*threadstart = 1; system(command); *threadstart = 0; stop = getnsec(clockid); mptime += stop - start; //get threads' result. for (i=0; i< t; i++) { if (pthread_join(pid[i], &res)) perror("pthread_join"); actime += *(long*)res; } end: printf("maccess %ld ms, memory access %ld times/thread/ms, cost %ldns/time\n", mptime/1000000, actime*p*1000000/t/mptime, mptime*t/(actime*p)); exit(0); } > >> >>> -Andi >> >> > > -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html