Hello, David. On 02/23/2017 07:09 PM, David Miller wrote: > From: Bob Picco <bob.picco@xxxxxxxxxx> > Date: Wed, 1 Feb 2017 07:38:20 -0500 > >> The program was built as: >> >> gcc -Wall -m64 -o test_with_mmap test_with_mmap.c -lrt -lm > > Anything measuring performance should be built with optimizations > enabled, at least -O2. The test program was used mainly to show that the patches actually increase the TSB size. We were not interested in the absolute values reported by the program; our interest was in the relative growth of the numbers. Therefore, we did not use optimization options. For example, if we look at Bob's cover letter at scenario "1. T7-2 LDOM. 4 vCPU, 32GB RAM", we will see these numbers: +-----------+--------+--------+--------+ |region_size|no patch| patch | S11.3 | +-----------+--------+--------+--------+ ... +-----------+--------+--------+--------+ |512m | 1741.04| 1736.21| 1840.40| +-----------+--------+--------+--------+ |576m |10885.34| 1958.27| 2068.41| +-----------+--------+--------+--------+ |640m |20029.18| 2185.42| 2321.79| +-----------+--------+--------+--------+ ... In theory, the potential TSB size to effectively hold a region > 512m should be > 1m. So for the unpatched kernel we should expect a relative performance drop when working (page touching) with areas > 512m. The above numbers illustrate it, i.e. the numbers grow linearly up to 512m, but once we step over 512m we observe a very significant (exponential) increase in the numbers. As for the patched kernel and S11.3, their TSBs are larger, so their numbers increase almost linearly. > > Also, this test program, if you're giving so much detailed information > on how to use it and run it and what its results mean, absolutely must > be included in this series somehow. > > We have a testing subdirectory, place it there and add it to the test > build Makefile rules. tools/testing/selftests/ You can create a > sparc subdirectory there. > test_with_mmap.c is not a self-contained test. 
It requires known machine conditions and significant effort on the tester's part before drawing a conclusion. It's just a tool we used for our experiment, and it's not like the other kernel tests in tools/testing/selftests. I don't think that anyone would benefit if we put it there. In an attempt to support this position and share the code, I'm attaching the *.c file and the README file to this message. Could you please have a look at them? And having said the above, will it work if we leave the test program's source on the mailing list? Thank you.
/*
 * test_with_mmap.c
 *
 * Times mmap(), NUM_RW_ITERS rounds of strided write + read-back over the
 * mapped region, and munmap().  The whole sequence is repeated the requested
 * number of times, after which the mean and the coefficient of variation of
 * every step are printed.  On non-Linux builds the page size of the region is
 * additionally requested through memcntl(MC_HAT_ADVISE) and that call is
 * timed as well.
 */

/*
 * Under a strict ISO dialect (-std=c11) glibc hides MAP_ANONYMOUS,
 * MAP_HUGETLB, getopt() and clock_gettime() unless a feature test macro is
 * defined before the first system header is included.
 */
#if defined(__linux__) && !defined(_GNU_SOURCE)
#define _GNU_SOURCE
#endif

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
#include <math.h>

/*
 * "linux" is predefined only by the GNU dialects; strict ISO modes provide
 * just "__linux__".  Test both so the Linux code path is selected either way.
 */
#if defined(__linux__) || defined(linux)
#define ON_LINUX 1
#endif

#define VERSION_NUM	8
#define NUM_RW_ITERS	5
/* mmap + memcntl + (write, read) * NUM_RW_ITERS + munmap */
#define NUM_TESTS	(3 + 2*NUM_RW_ITERS)

static long get_time_nsec(void);
static int blk_size_is_valid(size_t blk_size);
static int memory_fill(char *addr, size_t size, size_t blk_size, long pattern);
static int verify_memory_data(char *addr, size_t size, size_t blk_size, long pattern);
static int parse_optarg(char *optarg, unsigned long *value);
static void summarize(const double *x, unsigned long n, double *mean, double *cv);
static void help(char *options);

int main(int argc, char *argv[])
{
	unsigned long region_size = 0;
	unsigned long blk_size = 0;
	unsigned long num_iters = 0;
	unsigned long iter;
	int opt;
	double **samples;	/* samples[test][iteration], in microseconds */
	double *mean, *cv;
	char *addr;
	long s, e;
	int i, rc;
	long val;
	int mmap_flags;
#ifdef ON_LINUX
	char valid_options[] = "i:b:r:h";
	int use_huge_pages = 0;
#else
	char valid_options[] = "i:b:r:p:";
	struct memcntl_mha mcmd;
	unsigned long page_size = 0;
#endif

	while ((opt = getopt(argc, argv, valid_options)) != -1) {
		switch (opt) {
		case 'i':
			if (parse_optarg(optarg, &num_iters)) {
				printf("-i: invalid format\n");
				return 1;
			}
			break;
		case 'b':
			if (parse_optarg(optarg, &blk_size)) {
				printf("-b: invalid format\n");
				return 1;
			}
			break;
		case 'r':
			if (parse_optarg(optarg, &region_size)) {
				printf("-r: invalid format\n");
				return 1;
			}
			break;
#ifdef ON_LINUX
		case 'h':
			use_huge_pages = 1;
			break;
#else
		case 'p':
			if (parse_optarg(optarg, &page_size)) {
				printf("-p: invalid format\n");
				return 1;
			}
			break;
#endif
		default:
			/* unknown option or missing argument: do not run with a half-parsed command line */
			help(valid_options);
			return 1;
		}
	}

	if (!num_iters || !blk_size || !region_size) {
		printf("Please, specify the number of iterations, region size, block size\n");
		help(valid_options);
		return 1;
	}

	/*
	 * Every block is touched through a "long" access at its first byte, so
	 * a smaller or non-multiple block size would run past the mapping or
	 * produce misaligned accesses.
	 */
	if (!blk_size_is_valid(blk_size)) {
		printf("The block size must be a multiple of %lu bytes\n",
		       (unsigned long)sizeof(long));
		return 1;
	}

#ifndef ON_LINUX
	if (!page_size) {
		printf("Please, specify the page size\n");
		help(valid_options);
		return 1;
	}
#endif

	printf("region - %0.1f(GB), block size - %lu bytes, number of iterations - %lu\n",
	       (double)(region_size)/(1024 * 1024 * 1024), blk_size, num_iters);

	samples = malloc(sizeof(*samples) * NUM_TESTS);
	if (samples == NULL) {
		perror("malloc");
		return 1;
	}

	for (i = 0; i < NUM_TESTS; i++) {
		/* calloc() rejects a num_iters that would overflow the byte count */
		samples[i] = calloc(num_iters, sizeof(**samples));
		if (samples[i] == NULL) {
			perror("calloc(time)");
			return 1;
		}
	}

	mean = calloc(NUM_TESTS, sizeof(*mean));
	if (mean == NULL) {
		perror("calloc(mean)");
		return 1;
	}

	cv = calloc(NUM_TESTS, sizeof(*cv));
	if (cv == NULL) {
		perror("calloc(cv)");
		return 1;
	}

#ifdef ON_LINUX
	mmap_flags = MAP_ANONYMOUS | MAP_SHARED;
	if (use_huge_pages) {
		printf("The region will be allocated using Huge Pages\n");
		mmap_flags |= MAP_HUGETLB;
	}
#else
	mmap_flags = MAP_ANON | MAP_SHARED;
	printf("The region will be allocated using %lu-byte pages\n", page_size);
#endif

	for (iter = 0; iter < num_iters; iter++) {
		s = get_time_nsec();
		addr = mmap(NULL, region_size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
		e = get_time_nsec();
		if (addr == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		samples[0][iter] = (e - s) / 1000.0;

#ifdef ON_LINUX
		/* there is no memcntl() step on Linux; keep the slot so the table layout is the same */
		samples[1][iter] = 0;
#else
		mcmd.mha_cmd = MHA_MAPSIZE_VA;
		mcmd.mha_flags = 0;
		mcmd.mha_pagesize = page_size;

		s = get_time_nsec();
		rc = memcntl(addr, region_size, MC_HAT_ADVISE, (caddr_t)&mcmd, 0, 0);
		e = get_time_nsec();
		if (rc) {
			perror("memcntl");
			return 1;
		}
		samples[1][iter] = (e - s) / 1000.0;
#endif

		for (i = 0; i < NUM_RW_ITERS; i++) {
			/* a different pattern per round, so stale data from the previous round fails verification */
			val = 0x123456789abcdef0 + i;

			s = get_time_nsec();
			rc = memory_fill(addr, region_size, blk_size, val);
			e = get_time_nsec();
			if (rc)
				return 1;
			samples[2*i + 2][iter] = (e - s) / 1000.0;

			s = get_time_nsec();
			rc = verify_memory_data(addr, region_size, blk_size, val);
			e = get_time_nsec();
			if (rc)
				return 1;
			samples[2*i + 3][iter] = (e - s) / 1000.0;
		}

		s = get_time_nsec();
		rc = munmap(addr, region_size);
		e = get_time_nsec();
		if (rc) {
			perror("munmap");
			return 1;
		}
		samples[NUM_TESTS - 1][iter] = (e - s) / 1000.0;
	}

	for (i = 0; i < NUM_TESTS; i++)
		summarize(samples[i], num_iters, &mean[i], &cv[i]);

	printf("%8s%20s%20s\n", "test", "mean (us)", "cv (%)");
	printf("mmap %20.2f%20.2f\n", mean[0], cv[0]);
	printf("memcntl %20.2f%20.2f\n", mean[1], cv[1]);
	for (i = 0; i < NUM_RW_ITERS; i++) {
		printf("write_%d %20.2f%20.2f\n", i, mean[2*i + 2], cv[2*i + 2]);
		printf("read_%d %20.2f%20.2f\n", i, mean[2*i + 3], cv[2*i + 3]);
	}
	printf("munmap %20.2f%20.2f\n", mean[NUM_TESTS - 1], cv[NUM_TESTS - 1]);

	for (i = 0; i < NUM_TESTS; i++)
		free(samples[i]);
	free(samples);
	free(mean);
	free(cv);

	return 0;
}

/*
 * Computes the mean and the coefficient of variation (in percent) of the
 * n >= 1 values in x, using the recurrence formulas:
 *   M_k = M_k-1 + (x_k - M_k-1) / k
 *   V_k = V_k-1 + (x_k - M_k-1)*(x_k - M_k)
 *   sigma_k^2 = V_k/(k - 1)   for k > 1
 *   CV = sigma / mean
 *
 * *cv is reported as 0 when fewer than two samples exist or when the mean is
 * zero (the ratio is undefined there and would otherwise print as NaN).
 */
static void summarize(const double *x, unsigned long n, double *mean, double *cv)
{
	double m = x[0];
	double v = 0.0;
	double prev;
	unsigned long k;

	for (k = 1; k < n; k++) {
		prev = m;
		m += (x[k] - m) / (k + 1);
		v += (x[k] - prev) * (x[k] - m);
	}

	*mean = m;
	*cv = 0.0;
	if (n >= 2 && m != 0.0)
		*cv = sqrt(v / (n - 1)) / (m / 100.0);
}

/*
 * Prints one description line for every option letter present in the
 * getopt() option string "options"; ':' markers are skipped.
 */
static void help(char *options)
{
	while (*options) {
		switch (*options) {
		case 'i':
			printf("-i Number of iterations\n");
			break;
		case 'b':
			printf("-b <block size>[kmg]\n");
			break;
		case 'r':
			printf("-r <region size>[kmg]\n");
			break;
		case 'h':
			printf("-h Allocate the region using Huge Pages\n");
			break;
		case 'p':
			printf("-p <page size>[kmg] Page size used for allocating the region\n");
			break;
		default:
			break;
		}
		options++;
	}
}

/*
 * Parses an option argument of the form <number>[k|m|g], where <number> is
 * decimal or, when prefixed with "0x"/"0X", hexadecimal.  The optional
 * one-letter suffix (either case) multiplies by 2^10, 2^20 or 2^30.
 *
 * Returns 0 and stores the result in *value on success.  Returns -1 and
 * leaves *value untouched on malformed input, a negative number, or a result
 * that does not fit in an unsigned long.
 */
static int parse_optarg(char *optarg, unsigned long *value)
{
	unsigned long num;
	unsigned long mult = 1;
	char *s = optarg;
	char *e;
	int base = 10;

	/* the hex prefix is only meaningful at the very start of the argument */
	if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X')) {
		base = 16;
		s += 2;
	}

	/* strtoul() accepts "-N" and returns the negated value; never what the user meant */
	if (strchr(s, '-') != NULL)
		return -1;

	errno = 0;
	num = strtoul(s, &e, base);

	/* conversion error */
	if (errno)
		return -1;

	/* no conversion at all */
	if (s == e)
		return -1;

	if (*e != '\0') {
		/*
		 * we allow only one character at the end,
		 * which is expected to be a multiplier
		 */
		if (e[1] != '\0')
			return -1;

		switch (*e) {
		case 'g':
		case 'G':
			mult = 1024 * 1024 * 1024UL;
			break;
		case 'm':
		case 'M':
			mult = 1024 * 1024UL;
			break;
		case 'k':
		case 'K':
			mult = 1024UL;
			break;
		default:
			/* invalid modifier */
			return -1;
		}
	}

	/* the scaled value would wrap around */
	if (num > ULONG_MAX / mult)
		return -1;

	*value = num * mult;
	return 0;
}

/*
 * Returns the CLOCK_MONOTONIC time in nanoseconds; errno is preserved across
 * a successful call.  The arithmetic is done in integers so that nanosecond
 * resolution is kept regardless of the magnitude of tv_sec (this assumes a
 * 64-bit long, i.e. the -m64 build the program is meant for).
 * Terminates the program if the clock cannot be read, since every
 * measurement would be meaningless.
 */
static long get_time_nsec(void)
{
	struct timespec ts;
	int errsv = errno;

	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
		perror("clock_gettime");
		exit(1);
	}
	errno = errsv;

	return (long)ts.tv_sec * 1000000000L + ts.tv_nsec;
}

/*
 * Tells whether blk_size can be used as the stride of memory_fill() and
 * verify_memory_data(): each block must be able to hold one long, and the
 * stride must keep successive accesses a multiple of sizeof(long) apart.
 */
static int blk_size_is_valid(size_t blk_size)
{
	return blk_size >= sizeof(long) && blk_size % sizeof(long) == 0;
}

/*
 * Stores "pattern" into the first sizeof(long) bytes of each of the
 * size / blk_size blocks starting at addr.  addr must be suitably aligned
 * for a long (main() passes the address returned by mmap()).
 *
 * Returns 0 on success, -1 if blk_size is not a valid stride.
 */
static int memory_fill(char *addr, size_t size, size_t blk_size, long pattern)
{
	size_t nblocks;
	size_t i;

	if (!blk_size_is_valid(blk_size))
		return -1;

	nblocks = size / blk_size;
	for (i = 0; i < nblocks; i++) {
		*((long *)addr) = pattern;
		addr += blk_size;
	}

	return 0;
}

/*
 * Checks that the first long of each of the size / blk_size blocks starting
 * at addr equals "pattern".
 *
 * Returns 0 when all blocks match.  Returns -1 after printing the address and
 * contents of the first mismatching block, or silently if blk_size is not a
 * valid stride.
 */
static int verify_memory_data(char *addr, size_t size, size_t blk_size, long pattern)
{
	size_t nblocks;
	size_t i;

	if (!blk_size_is_valid(blk_size))
		return -1;

	nblocks = size / blk_size;
	for (i = 0; i < nblocks; i++) {
		if ((*(long *)addr) != pattern) {
			printf("verify_memory_data: DATA ERROR at addr = %p data = %lx, "
			       "expected data = %lx\n",
			       (void *)addr,
			       (unsigned long)*((long *)addr),
			       (unsigned long)pattern);
			return -1;
		}
		addr += blk_size;
	}

	return 0;
}
This is a test case for bug: BUG 20510832 - TEST_WITH_MMAP: LOW READ/WRITE PERFORMANCE IF COMPARE TO SOLARIS It works this way: 1) Allocates a memory region using mmap(MAP_ANONYMOUS) 2) Tries to write/read to this region using a specified block size 3) Deallocates this region using munmap() 4) Measures the time required for each of the above steps The initial idea is to use this test case to verify whether the TSB size on Linux is less than on Solaris. To check that you need to run: on Linux: ./test_with_mmap -i 10 -r 16g -b 8k on Solaris: ./test_with_mmap -i 10 -r 16g -b 8k -p 8k and compare the results. They should be more-or-less the same. We may also use this test case to track regressions between kernel versions. On Linux, by default, the default page size is used for allocating the region. However, you may allocate it with Huge Pages (-h). On Solaris the page size for the region is selected by (-p).